def most_informative_features(self, n=100):
    """
    Return a list of the 'most informative' features used by this
    classifier.  For the purpose of this function, the
    informativeness of a feature C{(fname,fval)} is equal to the
    highest value of P(fname=fval|label), for any label, divided by
    the lowest value of P(fname=fval|label), for any label::

      max[ P(fname=fval|label1) / P(fname=fval|label2) ]
    """
    # The set of (fname, fval) pairs used by this classifier.
    features = set()
    # The max & min probability associated w/ each (fname, fval)
    # pair.  Maps (fname,fval) -> float.
    maxprob = defaultdict(lambda: 0.0)
    minprob = defaultdict(lambda: 1.0)

    for (label, fname), probdist in self._feature_probdist.items():
        for fval in probdist.samples():
            feature = (fname, fval)
            features.add(feature)
            p = probdist.prob(fval)
            maxprob[feature] = max(p, maxprob[feature])
            minprob[feature] = min(p, minprob[feature])
            if minprob[feature] == 0:
                features.discard(feature)

    # Convert features to a list, & sort it by how informative
    # features are.
    features = sorted(features,
                      key=lambda feature: minprob[feature] / maxprob[feature])
    return features[:n]
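# A minimal usage sketch (not part of the original module): it assumes a
# NaiveBayesClassifier instance has already been trained elsewhere.  The
# helper name below is hypothetical and only illustrates how the ranked
# (fname, fval) pairs returned above might be inspected.
def _show_informative_features(classifier, n=10):
    for fname, fval in classifier.most_informative_features(n):
        print('%s = %r' % (fname, fval))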
def train(labeled_featuresets, estimator=ELEProbDist):
    """
    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    """
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count up how many times each feature value occurred, given
    # the label and featurename.
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return NaiveBayesClassifier(label_probdist, feature_probdist)
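# A minimal usage sketch, assuming the train() above is exposed as
# NaiveBayesClassifier.train (as in NLTK) and that featuresets are plain
# dicts mapping feature names to values.  The toy data and the helper name
# are illustrative only.
def _demo_naive_bayes_train():
    toy_data = [
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'k'}, 'male'),
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'o'}, 'male'),
    ]
    classifier = NaiveBayesClassifier.train(toy_data)
    return classifier.classify({'last_letter': 'a'})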
def invert_dict(d):
    """
    Return a dictionary mapping each value in C{d} back to the key(s)
    under which it appears.  Iterable values are fanned out so that each
    member maps to a list of keys; non-iterable values map directly to
    their key.
    """
    from nltk.compat import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        if hasattr(d[key], '__iter__'):
            for term in d[key]:
                inverted_dict[term].append(key)
        else:
            inverted_dict[d[key]] = key
    return inverted_dict
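# A short usage sketch for invert_dict (illustrative values and helper name
# only): list values are fanned out to their keys, while a scalar value maps
# straight back to its key.
def _demo_invert_dict():
    tags = {'NN': ['dog', 'cat'], 'VB': ['run'], 'size': 3}
    return invert_dict(tags)
    # -> {'dog': ['NN'], 'cat': ['NN'], 'run': ['VB'], 3: 'size'}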
def stump(feature_name, labeled_featuresets):
    # The most frequent label overall is used as this stump's default.
    label = FreqDist([label for (featureset, label)
                      in labeled_featuresets]).max()

    # Find the best label for each value.
    freqs = defaultdict(FreqDist)  # freq(label|value)
    for featureset, label in labeled_featuresets:
        feature_value = featureset[feature_name]
        freqs[feature_value].inc(label)

    decisions = dict([(val, DecisionTreeClassifier(freqs[val].max()))
                      for val in freqs])
    return DecisionTreeClassifier(label, feature_name, decisions)
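# A minimal usage sketch, assuming stump() above is exposed as
# DecisionTreeClassifier.stump (as in NLTK).  The toy featuresets and the
# helper name are illustrative only.
def _demo_stump():
    toy_data = [
        ({'outlook': 'sunny'}, 'no'),
        ({'outlook': 'rainy'}, 'no'),
        ({'outlook': 'overcast'}, 'yes'),
    ]
    tree = DecisionTreeClassifier.stump('outlook', toy_data)
    return tree.classify({'outlook': 'overcast'})   # expected: 'yes'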
def train(
    labeled_featuresets,
    entropy_cutoff=0.05,
    depth_cutoff=100,
    support_cutoff=10,
    binary=False,
    feature_values=None,
    verbose=False,
):
    """
    @param binary: If true, then treat all feature/value pairs as
        individual binary features, rather than using a single n-way
        branch for each feature.
    """
    # Collect a list of all feature names.
    feature_names = set()
    for featureset, label in labeled_featuresets:
        for fname in featureset:
            feature_names.add(fname)

    # Collect a list of the values each feature can take.
    if feature_values is None and binary:
        feature_values = defaultdict(set)
        for featureset, label in labeled_featuresets:
            for fname, fval in featureset.items():
                feature_values[fname].add(fval)

    # Start with a stump.
    if not binary:
        tree = DecisionTreeClassifier.best_stump(
            feature_names, labeled_featuresets, verbose)
    else:
        tree = DecisionTreeClassifier.best_binary_stump(
            feature_names, labeled_featuresets, feature_values, verbose)

    # Refine the stump.
    tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff - 1,
                support_cutoff, binary, feature_values, verbose)

    # Return it
    return tree
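# A minimal usage sketch, assuming the train() above is exposed as
# DecisionTreeClassifier.train (as in NLTK); the toy data, cutoff values, and
# helper name are illustrative only.
def _demo_decision_tree_train():
    toy_data = [
        ({'size': 'big', 'color': 'red'}, 'apple'),
        ({'size': 'small', 'color': 'red'}, 'cherry'),
        ({'size': 'big', 'color': 'green'}, 'apple'),
        ({'size': 'small', 'color': 'green'}, 'grape'),
    ]
    tree = DecisionTreeClassifier.train(toy_data, entropy_cutoff=0.1,
                                        depth_cutoff=5, support_cutoff=1)
    return tree.classify({'size': 'small', 'color': 'red'})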