Example #1
    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature C{(fname,fval)} is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label::

          max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # The set of (fname, fval) pairs used by this classifier.
        features = set()
        # The max & min probability associated w/ each (fname, fval)
        # pair.  Maps (fname,fval) -> float.
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)

        for (label, fname), probdist in self._feature_probdist.items():
            for fval in probdist.samples():
                feature = (fname, fval)
                features.add(feature)
                p = probdist.prob(fval)
                maxprob[feature] = max(p, maxprob[feature])
                minprob[feature] = min(p, minprob[feature])
                if minprob[feature] == 0:
                    features.discard(feature)

        # Convert features to a list, & sort it by how informative
        # features are.
        features = sorted(features,
                          key=lambda feature: minprob[feature] / maxprob[feature])
        return features[:n]
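
A quick usage sketch for this method, assuming the standard NLTK NaiveBayesClassifier.train entry point; the toy "last letter" feature sets are purely illustrative:

    from nltk.classify import NaiveBayesClassifier

    train_set = [
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'k'}, 'male'),
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'o'}, 'male'),
    ]
    classifier = NaiveBayesClassifier.train(train_set)
    # Print the five most informative (feature name, feature value) pairs,
    # most informative first.
    for fname, fval in classifier.most_informative_features(n=5):
        print(fname, fval)
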
Example #2
    def train(labeled_featuresets, estimator=ELEProbDist):
        """
        @param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples C{(featureset, label)}.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and feature name.
        for featureset, label in labeled_featuresets:
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname].inc(fval)
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None'.  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                feature_freqdist[label, fname].inc(None, num_samples-count)
                feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label,fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)
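
A minimal sketch of calling this training method through the usual import path; the featuresets are toy data, and any nltk.probability estimator taking (freqdist, bins) could be substituted for the default ELEProbDist:

    from nltk.classify import NaiveBayesClassifier

    train_set = [
        ({'outlook': 'sunny', 'windy': False}, 'no'),
        ({'outlook': 'rainy', 'windy': True}, 'no'),
        ({'outlook': 'overcast', 'windy': False}, 'yes'),
    ]
    classifier = NaiveBayesClassifier.train(train_set)
    print(classifier.labels())
    print(classifier.classify({'outlook': 'overcast', 'windy': True}))
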
Example #3
def invert_dict(d):
    """
    Invert a dictionary: each member of an iterable value is mapped back to
    a list of the keys it appeared under; an atomic value is mapped straight
    back to its key.
    """
    from collections import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        if hasattr(d[key], '__iter__'):
            # Iterable value: record the key under each of its members.
            for term in d[key]:
                inverted_dict[term].append(key)
        else:
            # Atomic value: map it directly back to its key.
            inverted_dict[d[key]] = key
    return inverted_dict
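
A short illustration with a made-up postings dictionary: list values are expanded member by member, while the atomic value maps straight back to its key.

    postings = {'doc1': ['cat', 'dog'], 'doc2': ['dog'], 'doc3': 42}
    index = invert_dict(postings)
    print(index['dog'])   # ['doc1', 'doc2']
    print(index['cat'])   # ['doc1']
    print(index[42])      # 'doc3'
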
Example #4
    def stump(feature_name, labeled_featuresets):
        # The most frequent label overall becomes this stump's default decision.
        label = FreqDist([label for (featureset, label) in labeled_featuresets]).max()

        # Find the best label for each value.
        freqs = defaultdict(FreqDist)  # freq(label|value)
        for featureset, label in labeled_featuresets:
            feature_value = featureset[feature_name]
            freqs[feature_value].inc(label)

        # For each observed value, add a leaf that predicts the most frequent
        # label seen with that value.
        decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
        return DecisionTreeClassifier(label, feature_name, decisions)
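
A sketch of building a one-level tree directly, assuming stump() is reachable as a static method on DecisionTreeClassifier as in NLTK; the data is invented:

    from nltk.classify import DecisionTreeClassifier

    train_set = [
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'k'}, 'male'),
        ({'last_letter': 'a'}, 'female'),
    ]
    tree = DecisionTreeClassifier.stump('last_letter', train_set)
    print(tree.classify({'last_letter': 'k'}))   # 'male' for this toy data
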
Example #5
    def train(
        labeled_featuresets,
        entropy_cutoff=0.05,
        depth_cutoff=100,
        support_cutoff=10,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        @param binary: If true, then treat all feature/value pairs as
            individual binary features, rather than using a single n-way
            branch for each feature.
        """
        # Collect a list of all feature names.
        feature_names = set()
        for featureset, label in labeled_featuresets:
            for fname in featureset:
                feature_names.add(fname)

        # Collect a list of the values each feature can take.
        if feature_values is None and binary:
            feature_values = defaultdict(set)
            for featureset, label in labeled_featuresets:
                for fname, fval in featureset.items():
                    feature_values[fname].add(fval)

        # Start with a stump.
        if not binary:
            tree = DecisionTreeClassifier.best_stump(feature_names, labeled_featuresets, verbose)
        else:
            tree = DecisionTreeClassifier.best_binary_stump(feature_names, labeled_featuresets, feature_values, verbose)

        # Refine the stump.
        tree.refine(
            labeled_featuresets, entropy_cutoff, depth_cutoff - 1, support_cutoff, binary, feature_values, verbose
        )

        # Return it
        return tree
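
A usage sketch under the same assumptions, showing how the cutoff parameters are passed; the data and cutoff values are arbitrary:

    from nltk.classify import DecisionTreeClassifier

    train_set = [
        ({'outlook': 'sunny', 'windy': False}, 'no'),
        ({'outlook': 'rainy', 'windy': True}, 'no'),
        ({'outlook': 'overcast', 'windy': False}, 'yes'),
        ({'outlook': 'overcast', 'windy': True}, 'yes'),
    ]
    tree = DecisionTreeClassifier.train(
        train_set, entropy_cutoff=0.01, depth_cutoff=3, support_cutoff=1
    )
    print(tree.classify({'outlook': 'overcast', 'windy': False}))   # 'yes' here
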