Example #1

# Note: ProbDist, FrequencyMap and FeatureMap are helper classes assumed to be
# defined elsewhere in the project; only defaultdict comes from the stdlib.
from collections import defaultdict


class NaiveBayesClassifier(object):

    def __init__(self):
        self.label_prob = ProbDist()
        self.label_feat_prob = defaultdict(ProbDist)
        self._fmap = FeatureMap()

    def print_features(self, label):
        for fid in self._fmap.keys():
            if (label, fid) in self.label_feat_prob:
                pdist = self.label_feat_prob[label, fid]
                for val in pdist.keys():
                    print "{0} {1} {2} {3}".format(label, fid, val,
                                                   pdist.prob(val))

    def train(self, label_features_set):
        label_freq_map = FrequencyMap()
        label_fid_freq_map = defaultdict(FrequencyMap)
        for label, features in label_features_set:
            # Count how often the label occurs.
            label_freq_map.inc(label)
            # Count how often a value for a given feature and label occurs.
            for f, val in features.items():
                label_fid_freq_map[label, f].inc(val)
                # Record that we have seen that feature (for some label).
                self._fmap.get(f)

        # For every (label, feature) pair, bin the instances that lack the
        # feature under the value None, so the counts sum to the label total.
        for label in label_freq_map.keys():
            num_count = label_freq_map.freq(label)
            for f in self._fmap.keys():
                count = label_fid_freq_map[label, f].total()
                label_fid_freq_map[label, f].inc(None, num_count - count)

        # Compute the probabilities.
        for label in label_freq_map.keys():
            # P(label)
            probability = float(label_freq_map.freq(label))/label_freq_map.total()
            self.label_prob.set(label, probability)

        # P(value | label, feature)
        for (label, f), freqmap in label_fid_freq_map.items():
            for val in freqmap.keys():
                p = float(freqmap.freq(val)) / float(freqmap.total())
                self.label_feat_prob[label, f].set(val, p)

    def classify(self, features):
        features_copy = features.copy()
        # Drop any feature that was never seen during training.
        for f in list(features_copy.keys()):
            if f not in self._fmap.keys():
                del features_copy[f]
        prob_dist = ProbDist()

        for label in self.label_prob.keys():
            # Start from log P(label), then add log P(value | feature, label)
            # for each observed feature value.
            prob_dist.set(label, self.label_prob.logprob(label))
            for feat, val in features_copy.items():
                if (label, feat) not in self.label_feat_prob:
                    print("ERROR")
                p_dist = self.label_feat_prob[label, feat]
                prob_dist.inc(label, p_dist.logprob(val))
        return prob_dist
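
ProbDist, FrequencyMap and FeatureMap are not shown on this page. The sketch below fills them in with minimal, hypothetical stand-ins (their interfaces are only inferred from the calls in the example above, so this is an assumption, not the project's real implementation) and then drives the classifier on a tiny made-up data set.

import math
from collections import defaultdict


class ProbDist(object):
    """Hypothetical stand-in: maps keys to numbers (probabilities after
    train(), accumulated log-scores inside classify())."""

    def __init__(self):
        self._values = {}

    def set(self, key, value):
        self._values[key] = value

    def inc(self, key, amount):
        self._values[key] = self._values.get(key, 0.0) + amount

    def keys(self):
        return self._values.keys()

    def items(self):
        return self._values.items()

    def prob(self, key):
        return self._values.get(key, 0.0)

    def logprob(self, key):
        # Floor unseen/zero probabilities so the log never blows up.
        return math.log(max(self._values.get(key, 0.0), 1e-12))


class FrequencyMap(object):
    """Hypothetical stand-in: a plain frequency counter."""

    def __init__(self):
        self._counts = defaultdict(int)

    def inc(self, key, count=1):
        self._counts[key] += count

    def freq(self, key):
        return self._counts[key]

    def total(self):
        return sum(self._counts.values())

    def keys(self):
        return self._counts.keys()


class FeatureMap(object):
    """Hypothetical stand-in: registers feature names as they are seen."""

    def __init__(self):
        self._ids = {}

    def get(self, name):
        # Assign the next integer id the first time a feature is seen.
        return self._ids.setdefault(name, len(self._ids))

    def keys(self):
        return self._ids.keys()


# Tiny made-up data set: label followed by a feature dictionary.
training_data = [
    ("spam", {"contains_offer": True, "all_caps_subject": True}),
    ("spam", {"contains_offer": True, "all_caps_subject": False}),
    ("ham", {"contains_offer": False, "all_caps_subject": False}),
    ("ham", {"contains_offer": False, "all_caps_subject": True}),
]

classifier = NaiveBayesClassifier()
classifier.train(training_data)

scores = classifier.classify({"contains_offer": True, "all_caps_subject": True})
best_label = max(scores.items(), key=lambda kv: kv[1])[0]
print(best_label)  # expected: spam

The returned scores are log-probabilities, so the label with the largest (least negative) value is the prediction.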