def prob_classify(self, featureset):
    """Calculate the probability of each classification for the given
    featureset and return the result as a DictionaryProbDist instance.

    Runs in O(nm), with n = # of labels and m = # of featureset elements.
    """
    # Work on a copy of the feature set, because we mutate it.
    fset = featureset.copy()

    for fname in featureset:
        for label in self._labels:
            if (label, fname) in self._feature_probdist:
                break
        else:
            # Discard feature names we haven't been trained on from the
            # input set.
            del fset[fname]

    # Now we're working with a feature set that only includes known
    # features.

    # Instead of working with the product of the separate probabilities,
    # we use the sum of the logarithms to prevent underflows and make the
    # result more stable.

    #: The probability of each label, given the features. Starting with
    #: the probability of the label itself.
    logprob = {}
    for label in self._labels:
        logprob[label] = self._label_probdist.logprob(label)

    # Add the logarithmic probability of the features given the labels.
    for label in self._labels:
        for fname, fval in fset.items():
            feature_probs = self._feature_probdist.get((label, fname))

            if feature_probs is not None:
                logprob[label] += feature_probs.logprob(fval)
            else:
                # This should not occur if the classifier was created
                # with the train() method.
                logprob[label] += sum_logs([])  # = -INF.

    return DictionaryProbDist(logprob, normalize=True, log=True)
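A minimal, self-contained sketch of why the method sums logarithms instead of multiplying raw probabilities (the numbers below are illustrative, not taken from the classifier):

import math

# With many features, the product of small probabilities underflows to
# 0.0 in double precision, making all label scores indistinguishable.
probs = [1e-5] * 80

product = 1.0
for p in probs:
    product *= p
print(product)  # 0.0 -- underflow

# The sum of the logarithms stays well within range and preserves the
# ordering between labels.
log_sum = sum(math.log(p) for p in probs)
print(log_sum)  # approximately -921.03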
def test_sum_logs_ninf(self):
    from twentiment.thirdparty.probability import sum_logs

    self.assertEqual(sum_logs([]), _NINF)
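For context, sum_logs of an empty sequence stands for the logarithm of a total probability of zero, which is negative infinity. Below is a minimal log-sum-exp sketch honoring that contract (an assumption for illustration; the actual implementation lives in twentiment.thirdparty.probability and may use a different log base):

import math

def sum_logs_sketch(logs):
    # log(sum(exp(x) for x in logs)); shifting by the maximum keeps the
    # exponentials in range. An empty input means probability 0, so we
    # return -inf, matching the test above.
    if not logs:
        return float('-inf')
    m = max(logs)
    return m + math.log(sum(math.exp(x - m) for x in logs))

assert sum_logs_sketch([]) == float('-inf')
assert abs(sum_logs_sketch([math.log(0.25), math.log(0.25)]) - math.log(0.5)) < 1e-12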