from collections import defaultdict

import numpy


class Classifier(object):
    """Stand-in for the real base class, which is defined elsewhere in the
    original codebase. From the call sites it is assumed to store the
    dataset, take the `target` and `attributes` callables from `problem`,
    and then trigger learning.
    """

    def __init__(self, dataset, problem):
        self.dataset = dataset
        self.target = problem.target
        self.attributes = problem.attributes
        self.learn()


class NaiveBayes(Classifier):
    """
    Implements a classifier that uses Bayes' theorem.
    """

    def learn(self):
        # Frequency count of target classes
        self.C = OnlineLogProbability()
        # Frequency count of P(Fi|C):
        self.Fi = defaultdict(lambda:       # For each class,
                  defaultdict(lambda:       # for each attribute,
                  OnlineLogProbability()))  # count each value seen.

        for example in self.dataset:
            class_ = self.target(example)
            self.C.add(class_)
            for attribute in self.attributes:
                value = attribute(example)
                self.Fi[class_][attribute].add(value)

        if not self.C:
            raise ValueError("Dataset is empty")

        # Cripple the defaultdicts down to plain dicts, so that missing
        # keys now raise KeyError instead of silently creating entries.
        self.Fi.default_factory = None
        for d in self.Fi.values():
            d.default_factory = None

    def classify(self, example):
        values = [(attribute, attribute(example))
                  for attribute in self.attributes]
        hypotheses = []
        for class_ in self.C:
            try:
                # ps holds log-probabilities, so summing them multiplies
                # the underlying probabilities.
                ps = [self.Fi[class_][attr][val] for attr, val in values]
            except KeyError:
                continue  # A value not seen in training: this class gets P == 0
            ps.append(self.C[class_])
            hypotheses.append((sum(ps), class_))

        if hypotheses:
            logprob, best = max(hypotheses)
            # Normalize with log-sum-exp over all surviving classes
            Z = numpy.logaddexp.reduce([p for p, _ in hypotheses])
            logprob = logprob - Z
        else:
            # No class could score the example; return the best a priori class
            logprob, best = max((p, class_) for class_, p in self.C.items())
        p = numpy.exp(logprob)
        assert 0.0 <= p <= 1.0
        return best, p
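# The listing above relies on an OnlineLogProbability helper that is not
# part of this excerpt. The class below is a minimal sketch of the interface
# the call sites imply (add() counts observations, indexing returns
# log-probabilities and raises KeyError for unseen values); the real
# implementation may differ, e.g. by caching log counts incrementally.
import math


class OnlineLogProbability(object):

    def __init__(self):
        self.counts = {}
        self.total = 0

    def add(self, value):
        # Record one more observation of `value`
        self.counts[value] = self.counts.get(value, 0) + 1
        self.total += 1

    def __getitem__(self, value):
        # log P(value); unseen values raise KeyError, as classify() expects
        return math.log(self.counts[value]) - math.log(self.total)

    def __iter__(self):
        return iter(self.counts)

    def __len__(self):
        return len(self.counts)  # so an empty instance is falsy

    def items(self):
        return [(value, self[value]) for value in self.counts]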
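# A toy run of the classifier, sketched under assumptions: the stand-in
# Classifier above fixes the constructor as (dataset, problem), and the
# weather/play records plus the `problem` namespace are made up for this
# demo; the real codebase's problem object may look different.
from operator import itemgetter
from types import SimpleNamespace

if __name__ == "__main__":
    examples = [
        {"weather": "sunny", "windy": False, "play": "yes"},
        {"weather": "sunny", "windy": True, "play": "no"},
        {"weather": "rainy", "windy": True, "play": "no"},
        {"weather": "rainy", "windy": False, "play": "yes"},
    ]
    problem = SimpleNamespace(
        target=itemgetter("play"),
        attributes=[itemgetter("weather"), itemgetter("windy")])
    nb = NaiveBayes(examples, problem)
    print(nb.classify({"weather": "sunny", "windy": False}))
    # -> ('yes', 1.0): no "no" example was ever seen with windy=False,
    # so that class is ruled out and "yes" takes all the posterior mass.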