def classify(self, text):
    """Classify some text using the result of the CategoryPredictor MRJob.

    We use a basic naive-bayes model, i.e.

        argmax_category p(category) * p(words | category)
        == argmax_category p(category) * pi_{i in words} p(word_i | category)

    p(category) is stored in self.category_prob; p(word | category) is
    stored in self.word_given_cat_prob.

    Returns a dict mapping each category to its normalized probability.
    """
    # start with log p(category) for each category
    lg_scores = self.category_prob.copy()

    # Then fold in the per-word probabilities.  We ADD because the
    # distributions hold log probabilities, which are more accurate for
    # tiny values -- see http://en.wikipedia.org/wiki/Log_probability
    for word in category_predictor.words(text):
        for cat in lg_scores:
            cat_probs = self.word_given_cat_prob[cat]
            if word in cat_probs:
                lg_scores[cat] += cat_probs[word]
            else:
                # unseen word: fall back to the smoothed 'UNK' estimate
                lg_scores[cat] += cat_probs['UNK']

    # Convert log scores back to linear space.  Subtracting the max
    # log-score first keeps math.exp() from underflowing to 0.0 on long
    # documents (which would make `total` zero and the division below
    # raise ZeroDivisionError); the shift cancels out in the
    # normalization, so the returned probabilities are unchanged.
    max_lg = max(lg_scores.values())
    scores = dict((cat, math.exp(score - max_lg))
                  for cat, score in lg_scores.items())

    # Normalize the scores -- not strictly necessary for an argmax, but
    # it's nice to report probabilities with our guesses.
    total = sum(scores.values())
    return dict((cat, prob / total)
                for cat, prob in scores.items())
def classify(self, text):
    """Score *text* against every category with the naive-bayes model
    trained by the CategoryPredictor MRJob.

    Implements argmax_category p(category) * prod_i p(word_i | category),
    where p(category) comes from self.category_prob and
    p(word | category) from self.word_given_cat_prob.

    Returns a dict of category -> normalized probability.
    """
    # Seed the accumulators with the category priors.  All of these are
    # log probabilities, so multiplying distributions becomes addition
    # (and we dodge float underflow on tiny values); see
    # http://en.wikipedia.org/wiki/Log_probability
    log_probs = self.category_prob.copy()

    for token in category_predictor.words(text):
        for category in log_probs:
            word_probs = self.word_given_cat_prob[category]
            if token in word_probs:
                log_probs[category] += word_probs[token]
            else:
                # out-of-vocabulary token: charge the 'UNK' estimate
                log_probs[category] += word_probs['UNK']

    # Leave log space ...
    raw_scores = {}
    for category, lg in log_probs.iteritems():
        raw_scores[category] = math.exp(lg)

    # ... and renormalize.  Not strictly required for picking a winner,
    # but it lets callers see a probability next to each guess.
    denominator = sum(raw_scores.itervalues())
    normalized = {}
    for category, score in raw_scores.iteritems():
        normalized[category] = score / denominator
    return normalized
def classify(self, text):
    """Classify *text* with a naive-bayes model and return a dict
    mapping each category to a normalized probability.

    self.category_prob and self.word_given_cat_prob hold log
    probabilities (the loop below adds them, and they are exp()'d at
    the end), presumably produced by the CategoryPredictor job --
    TODO confirm against the trainer.
    """
    # start from the per-category priors (log p(category))
    lg_scores = self.category_prob.copy()
    # NOTE(review): category_predictor.words() appears to tokenize the
    # input text -- verify in the category_predictor module.
    for word in category_predictor.words(text):
        for cat in lg_scores:
            cat_probs = self.word_given_cat_prob[cat]
            if word in cat_probs:
                # adding log probs == multiplying probabilities
                lg_scores[cat] += cat_probs[word]
            else:
                # unseen word: fall back to the catch-all 'UNK' entry
                lg_scores[cat] += cat_probs['UNK']
    # convert the log scores back to linear-space scores
    scores = dict((cat, math.exp(score)) for cat, score in lg_scores.iteritems())
    # normalize so the returned values sum to 1
    total = sum(scores.itervalues())
    return dict((cat, prob / total) for cat, prob in scores.iteritems())