Example #1
	def classify(self, text):
		"""Classify some text using the result of the
		CategoryPredictor MRJob. We use a basic naive-bayes model,
		eg, argmax_category p(category) * p(words | category) ==
		p(category) * pi_{i \in words} p(word_i | category).

		p(category) is stored in self.category_prob, p(word | category
		is in self.word_given_cat_prob.
		"""
		# start with prob(category)
		lg_scores = self.category_prob.copy()

		# then multiply in the individual word probabilities
		# NOTE: we're actually adding here, but that's because our
		# distributions are made up of log probabilities, which are
		# more accurate for small probabilities. See
		# http://en.wikipedia.org/wiki/Log_probability for more
		# details.
		for word in category_predictor.words(text):
			for cat in lg_scores:
				cat_probs = self.word_given_cat_prob[cat]

				if word in cat_probs:
					lg_scores[cat] += cat_probs[word]
				else:
					lg_scores[cat] += cat_probs['UNK']

		# convert scores to a non-log value
		scores = {cat: math.exp(score) for cat, score in lg_scores.items()}

		# normalize the scores again - this isn't strictly necessary,
		# but it's nice to report probabilities with our guesses
		total = sum(scores.values())
		return {cat: prob / total for cat, prob in scores.items()}
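
The method above assumes two model dictionaries built upstream by the CategoryPredictor MRJob: self.category_prob maps each category to its log prior, and self.word_given_cat_prob maps each category to per-word log likelihoods, including an 'UNK' entry for words never seen with that category. A minimal, self-contained sketch of how such a model might be wired up and queried - the ToyClassifier class, the toy log probabilities, and the words() tokenizer stub below are illustrative assumptions, not part of the original job:

import math


# Hypothetical stand-in for the category_predictor module's tokenizer;
# the real MRJob ships its own words() helper.
def words(text):
    return text.lower().split()


class ToyClassifier:
    def __init__(self):
        # log priors: p(category), stored as log probabilities
        self.category_prob = {
            'food': math.log(0.6),
            'service': math.log(0.4),
        }
        # log likelihoods: p(word | category), with an 'UNK' bucket
        # for words unseen with that category during training
        self.word_given_cat_prob = {
            'food': {'pizza': math.log(0.05), 'UNK': math.log(0.001)},
            'service': {'waiter': math.log(0.04), 'UNK': math.log(0.001)},
        }

    def classify(self, text):
        lg_scores = self.category_prob.copy()
        for word in words(text):
            for cat in lg_scores:
                cat_probs = self.word_given_cat_prob[cat]
                lg_scores[cat] += cat_probs.get(word, cat_probs['UNK'])
        scores = {cat: math.exp(score) for cat, score in lg_scores.items()}
        total = sum(scores.values())
        return {cat: prob / total for cat, prob in scores.items()}


print(ToyClassifier().classify('pizza was great'))
# roughly {'food': 0.99, 'service': 0.01}

With the toy numbers above, 'pizza' pulls the posterior heavily toward 'food', while the two unknown words contribute the same UNK penalty to both categories and therefore cancel out in the normalized result.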
Example #3
	def classify(self, text):
		# start from the log prior, p(category)
		lg_scores = self.category_prob.copy()

		# add log p(word | category) for each word, falling back to
		# the 'UNK' bucket for words unseen in that category
		for word in category_predictor.words(text):
			for cat in lg_scores:
				cat_probs = self.word_given_cat_prob[cat]

				if word in cat_probs:
					lg_scores[cat] += cat_probs[word]
				else:
					lg_scores[cat] += cat_probs['UNK']

		# exponentiate the log scores and normalize to probabilities
		scores = {cat: math.exp(score) for cat, score in lg_scores.items()}
		total = sum(scores.values())
		return {cat: prob / total for cat, prob in scores.items()}
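
Both examples add log probabilities instead of multiplying raw ones: since log(a*b) = log(a) + log(b), a sum of logs is equivalent to the product, and it sidesteps floating-point underflow when many small factors are involved. A tiny demonstration with made-up numbers:

import math

probs = [1e-5] * 100  # one hundred small word probabilities

# multiplying directly underflows to exactly 0.0 ...
product = 1.0
for p in probs:
    product *= p
print(product)  # 0.0

# ... while the equivalent sum of logs stays representable
log_product = sum(math.log(p) for p in probs)
print(log_product)  # about -1151.3, i.e. log(1e-500)

Note that classify() still exponentiates the final log scores directly, which can itself underflow for long documents; a common guard (not in the original code) is to subtract max(lg_scores.values()) from every score before calling math.exp, which leaves the normalized probabilities unchanged.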