Example #1: train()
    def train(self, training_data=None):
        # Optionally fold new labelled documents into the training set.
        if training_data:
            self.docs.extend(training_data)

        # Rebuild tallies from scratch so repeated train() calls don't double-count.
        self.vocabulary = {}
        self.word_counts = {category: {} for category, _ in self.docs}

        # Tally word occurrences per category and across the whole corpus.
        for category, doc in self.docs:
            for word in th.tokenize(doc):
                self.word_counts[category][word] = self.word_counts[category].get(word, 0.0) + 1
                self.vocabulary[word] = self.vocabulary.get(word, 0.0) + 1

        # P(word | category) = count of word in category / total words in category.
        for word in self.vocabulary:
            self.probabilities[word] = {
                category: counts.get(word, 0.0) / sum(counts.values())
                for category, counts in self.word_counts.items()
            }
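
The method assumes scaffolding the example never shows: a class that initialises docs, word_counts, vocabulary, and probabilities, and a tokenization helper imported as th. Below is a minimal sketch of that scaffolding; the class name, attribute layout, and regex tokenizer are all assumptions rather than the original code.

    import re

    class th:
        """Stand-in for the tokenization helper the examples call as th.tokenize()."""
        @staticmethod
        def tokenize(text):
            # Assumed behaviour: lowercase, keep runs of letters and apostrophes.
            return re.findall(r"[a-z']+", text.lower())

    class NaiveBayes:
        def __init__(self, docs=None):
            self.docs = list(docs) if docs else []  # (category, document) pairs
            self.word_counts = {}    # category -> {word: count}
            self.vocabulary = {}     # word -> corpus-wide word count
            self.probabilities = {}  # word -> {category: P(word | category)}
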
Example #2: classify()
    def classify(self, sentence):
        classes = {c: 0.0 for c in self.word_counts}

        # P(word | category) for each word in the sentence; words never seen
        # during training contribute a zero probability for every category.
        doc_probs = [
            self.probabilities.get(word, {c: 0.0 for c in classes})
            for word in th.tokenize(sentence)
        ]

        # Class priors, proportional to each category's vocabulary size.
        prior_denom = sum(len(self.word_counts[c]) for c in classes)

        post_nums = {}  # posterior numerators: P(sentence | c) * P(c)
        for c in classes:
            # Skip zero entries so a single unseen word does not wipe out the product.
            class_probs = [probs[c] for probs in doc_probs if probs[c] != 0.0]
            prob = self.__product(class_probs) if class_probs else 0.0
            prior = len(self.word_counts[c]) / prior_denom
            post_nums[c] = prob * prior

        # Normalise the numerators and return the most probable category.
        post_denom = sum(post_nums.values())
        if not post_denom:
            return 'unknown'
        posteriors = {c: v / post_denom for c, v in post_nums.items()}
        return max(posteriors, key=posteriors.get)
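
classify() relies on a private self.__product helper that neither example defines. A minimal sketch, assuming it does nothing more than multiply a list of probabilities together (it would sit on the same class as the two methods above):

    def __product(self, values):
        # Multiply the per-word probabilities together;
        # math.prod is the standard-library equivalent on Python 3.8+.
        result = 1.0
        for value in values:
            result *= value
        return result

Assuming the scaffolding sketched earlier is combined with both example methods into a single class, usage might look like this (the training sentences are illustrative only):

    >>> nb = NaiveBayes([('spam', 'win a free prize now'),
    ...                  ('ham', 'see you at the meeting tomorrow')])
    >>> nb.train()
    >>> nb.classify('win a free prize')
    'spam'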