def test_ngrams(self):
    """Bigram tokenization pads the first pair with a NUL sentinel token."""
    expected = [
        [u'\u0000', "quick"],
        ["quick", "brown"],
        ["brown", "fox"],
    ]
    result = Tokenizer.ngram("quick brown fox", 2)
    self.assertEqual(result, expected)
def test_ngrams(self):
    """Bigram tokenization pads the first pair with a NUL sentinel token."""
    expected = [
        [u'\u0000', 'quick'],
        ['quick', 'brown'],
        ['brown', 'fox']
    ]
    result = Tokenizer.ngram('quick brown fox', 2)
    self.assertEqual(result, expected)
def train(self):
    """Tally per-category token counts from the queued (category, path) pairs.

    Iterates ``self.to_train``, incrementing ``self.training[category][token]``
    and the ``self.totals`` counters for every unique token in each email body.
    Consumes the queue: ``self.to_train`` is reset to an empty dict afterwards,
    so repeated calls are no-ops until more data is queued.
    """
    for category, path in self.to_train:
        # Context manager closes the file handle; the original leaked one
        # open handle per trained email. 'path' also avoids shadowing the
        # builtin 'file'.
        with io.open(path, 'rb') as fh:
            email = EmailObject(fh)
            self.categories.add(category)
            for token in Tokenizer.unique_tokenizer(email.body()):
                self.training[category][token] += 1
                self.totals['_all'] += 1
                self.totals[category] += 1
    self.to_train = {}
def score(self, email):
    """Return a dict mapping each known category to an unnormalized score.

    Naive Bayes: starts from the category prior (category total / grand
    total) and multiplies in an add-one-smoothed likelihood for every
    unique token in the email body.

    :param email: object whose body() yields the text to score
    :return: dict of category -> float score (products, not normalized)
    """
    self.train()  # flush any queued training data before scoring
    cat_totals = self.totals
    # float() guards against integer-division truncation under Python 2
    # (this file uses py2-era idioms such as u'' literals and io.open),
    # where these priors would otherwise all truncate to 0.
    aggregates = {
        cat: float(cat_totals[cat]) / cat_totals['_all']
        for cat in self.categories
    }
    for token in Tokenizer.unique_tokenizer(email.body()):
        for cat in self.categories:
            value = self.training[cat][token]
            # Add-one (Laplace) smoothing so unseen tokens do not zero
            # out the running product.
            r = float(value + 1) / (cat_totals[cat] + 1)
            aggregates[cat] *= r
    return aggregates
def score(self, email):
    """
    Calculates an unnormalized Naive Bayes score per category.

    Starts from each category's prior (category total / grand total) and
    multiplies in an add-one-smoothed likelihood for every unique token
    in the email body.

    :param email: EmailObject
    :return: dict of category -> float score (not normalized)
    """
    self.train()  # flush any queued training data before scoring
    cat_totals = self.totals
    # float() guards against integer-division truncation under Python 2
    # (this file uses py2-era idioms such as u'' literals and io.open),
    # where these priors would otherwise all truncate to 0.
    aggregates = {
        cat: float(cat_totals[cat]) / cat_totals['_all']
        for cat in self.categories
    }
    for token in Tokenizer.unique_tokenizer(email.body()):
        for cat in self.categories:
            value = self.training[cat][token]
            # Add-one (Laplace) smoothing so unseen tokens do not zero
            # out the running product.
            r = float(value + 1) / (cat_totals[cat] + 1)
            aggregates[cat] *= r
    return aggregates
def test_downcasing(self):
    """Tokenizer lowercases its input before splitting into words."""
    result = Tokenizer.tokenize("THIS IS ALL CAPS")
    self.assertEqual(result, ["this", "is", "all", "caps"])
def test_cowncasting(self):
    """Tokenizer lowercases its input before splitting into words."""
    # NOTE(review): method name looks like a typo for test_downcasing;
    # kept as-is since renaming would collide with the sibling test of
    # that name elsewhere in this file.
    result = Tokenizer.tokenize('THIS IS ALL CAPS')
    self.assertEqual(result, ['this', 'is', 'all', 'caps'])