Ejemplo n.º 1
0
 def setUp(self):
     """Prepare the training list, the trainer and the e-mail under test.

     ``self.email`` is an ``EmailObject`` created from ``plain.eml``, which is
     opened in binary mode and closed again once the object has been built.
     """
     labelled_fixtures = [
         ['spam', './tests/fixtures/plain.eml'],
         ['ham', './tests/fixtures/small.eml'],
         ['scram', './tests/fixtures/plain.eml'],
     ]
     self.training = labelled_fixtures
     self.trainer = SpamTrainer(self.training)

     with io.open('./tests/fixtures/plain.eml', 'rb') as handle:
         self.email = EmailObject(handle)
def label_to_training_data(fold_file):
    """Build a ``SpamTrainer`` from the entries listed in ``fold_file``.

    Every line of the file has its trailing whitespace removed and is then
    split on single spaces; the resulting list becomes one training entry.
    Presumably each line is ``<label> <path>`` -- confirm against the fold
    files actually used.

    Args:
        fold_file: Path of the text file to read.

    Returns:
        A ``SpamTrainer`` constructed from the list of per-line entries.
    """
    # Use a context manager so the handle is closed deterministically, even
    # when reading fails part-way; previously it was left to the garbage
    # collector.
    with io.open(fold_file, "r") as fold:
        training_data = [line.rstrip().split(" ") for line in fold]

    return SpamTrainer(training_data)
Ejemplo n.º 3
0
def label_to_training_data(fold_file):
  """Build a ``SpamTrainer`` from the entries listed in ``fold_file``.

  Every line of the file has its trailing whitespace removed and is then
  split on single spaces; the resulting list becomes one training entry.
  Presumably each line is ``<label> <path>`` -- confirm against the fold
  files actually used.

  Args:
    fold_file: Path of the text file to read.

  Returns:
    A ``SpamTrainer`` constructed from the list of per-line entries.
  """
  # Text mode: in binary mode the lines are bytes on Python 3 and
  # ``split(' ')`` raises TypeError. The context manager also closes the
  # handle, which was previously leaked.
  with io.open(fold_file, 'r') as fold:
    training_data = [line.rstrip().split(' ') for line in fold]

  # Diagnostic output kept from the original; the call form is valid on both
  # Python 2 and Python 3, unlike the former print statement.
  print(training_data)
  return SpamTrainer(training_data)
  def test_give_preference_to_whatever_has_the_most(self):
    """classify() must pair the last preference() category with its score."""
    all_scores = self.trainer.score(self.email)

    # The expected winner is the final entry of the preference ordering.
    top_category = self.trainer.preference()[-1]
    expected = SpamTrainer.Classification(top_category,
                                          all_scores[top_category])

    self.assertEqual(self.trainer.classify(self.email), expected)
class TestSpamTrainer(unittest.TestCase):
  """Tests for ``SpamTrainer`` built from three labelled fixture e-mails."""

  def setUp(self):
    """Create the training list, the trainer and the e-mail under test."""
    self.training = [
      ['spam', './tests/fixtures/plain.eml'],
      ['ham', './tests/fixtures/small.eml'],
      ['scram', './tests/fixtures/plain.eml'],
    ]
    self.trainer = SpamTrainer(self.training)
    with io.open('./tests/fixtures/plain.eml', 'rb') as handle:
      self.email = EmailObject(handle)

  def test_multiple_categories(self):
    """The trainer's categories equal the set of labels it was given."""
    actual = self.trainer.categories
    labels = {label for label, _ in self.training}
    self.assertEqual(actual, labels)

  def test_counts_all_at_zero(self):
    """total_for() is 0 for '_all' and for every category after setUp."""
    for name in ('_all', 'spam', 'ham', 'scram'):
      self.assertEqual(self.trainer.total_for(name), 0)

  def test_preference_category(self):
    """preference() lists the categories ordered by ascending total_for()."""
    by_total = sorted(self.trainer.categories, key=self.trainer.total_for)

    self.assertEqual(self.trainer.preference(), by_total)

  def test_probability_being_1_over_n(self):
    """score() gives (almost) the same value for every category."""
    values = list(self.trainer.score(self.email).values())

    # assertAlmostEqual compares after rounding, so tiny floating-point
    # differences between the scores do not fail the test.
    self.assertAlmostEqual(values[0], values[-1])

    # Check every pair of neighbouring scores as well.
    for current, following in zip(values, values[1:]):
      self.assertAlmostEqual(current, following)

  def test_adds_up_to_one(self):
    """normalized_score() sums to ~1 and its first value is ~0.5."""
    values = list(self.trainer.normalized_score(self.email).values())
    self.assertAlmostEqual(sum(values), 1)
    self.assertAlmostEqual(values[0], 0.5)

  def test_give_preference_to_whatever_has_the_most(self):
    """classify() must pair the last preference() category with its score."""
    all_scores = self.trainer.score(self.email)

    top_category = self.trainer.preference()[-1]
    expected = SpamTrainer.Classification(top_category,
                                          all_scores[top_category])

    self.assertEqual(self.trainer.classify(self.email), expected)
Ejemplo n.º 6
0
class TestSpamTrainer(unittest.TestCase):
    """Exercise ``SpamTrainer`` with a spam, a ham and a scram fixture."""

    def setUp(self):
        """Build the trainer and load ``plain.eml`` as the e-mail to score."""
        self.training = [
            ['spam', './tests/fixtures/plain.eml'],
            ['ham', './tests/fixtures/small.eml'],
            ['scram', './tests/fixtures/plain.eml'],
        ]
        self.trainer = SpamTrainer(self.training)
        with io.open('./tests/fixtures/plain.eml', 'rb') as fixture:
            self.email = EmailObject(fixture)

    def test_multiple_categories(self):
        """Every training label shows up as a trainer category."""
        found = self.trainer.categories
        wanted = {entry_label for entry_label, _path in self.training}
        self.assertEqual(found, wanted)

    def test_counts_all_at_zero(self):
        """All totals, including the '_all' bucket, are 0 after setUp."""
        for bucket in ('_all', 'spam', 'ham', 'scram'):
            self.assertEqual(self.trainer.total_for(bucket), 0)

    def test_preference_category(self):
        """preference() matches the categories sorted by total_for()."""
        subject = self.trainer
        ordered = sorted(subject.categories, key=subject.total_for)

        self.assertEqual(subject.preference(), ordered)

    def test_probability_being_1_over_n(self):
        """All category scores for the e-mail are (almost) equal."""
        per_category = list(self.trainer.score(self.email).values())

        # Compare the two ends first, then each neighbouring pair.
        self.assertAlmostEqual(per_category[0], per_category[-1])

        for left, right in zip(per_category, per_category[1:]):
            self.assertAlmostEqual(left, right)

    def test_adds_up_to_one(self):
        """Normalized scores total ~1 and the first of them is ~0.5."""
        normalized = list(self.trainer.normalized_score(self.email).values())
        self.assertAlmostEqual(sum(normalized), 1)
        self.assertAlmostEqual(normalized[0], 0.5)

    def test_give_preference_to_whatever_has_the_most(self):
        """classify() returns the last preferred category and its score."""
        subject = self.trainer
        scored = subject.score(self.email)

        winner = subject.preference()[-1]
        expected = SpamTrainer.Classification(winner, scored[winner])

        self.assertEqual(subject.classify(self.email), expected)
Ejemplo n.º 7
0
class TestSpamTrainer(unittest.TestCase):
  """Checks ``SpamTrainer`` categories, totals, scores and classification."""

  def setUp(self):
    """Set up the labelled fixtures, the trainer and the e-mail to score."""
    spam_entry = ['spam', './tests/fixtures/plain.eml']
    ham_entry = ['ham', './tests/fixtures/small.eml']
    scram_entry = ['scram', './tests/fixtures/plain.eml']
    self.training = [spam_entry, ham_entry, scram_entry]

    self.trainer = SpamTrainer(self.training)
    with io.open('./tests/fixtures/plain.eml', 'rb') as source:
      self.email = EmailObject(source)

  def test_multiple_categories(self):
    """Categories known to the trainer are exactly the training labels."""
    known = self.trainer.categories
    self.assertEqual(known, {pair[0] for pair in map(tuple, self.training)
                             if len(pair) == 2})

  def test_counts_all_at_zero(self):
    """total_for() reports 0 for '_all', 'spam', 'ham' and 'scram'."""
    names = ['_all', 'spam', 'ham', 'scram']
    for name in names:
      self.assertEqual(self.trainer.total_for(name), 0)

  def test_preference_category(self):
    """preference() equals the categories in ascending total_for() order."""
    ascending = list(self.trainer.categories)
    ascending.sort(key=self.trainer.total_for)

    self.assertEqual(self.trainer.preference(), ascending)

  def test_probability_being_1_over_n(self):
    """Scores of all categories agree to within assertAlmostEqual."""
    results = list(self.trainer.score(self.email).values())

    self.assertAlmostEqual(results[0], results[-1])

    # Walk the list pairwise: element n against element n + 1.
    for earlier, later in zip(results[:-1], results[1:]):
      self.assertAlmostEqual(earlier, later)

  def test_adds_up_to_one(self):
    """normalized_score() values sum to ~1; the first one is ~0.5."""
    results = list(self.trainer.normalized_score(self.email).values())
    total = sum(results)
    self.assertAlmostEqual(total, 1)
    self.assertAlmostEqual(results[0], 0.5)

  def test_give_preference_to_whatever_has_the_most(self):
    """classify() yields the last preference() entry with its score."""
    raw_scores = self.trainer.score(self.email)

    favourite = self.trainer.preference()[-1]
    favourite_score = raw_scores[favourite]

    self.assertEqual(
      self.trainer.classify(self.email),
      SpamTrainer.Classification(favourite, favourite_score),
    )
Ejemplo n.º 8
0
 def setUp(self):
   """Create the training list, the trainer and the e-mail under test.

   ``plain.eml`` is opened in binary mode only for as long as it takes to
   build the ``EmailObject`` stored in ``self.email``.
   """
   self.training = [
     ['spam', './tests/fixtures/plain.eml'],
     ['ham', './tests/fixtures/small.eml'],
     ['scram', './tests/fixtures/plain.eml'],
   ]
   self.trainer = SpamTrainer(self.training)

   with io.open('./tests/fixtures/plain.eml', 'rb') as handle:
     self.email = EmailObject(handle)