Ejemplo n.º 1
0
 def setUp(self):
     """Create a trainer over three labelled fixtures and load one email.

     The training set pairs the labels 'spam', 'ham' and 'scram' with
     fixture paths (the plain fixture is used for both 'spam' and 'scram').
     The plain fixture is additionally parsed into ``self.email``.
     """
     plain_path = './tests/fixtures/plain.eml'
     small_path = './tests/fixtures/small.eml'

     self.training = [
         ['spam', plain_path],
         ['ham', small_path],
         ['scram', plain_path],
     ]
     self.trainer = SpamTrainer(self.training)

     with io.open(plain_path, 'rb') as handle:
         self.email = EmailObject(handle)
Ejemplo n.º 2
0
def parse_emails(keyfile):
  """Parse every email listed in a key file.

  Each non-blank line of the key file is expected to hold a label and a
  file path separated by exactly one space. Blank lines are skipped.

  Args:
    keyfile: Path of the key file to read.

  Returns:
    A list with one EmailObject per listed email, in key-file order, each
    constructed with ``category`` set to the line's label.

  Raises:
    ValueError: If a non-blank line does not split into exactly two
      space-separated fields.
  """
  emails = []
  # Call-form print of a single expression behaves the same on Python 2.
  print("Parsing emails for " + keyfile)

  # NOTE(review): binary mode is kept from the original (Python 2 code);
  # under Python 3 the lines would be bytes and split(' ') needs text mode.
  with io.open(keyfile, 'rb') as key_handle:
    for line in key_handle:
      entry = line.rstrip()
      if not entry:
        # A blank line carries no label/path pair; skip it rather than
        # failing on the tuple unpacking below.
        continue

      label, path = entry.split(' ')

      # Close each email file as soon as it has been handed to EmailObject.
      # Assumes EmailObject consumes the stream in its constructor -- TODO
      # confirm against its definition.
      with io.open(path, 'rb') as eml_file:
        emails.append(EmailObject(eml_file, category=label))

  print("Done parsing files for " + keyfile)
  return emails
Ejemplo n.º 3
0
def parse_emails(keyfile):
    """Parse every email listed in a key file.

    Each non-blank line of the key file is expected to hold a label and a
    file path separated by exactly one space. Blank lines are skipped.

    Args:
        keyfile: Path of the key file to read (opened in text mode).

    Returns:
        A list with one EmailObject per listed email, in key-file order,
        each constructed with ``category`` set to the line's label.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            space-separated fields.
    """
    emails = []
    print("Parsing emails for " + keyfile)

    # Open the key file in a context manager so its handle is released even
    # if one of the listed emails fails to open or parse.
    with io.open(keyfile, 'r') as key_handle:
        for line in key_handle:
            entry = line.rstrip()
            if not entry:
                # A blank line carries no label/path pair; skip it rather
                # than failing on the tuple unpacking below.
                continue

            label, path = entry.split(' ')

            with io.open(path, 'rb') as eml_file:
                emails.append(EmailObject(eml_file, category=label))

    print("Done parsing files for " + keyfile)
    return emails
Ejemplo n.º 4
0
  def train(self):
    """Fold every queued (category, path) pair into the training counts.

    For each pair in ``self.to_train`` the email at the path is parsed, the
    category is recorded in ``self.categories``, and each unique token of
    the email body increments the per-category token count, the per-category
    total and the '_all' total. The queue is reset once all pairs are
    processed.
    """
    for category, path in self.to_train:
      # Close the file as soon as EmailObject has been built from it; the
      # original left one open handle per training example.
      # Assumes EmailObject consumes the stream in its constructor -- TODO
      # confirm against its definition.
      with io.open(path, 'rb') as eml_file:
        email = EmailObject(eml_file)

      self.categories.add(category)

      for token in Tokenizer.unique_tokenizer(email.body()):
        self.training[category][token] += 1
        self.totals['_all'] += 1
        self.totals[category] += 1

    self.to_train = {}
Ejemplo n.º 5
0
    def train(self):
        """Fold queued (category, path) pairs into the model statistics.

        Each queued email is parsed, its category is recorded, and every
        unique body token increments the per-category token count, the
        per-category total and the "_all" total.

        When the queue was non-empty, the "spam" and "ham" log priors are
        then set from this batch's labels (any category other than "spam"
        counts toward "ham"), ``self.B`` is set to the number of distinct
        tokens across the "spam" and "ham" tables, and the queue is reset.
        """
        labels = []
        for category, path in self.to_train:
            with io.open(path, "rb") as handle:
                message = EmailObject(handle)

            self.categories.add(category)
            if category == "spam":
                labels.append(1)
            else:
                labels.append(0)

            for token in Tokenizer.unique_tokenizer(message.body()):
                self.training[category][token] += 1
                self.totals["_all"] += 1
                self.totals[category] += 1

        # Nothing queued: leave priors, vocabulary size and queue untouched.
        if not self.to_train:
            return

        labels = np.array(labels)
        batch_size = labels.shape[0]

        # NOTE(review): if the batch contains no "spam" (or only "spam")
        # examples, one of these fractions is 0 and math.log raises
        # ValueError -- confirm whether callers rely on that.
        spam_fraction = sum(labels == 1) / batch_size
        ham_fraction = sum(labels == 0) / batch_size
        self.class_log_prior["spam"] = math.log(spam_fraction)
        self.class_log_prior["ham"] = math.log(ham_fraction)

        spam_vocab = set(self.training["spam"].keys())
        ham_vocab = set(self.training["ham"].keys())
        self.B = len(spam_vocab.union(ham_vocab))

        self.to_train = {}
Ejemplo n.º 6
0
 def setUp(self):
     """Load the HTML fixture both as decoded text and as an EmailObject."""
     with io.open('fixtures/html.eml', 'rb') as handle:
         raw_bytes = handle.read()
         self.html = raw_bytes.decode('utf-8')
         # Rewind so EmailObject is handed the stream from its first byte.
         handle.seek(0)
         self.html_email = EmailObject(handle)
 def setUp(self):
     """Load the multipart fixture as decoded text and as an EmailObject.

     The fixture path is also kept on ``self.multipart_file``.
     """
     self.multipart_file = './tests/fixtures/multipart.eml'
     with io.open(self.multipart_file, 'rb') as handle:
         raw_bytes = handle.read()
         self.text = raw_bytes.decode('utf-8')
         # Rewind so EmailObject is handed the stream from its first byte.
         handle.seek(0)
         self.multipart_email = EmailObject(handle)
Ejemplo n.º 8
0
 def setUp(self):
     """Load the plain-text fixture as decoded text and as an EmailObject.

     The fixture path is also kept on ``self.plain_file``.
     """
     self.plain_file = 'fixtures/plain.eml'
     with io.open(self.plain_file, 'rb') as handle:
         raw_bytes = handle.read()
         self.text = raw_bytes.decode('utf-8')
         # Rewind so EmailObject is handed the stream from its first byte.
         handle.seek(0)
         self.plain_email = EmailObject(handle)