Example #1
 def test(self, directory):
     corp = Corpus(directory)
     result = {}
     if self.trained:
         for fname, body in corp.emails():
             spam_sum = 0
             ham_sum = 0
             examined = Email(fname, body)
             for spam in self.spams:
                 spam_sum += Email.compare_emails(examined, spam)
             for ham in self.hams:
                 ham_sum += Email.compare_emails(examined, ham)
             # SPAM when the mean similarity to known spam exceeds
             # the mean similarity to known ham.
             if spam_sum / len(self.spams) > ham_sum / len(self.hams):
                 result[fname] = 'SPAM'
             else:
                 result[fname] = 'OK'
     else:
         # Untrained fallback: count blacklisted words in the body.
         for fname, body in corp.emails():
             counter = 0
             for word in self.blacklist:
                 if word in body:
                     counter += 1
             result[fname] = 'SPAM' if counter > 3 else 'OK'
     write_classification_to_file(
         os.path.join(directory, '!prediction.txt'), result)
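This example hinges on an Email.compare_emails similarity that is defined elsewhere in the project. As a rough sketch only, a token-set Jaccard similarity is one plausible shape for such a measure; the class and method names follow the example, but the implementation below is an assumption, not the project's code.

    class Email:
        # Minimal hypothetical sketch; only what the similarity needs.
        def __init__(self, fname, body):
            self.fname = fname
            self.tokens = set(body.lower().split())

        @staticmethod
        def compare_emails(a, b):
            # Jaccard similarity of the two token sets, in [0, 1].
            if not a.tokens and not b.tokens:
                return 0.0
            return len(a.tokens & b.tokens) / len(a.tokens | b.tokens)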
Example #2
 def test(self, test_corpus_dir):
     """
     Tests the given emails, labelling each as SPAM or HAM.
     :param test_corpus_dir: directory with emails
     """
     test_corpus = Corpus(test_corpus_dir)  # corpus instance for walking the emails
     # Load stored statistics; if training ran first, this refines them.
     self.load_from_memory()
     for name, msg in test_corpus.emails():  # file name and parsed email
         values = []
         for a in msg.keys():  # every email header field
             a = a.lower()
             if self.classification.get(a):
                 # Score this header field; values closer to 1 mean SPAM.
                 values.append(self.get_stat(a, msg))
         values.append(self.get_stat_payload(msg))  # score of the body/payload
         final_stat = sum(values) / len(values)  # final value is the mean score
         if final_stat > 0.5:  # closer to SPAM
             self.predictions[name] = 'SPAM'
         else:  # closer to HAM
             self.predictions[name] = 'OK'
     # Save the classification, then persist anything newly learned.
     utils.write_classification_to_file(test_corpus_dir, self.predictions)
     self.save_to_memory()
Example #3
    def test(self, path):
        corp = Corpus(path)
        bs = Bayesian()
        sender_bl = load_pickle('sender_bl.pickle')
        # Scan each email and decide whether it is SPAM or HAM:
        # first check whether the sender occurs in the sender blacklist,
        # then score the words using the Bayesian approach.
        for fname, body in corp.emails():
            sender = find_sender(body)
            if sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            tokens = tokenize(body)
            # Compute the spamicity of each word and collect the values
            spamicity_list = [[el, bs.word_spamicity(el)] for el in tokens]
            # Remove duplicates, then put the most "interesting" tokens first:
            # spamicity farthest from the neutral 0.5 is the strongest evidence.
            spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]
            spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
            prediction = bs.bayes_pred(spamicity_list[:15])  # only the top 15 'words'
            if prediction > 0.9:  # blacklisted senders were already handled above
                self.tag_it(path, fname, 'SPAM')
            else:
                self.tag_it(path, fname, 'OK')
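The find_sender helper used above is also defined elsewhere in that project. A minimal sketch of what it might do, assuming the raw message text is scanned for a From: header; the regex and behaviour are illustrative, not the original:

    import re

    def find_sender(body):
        # Pull the first address-like token from a "From:" header line.
        m = re.search(r'^From:.*?([\w.+-]+@[\w.-]+)', body,
                      re.MULTILINE | re.IGNORECASE)
        return m.group(1) if m else None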
Example #4
    def test(self, path):
        '''This function determines which emails
        contain at least one "bad word"; those are
        marked as spam, the others as ham, and the
        classification is written to the !prediction.txt file.'''

        c = Corpus(path)
        for fname, body in c.emails():
            # Replace periods with spaces so "word." still matches "word".
            body = body.translate(str.maketrans('.', ' '))
            is_spam = any(word in body for word in self.bad_words)
            self.final_dict[fname] = 'SPAM' if is_spam else 'OK'

        with open(os.path.join(path, '!prediction.txt'), 'w',
                  encoding='utf-8') as fd:
            for name in os.listdir(path):
                if name[0] != '!':
                    fd.write(name + ' ' + self.final_dict[name] + '\n')
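The translate call above only maps periods to spaces. Since str.maketrans accepts two equal-length strings, the same idea extends to all punctuation; a small illustrative helper, not part of the example:

    import string

    def normalize(body):
        # Map every punctuation character to a space before the word scan.
        table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        return body.translate(table)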
Example #5
def test_atom_filter(initialized_filter, train_dir, test_dir):
    train_corp = TrainingCorpus(train_dir)
    test_corp = Corpus(test_dir)

    spam_filter = initialized_filter  # avoid shadowing the built-in filter()
    spam_filter.train(train_corp)
    prediction = dict()

    for name, mail in test_corp.emails():
        result = spam_filter.test(mail)
        if result == -1:  # the filter gave no verdict for this mail
            continue
        elif result > POSITIVITY_THRESHOLD:
            prediction[name] = POSITIVE
        else:
            prediction[name] = NEGATIVE

    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)

    matrix_dict = conf_matrix.as_dict()
    # For testing purposes
    print(matrix_dict)

    score = quality_score(matrix_dict['tp'],
                          matrix_dict['tn'],
                          matrix_dict['fp'],
                          matrix_dict['fn'])
    return score
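quality_score and BinaryConfusionMatrix come from the assignment's helper modules and are not shown here. A minimal stand-in that scores plain accuracy from the four counts could look like this; the real grader may weight false positives differently, so treat it as an assumption:

    def quality_score(tp, tn, fp, fn):
        # Plain accuracy; a real scorer might penalise false positives more.
        total = tp + tn + fp + fn
        return (tp + tn) / total if total else 0.0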
Example #6
    def train(
        self,
        file_path,
        batch_size=10,
        learning_rate=0.1,
        lr_decay=0.05,
        epochs=1000,
        momentum=0.0,
        tuning=False
    ):  # analogous to PLR_filter; with tuning=True, a graph of mean loss over epochs is plotted
        if tuning:
            og_lr = learning_rate  # original learning rate
            x_plt = []  # x-axis (epochs)
            y_plt = []  # y-axis (mean loss)
        corpus = Corpus(file_path)
        truth_dict = utils.read_classification_from_file(file_path +
                                                         "/!truth.txt")
        got_data = True
        mails_getter = corpus.emails()
        batches = []
        while got_data:
            batch = []
            for i in range(batch_size):
                try:
                    email = next(mails_getter)
                    batch.append(
                        (email[1],
                         1 if truth_dict[email[0]] == self.pos_tag else 0))
                except StopIteration:
                    got_data = False
                    break
            batches.append(batch)

        for e in range(epochs):
            if tuning:
                steps = 0
            print(learning_rate)  # log the current (decayed) learning rate
            self.init_momentums()
            loss = 0
            for batch in batches:
                batch_vectors = [m[0].get_feature_vector_lr() for m in batch]
                y = [m[1] for m in batch]
                loss += self.gradient_descent(y, batch_vectors, learning_rate,
                                              momentum)
                if tuning:
                    steps += 1
            print(f"trained on epoch #{e + 1}")
            learning_rate *= 1 / (1 + lr_decay * e)
            if tuning:
                y_plt.append(loss / steps)
                x_plt.append(e)
        if tuning:
            plt.plot(x_plt, y_plt)
            plt.title(
                f"lr:{og_lr} lrd:{lr_decay} bs:{batch_size} m: {momentum} e:{epochs}"
            )
            plt.xlabel("epochs")
            plt.ylabel("mean loss")
            plt.show()
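The epoch loop above updates the rate with learning_rate *= 1 / (1 + lr_decay * e), which compounds the decay from epoch to epoch (the same schedule appears in a later example), whereas the textbook time-based schedule divides the initial rate afresh each epoch. The sketch below only illustrates the difference between the two; it is not taken from the project:

    def compounded_lr(lr0, decay, epochs):
        # Schedule used above: multiply by 1/(1 + decay*e) after each epoch.
        lr = lr0
        for e in range(epochs):
            yield lr
            lr *= 1 / (1 + decay * e)

    def time_based_lr(lr0, decay, epochs):
        # Textbook time-based decay: lr_e = lr0 / (1 + decay*e).
        for e in range(epochs):
            yield lr0 / (1 + decay * e)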
Example #7
 def test(self, mails_path):
     try:
         os.remove(mails_path + "/!prediction.txt")
     except FileNotFoundError:  # no previous prediction file to remove
         pass
     corpus = Corpus(mails_path)
     with open(mails_path + "/!prediction.txt", 'a', encoding='utf-8') as f:
         for mail in corpus.emails():
             res = self.evaluate_mail(mail[1])
             f.write(f"{mail[0]} {self.pos_tag if res else self.neg_tag}\n")
Example #8
 def __init__(self, folder):
     self.folder = folder
     self.spams = []
     self.hams = []
     corp = Corpus(folder)
     for fname, content in corp.emails():
         if self.is_ham(fname):
             self.hams.append(Email(fname, content))
         else:
             self.spams.append(Email(fname, content))
Example #9
 def test_corpusContainsOnlyEmails(self):
     """Test reading the corpus with email messages only."""
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     for fname, contents in corpus.emails():
         observed[fname] = contents
     # Verify the results
     self.assertEqual(len(self.expected), len(observed),
                      'The emails() method did not generate all the corpus files.')
     self.assertEqual(self.expected, observed,
                      'The read file contents are not equal to the expected contents.')
Example #10
    def test(self, test_corpus_dir):
        test_corpus = Corpus(test_corpus_dir)
        with open(os.path.join(test_corpus_dir, "!prediction.txt"), "w+") as a_file:
            for filename, body in test_corpus.emails():
                if self.bayesian_combination(body) > 0.9 or self.get_email_adress(body) in self.black_list:
                    decision = "SPAM"
                else:
                    # A separate whitelist branch would also label the mail
                    # OK, so checking self.white_list here is redundant.
                    decision = "OK"
                a_file.write(filename + " " + decision + "\n")
Example #11
    def test(self, test_corpus_dir):
        '''
        Creates dict of classification and writes it to the file
        :param test_corpus_dir: path to test dir
        :return: None
        '''

        # Prepare "global" variables
        c = Corpus(test_corpus_dir)
        class_dict = {}

        # Iterate over emails with generator in Corpus
        for email in c.emails():
            # Declare probabilities - will be modified
            spam_probability = 0
            ham_probability = 0

            # Get word statistics of email - word frequency and word count
            word_stats = self.get_word_count_for_mail(email[1])
            word_freq = word_stats[0]
            word_count = word_stats[1]

            # Compute the spaminess of each known word
            spaminesses = []
            for word in word_freq:
                s = self.get_spaminnes_of_word(word)
                if s is not None:
                    spaminesses.append(s)

            # Calculate the two parts of the combining formula
            product = self.prod(spaminesses)
            one_without_spaminesses = self.one_without_spaminesses(spaminesses)

            denominator = product + one_without_spaminesses

            # We cannot divide by zero
            if denominator != 0:
                overall_spaminess = product / denominator
            else:
                overall_spaminess = 0

            # Final decision
            if overall_spaminess >= 0.5:
                class_dict[email[0]] = "SPAM"
            else:
                class_dict[email[0]] = "OK"

        # Creates !prediction.txt file
        utils.write_classification_to_file(
            test_corpus_dir + "/!prediction.txt", class_dict)
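The overall_spaminess above is the classic naive Bayes combining rule popularised by Paul Graham: P = (p1*...*pn) / (p1*...*pn + (1-p1)*...*(1-pn)). A self-contained equivalent of the prod / one_without_spaminesses helpers, written as an assumption about their behaviour:

    from math import prod

    def combined_spaminess(spaminesses):
        # P = (p1*...*pn) / (p1*...*pn + (1-p1)*...*(1-pn))
        p = prod(spaminesses)
        q = prod(1 - s for s in spaminesses)
        return p / (p + q) if (p + q) else 0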
Example #12
 def test_corpusContainsOnlyEmails(self):
     """Test reading the corpus with email messages only."""
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     with replaced_open():
         for fname, contents in corpus.emails():
             observed[fname] = contents
     # Verify the results
     self.assertEqual(
         len(self.expected), len(observed),
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
Example #13
 def test_corpusContainsAlsoSpecialFiles(self):
     """Test reading the corpus with special files."""
     # Add a special file into the corpus dir
     save_file_to_corpus_dir(
         fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     for fname, contents in corpus.emails():
         observed[fname] = contents
     # Verify the results
     self.assertEqual(len(self.expected), len(observed),
                      'The emails() method did not generate all the corpus files.')
     self.assertEqual(self.expected, observed,
                      'The read file contents are not equal to the expected contents.')
Example #14
 def train(self,
           file_path,
           batch_size=10,
           learning_rate=0.1,
           lr_decay=0.05,
           epochs=1000,
           momentum=0.0):
     corpus = Corpus(file_path)
     truth_dict = utils.read_classification_from_file(file_path +
                                                      "/!truth.txt")
     got_data = True
     mails_getter = corpus.emails()
     batches = []
     # loads all data from directory in batches of given size
     while got_data:
         batch = []
         # loads a batch of given size, a smaller one if out of data
         for i in range(batch_size):
             try:
                 email = next(mails_getter)
                 batch.append(
                     (email[1],
                      1 if truth_dict[email[0]] == self.pos_tag else 0))
             except StopIteration:
                 got_data = False
                 break
         batches.append(batch)
     for e in range(epochs):  # trains multiple times on all batches
         self.init_momentums()
         for batch in batches:  # performs gradient descent on each batch
             # gets the feature vectors of the batch
             feature_vectors = [m[0].get_feature_vector_plr() for m in batch]
             y = [m[1] for m in batch]  # gets the truth vector of the batch
             # weights for each subvector are trained separately
             for i in range(self.subvector_count):
                 # isolates one subvector from every feature vector
                 subvector_batch = [v[i] for v in feature_vectors]
                 self.gradient_descent(i, y, subvector_batch, learning_rate,
                                       momentum)
         print(f"trained on epoch #{e + 1}")
         learning_rate *= 1 / (1 + lr_decay * e)
Example #15
 def test_corpusContainsAlsoSpecialFiles(self):
     """Test reading the corpus with special files."""
     # Add a special file into the corpus dir
     save_file_to_corpus_dir(fname=SPECIAL_FILENAME,
                             contents='fake',
                             dirname=CORPUS_DIR)
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     with replaced_open():
         for fname, contents in corpus.emails():
             observed[fname] = contents
     # Verify the results
     self.assertEqual(
         len(self.expected), len(observed),
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
Example #16
    def train(self, file_path):
        self.content_spam_dict = {}
        self.content_ham_dict = {}
        class_dict = utils.read_classification_from_file(file_path +
                                                         '/!truth.txt')
        corpus = Corpus(file_path)
        email_generator = corpus.emails()
        content_counter_spam = Counter()
        content_counter_ham = Counter()
        content_wordcount_spam = 0
        content_wordcount_ham = 0

        spam_count = 0
        ham_count = 0
        every_word_content = set()

        for mail in email_generator:
            content_words = self.string_to_words(mail[1].content_no_html)
            content_counter = Counter(content_words)
            every_word_content.update(content_words)

            if class_dict[mail[0]] == self.pos_tag:
                spam_count += 1
                content_counter_spam += content_counter
                content_wordcount_spam += len(content_words)
            else:
                ham_count += 1
                content_counter_ham += content_counter
                content_wordcount_ham += len(content_words)

        # Add-one (Laplace) smoothing, then convert counts to relative frequencies
        for word in every_word_content:
            content_counter_ham[word] += 1
            content_counter_spam[word] += 1
            self.content_spam_dict[word] = (
                content_counter_spam[word] / content_wordcount_spam)
            self.content_ham_dict[word] = (
                content_counter_ham[word] / content_wordcount_ham)

        self.spam_probability = spam_count / (spam_count + ham_count)
        self.ham_probability = ham_count / (spam_count + ham_count)
        self.trained = True
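The snippet above only covers training; how the stored frequencies are applied is not shown. A hypothetical test-time counterpart, classifying in log space with the learned priors and per-word frequencies (the names mirror the snippet, the function itself is an assumption):

    from math import log

    def posterior_is_spam(words, spam_dict, ham_dict, p_spam, p_ham):
        # Log-space naive Bayes over the vocabulary seen during training.
        log_spam, log_ham = log(p_spam), log(p_ham)
        for w in words:
            if w in spam_dict:  # both dicts share the same smoothed vocabulary
                log_spam += log(spam_dict[w])
                log_ham += log(ham_dict[w])
        return log_spam > log_ham  # True -> classify as SPAM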
Example #17
 def test(self, path):
     """
     Tests given emails for being SPAM or HAM
     :param path: directory with emails
     """
     emails = Corpus(path)
     with open(path + "/!prediction.txt", 'w', encoding="utf-8") as f:
         self.alpha = self.calculate_alpha(emails)
         for filename, message in emails.emails():
             words = raw_email_to_list_of_words(message)
             spam_probability, spam_probability_overflow = self.calculate_email_probability(
                 words, True)   # score against the spam model
             ham_probability, ham_probability_overflow = self.calculate_email_probability(
                 words, False)  # score against the ham model
             if decision(spam_probability_overflow, spam_probability,
                         ham_probability_overflow, ham_probability):
                 f.write(filename + " SPAM\n")
                 self.pred_dict[filename] = "SPAM"
             else:
                 f.write(filename + " OK\n")
                 self.pred_dict[filename] = "OK"
Example #18
    def test(self, dir_path):
        no_tests_done = 0
        rather_positive = 0
        corpus = Corpus(dir_path)
        clasif = dict()

        for name, mail in corpus.emails():
            # Test strong filters
            result = self.test_strong_filters(name, mail)
            if result != -1:  # Strong filters were decisive
                clasif[name] = result
                continue  # Skip to the next iteration

            score = 0
            tests_done = 0

            # Test word filters (the call is made twice; this doubles both
            # score and tests_done, leaving their ratio unchanged)
            result = self.test_word_filters(name, mail)
            score += result[0]
            tests_done += result[1]

            result = self.test_word_filters(name, mail)
            score += result[0]
            tests_done += result[1]

            if tests_done == 0:
                no_tests_done += 1
                # print("No tests were done for " + name)
                clasif[name] = NEGATIVE
            elif score / tests_done > POSITIVITY_THRESHOLD:
                clasif[name] = POSITIVE
            else:
                if score / tests_done > 0.50:
                    rather_positive += 1
                clasif[name] = NEGATIVE

        utils.write_classification_to_file(clasif,
                                           dir_path + "/!prediction.txt")
Example #19
def set_truth(path):
    # Writes a !truth.txt that labels every email in the corpus as SPAM.
    # Note: the corpus directory is hard-coded rather than derived from `path`.
    with open(os.path.join(path, "!truth.txt"), 'wt') as f:
        a = Corpus('/Users/eygene/Desktop/spam-data-12-s75-h25/3')
        for name, body in a.emails():
            f.write(name + ' ' + 'SPAM' + '\n')