コード例 #1
0
ファイル: filter.py プロジェクト: skalahonza/SpamFilter
 def test(self, directory):
     """Classify every email in ``directory`` and write ``!prediction.txt``.

     If the filter is trained, an email is labelled 'SPAM' when its mean
     ``Email.compare_emails`` score against the stored spam samples is
     strictly greater than its mean score against the stored ham samples.
     If it is not trained, the blacklist fallback is used: an email is
     'SPAM' when more than three blacklisted words occur in its body.

     :param directory: corpus directory; the prediction file is written
         into this same directory
     """

     def mean_score(email, samples):
         # An empty sample list used to raise ZeroDivisionError; with no
         # samples there is no evidence, so the mean is taken as 0.
         if not samples:
             return 0
         total = sum(Email.compare_emails(email, sample) for sample in samples)
         return total / len(samples)

     corp = Corpus(directory)
     result = {}
     trained = self.trained
     for fname, body in corp.emails():
         if trained:
             examined = Email(fname, body)
             spam_mean = mean_score(examined, self.spams)
             ham_mean = mean_score(examined, self.hams)
             is_spam = spam_mean > ham_mean
         else:
             hits = sum(1 for word in self.blacklist if word in body)
             is_spam = hits > 3
         result[fname] = 'SPAM' if is_spam else 'OK'

     # Both classification strategies share one output step.
     write_classification_to_file(
         os.path.join(directory, '!prediction.txt'), result)
コード例 #2
0
 def test(self, test_corpus_dir):
     """
     Label every email in the given corpus as 'SPAM' or 'OK'.

     For each message, a score is collected for every lower-cased header
     name that has a truthy entry in ``self.classification``, plus one
     score for the payload.  The message is 'SPAM' when the average of
     these scores is above 0.5.
     :param test_corpus_dir: directory with emails
     """
     # Load the stored state first (presumably so an untrained filter can
     # still classify -- see load_from_memory).
     self.load_from_memory()
     test_corpus = Corpus(test_corpus_dir)
     for name, msg in test_corpus.emails():
         lowered_headers = (header.lower() for header in msg.keys())
         values = [
             self.get_stat(header, msg) for header in lowered_headers
             if self.classification.get(header)
         ]
         # The payload score is always included, so ``values`` is never empty.
         values.append(self.get_stat_payload(msg))
         final_stat = sum(values) / len(values)
         # Scores above 0.5 count as SPAM, everything else as OK.
         self.predictions[name] = 'SPAM' if final_stat > 0.5 else 'OK'
     # Save the created classification.
     utils.write_classification_to_file(test_corpus_dir, self.predictions)
     # Store the state again after classifying.
     self.save_to_memory()
コード例 #3
0
    def test(self, path):
        """Tag every email under ``path`` as 'SPAM' or 'OK'.

        An email whose sender is in the pickled sender blacklist is tagged
        'SPAM' immediately.  Otherwise each distinct token of the body is
        scored with ``Bayesian.word_spamicity``, the 15 tokens whose score
        is farthest from 0.5 are combined with ``Bayesian.bayes_pred``, and
        the email is 'SPAM' when that prediction exceeds 0.9.

        :param path: corpus directory, passed on to ``Corpus`` and
            ``self.tag_it``
        """
        corp = Corpus(path)
        bs = Bayesian()
        sender_bl = load_pickle('sender_bl.pickle')
        for fname, body in corp.emails():
            # A blacklisted sender is decisive; skip the Bayesian scoring.
            if find_sender(body) in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            # De-duplicate tokens *before* scoring so each word is scored
            # once (assumes word_spamicity is a pure function of the token).
            spamicity_list = [
                [token, bs.word_spamicity(token)]
                for token in set(tokenize(body))
            ]
            # Most decisive words first: largest distance from neutral 0.5.
            spamicity_list.sort(key=lambda pair: abs(0.5 - pair[1]),
                                reverse=True)
            prediction = bs.bayes_pred(spamicity_list[:15])  # top 15 only
            # The sender cannot be blacklisted here (handled above), so the
            # prediction alone decides.
            self.tag_it(path, fname, 'SPAM' if prediction > 0.9 else 'OK')
コード例 #4
0
    def test(self, path):
        """Mark emails containing at least one "bad word" as SPAM.

        Dots in each body are replaced by spaces before the substring
        search.  Emails with a hit are stored in ``self.final_dict`` as
        'SPAM', the rest as 'OK'.  Then one ``name label`` line is written
        to ``!prediction.txt`` inside ``path`` for every directory entry
        whose name does not start with '!'.

        :param path: corpus directory to classify
        :raises KeyError: if a listed entry has no classification in
            ``self.final_dict``
        """
        corpus = Corpus(path)
        dots_to_spaces = str.maketrans('.', ' ')
        for fname, body in corpus.emails():
            body = body.translate(dots_to_spaces)
            found = any(word in body for word in self.bad_words)
            self.final_dict[fname] = 'SPAM' if found else 'OK'

        names = os.listdir(path)
        # Write via an explicit path instead of os.chdir(): changing the
        # working directory is a process-wide side effect and was not
        # restored if writing failed.  ``with`` also guarantees the close.
        prediction_path = os.path.join(path, '!prediction.txt')
        with open(prediction_path, 'w', encoding='utf-8') as fd:
            for name in names:
                if not name.startswith('!'):
                    fd.write(name + ' ' + self.final_dict[name] + '\n')
コード例 #5
0
ファイル: quality.py プロジェクト: Scytheroid/spam-filter
def test_atom_filter(initialized_filter, train_dir, test_dir):
    """Train a single filter, run it on a test corpus and return its score.

    Emails for which the filter returns -1 are left out of the prediction.
    The others are POSITIVE when the result exceeds POSITIVITY_THRESHOLD
    and NEGATIVE otherwise.  The prediction is compared with the truth
    file of ``test_dir`` through a BinaryConfusionMatrix, and the
    resulting counts are passed to ``quality_score``.
    """
    training_corpus = TrainingCorpus(train_dir)
    evaluation_corpus = Corpus(test_dir)

    atom_filter = initialized_filter
    atom_filter.train(training_corpus)

    prediction = {}
    for name, mail in evaluation_corpus.emails():
        verdict = atom_filter.test(mail)
        if verdict == -1:
            # -1 means "no verdict"; leave this email out.
            continue
        if verdict > POSITIVITY_THRESHOLD:
            prediction[name] = POSITIVE
        else:
            prediction[name] = NEGATIVE

    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)

    counts = conf_matrix.as_dict()
    # For testing purposes
    print(counts)

    return quality_score(counts['tp'], counts['tn'], counts['fp'],
                         counts['fn'])
コード例 #6
0
    def test(self, path):
        """Tag each email in the corpus at ``path`` as 'SPAM' or 'OK'.

        The sender is checked against the pickled sender blacklist first.
        If it is not listed, the spamicity of the body's words is combined
        with the Bayes approach and compared with 0.9.
        """
        corpus = Corpus(path)
        bayes = Bayesian()
        scored_emails = 0
        blacklist = load_pickle('sender_bl.pickle')
        for fname, body in corpus.emails():
            sender = find_sender(body)
            if sender in blacklist:
                # Blacklisted sender: no need to look at the words.
                self.tag_it(path, fname, 'SPAM')
                continue

            scored_emails += 1
            # (word, spamicity) pair for every token of the body
            scored = [(token, bayes.word_spamicity(token))
                      for token in tokenize(body)]
            # Drop duplicate pairs, then put the most decisive words first.
            unique = [list(pair) for pair in set(scored)]
            unique.sort(key=lambda pair: abs(0.5 - pair[1]), reverse=True)
            # Consider only 15 'words'
            prediction = bayes.bayes_pred(unique[:15])
            verdict = ('SPAM'
                       if prediction > 0.9 or sender in blacklist else 'OK')
            self.tag_it(path, fname, verdict)
コード例 #7
0
    def train(
        self,
        file_path,
        batch_size=10,
        learning_rate=0.1,
        lr_decay=0.05,
        epochs=1000,
        momentum=0.0,
        tuning=False
    ):
        """Train on the labelled corpus in ``file_path`` with mini-batches.

        Emails are read once and grouped into batches of ``batch_size``
        (the last batch may be smaller).  Each epoch re-initialises the
        momentums, runs ``self.gradient_descent`` on every batch, and then
        multiplies the learning rate by ``1 / (1 + lr_decay * e)``.

        :param file_path: corpus directory containing ``!truth.txt``
        :param batch_size: number of emails per batch
        :param learning_rate: initial learning rate
        :param lr_decay: decay factor applied after each epoch
        :param epochs: number of passes over all batches
        :param momentum: passed through to ``self.gradient_descent``
        :param tuning: when true, plot the mean loss per epoch at the end
        """
        if tuning:
            og_lr = learning_rate  # original learning rate, for the title
            x_plt = []  # x-axis (epochs)
            y_plt = []  # y-axis (mean loss)
        corpus = Corpus(file_path)
        truth_dict = utils.read_classification_from_file(file_path +
                                                         "/!truth.txt")

        # Build (email, label) batches.  Only non-empty batches are kept:
        # the previous loop appended an empty trailing batch whenever the
        # email count was a multiple of batch_size (or the corpus was empty).
        batches = []
        batch = []
        for email in corpus.emails():
            label = 1 if truth_dict[email[0]] == self.pos_tag else 0
            batch.append((email[1], label))
            if len(batch) == batch_size:
                batches.append(batch)
                batch = []
        if batch:
            batches.append(batch)

        for e in range(epochs):
            print(learning_rate)
            self.init_momentums()
            loss = 0
            for batch in batches:
                batch_vectors = [m[0].get_feature_vector_lr() for m in batch]
                y = [m[1] for m in batch]
                loss += self.gradient_descent(y, batch_vectors, learning_rate,
                                              momentum)
                print(f"trained on epoch #{e +1}")
            learning_rate *= 1 / (1 + lr_decay * e)
            if tuning:
                # One gradient step per batch; guard the empty-corpus case.
                y_plt.append(loss / len(batches) if batches else 0)
                x_plt.append(e)
        if tuning:
            plt.plot(x_plt, y_plt)
            plt.title(
                f"lr:{og_lr} lrd:{lr_decay} bs:{batch_size} m: {momentum} e:{epochs}"
            )
            plt.xlabel("epochs")
            plt.ylabel("mean loss")
            plt.show()
コード例 #8
0
 def test(self, mails_path):
     """Classify every email in ``mails_path`` and write ``!prediction.txt``.

     Any existing prediction file is removed first.  Then one line
     ``<name> <tag>`` is written per email, where the tag is
     ``self.pos_tag`` if ``self.evaluate_mail`` returns a truthy value
     and ``self.neg_tag`` otherwise.

     :param mails_path: corpus directory to classify
     """
     prediction_path = mails_path + "/!prediction.txt"
     try:
         os.remove(prediction_path)
     except OSError:
         # Best effort: usually the file simply does not exist yet.  Only
         # OS-level failures are ignored, not KeyboardInterrupt etc.
         pass
     corpus = Corpus(mails_path)
     with open(prediction_path, 'a', encoding='utf-8') as f:
         for name, mail in corpus.emails():
             res = self.evaluate_mail(mail)
             f.write(f"{name} {self.pos_tag if res else self.neg_tag}\n")
コード例 #9
0
 def __init__(self, folder):
     """Load all emails from ``folder`` and split them into hams and spams.

     Every file yielded by the corpus is wrapped in an ``Email``.  It goes
     to ``self.hams`` when ``self.is_ham(fname)`` is true, otherwise to
     ``self.spams``.
     """
     self.folder = folder
     self.spams = []
     self.hams = []
     for fname, content in Corpus(folder).emails():
         target = self.hams if self.is_ham(fname) else self.spams
         target.append(Email(fname, content))
コード例 #10
0
 def test_corpusContainsOnlyEmails(self):
     """Reading a corpus that holds only emails yields every file."""
     # Exercise the SUT: collect everything emails() generates.
     observed = {
         fname: contents
         for fname, contents in Corpus(CORPUS_DIR).emails()
     }
     # Verify the results: first the count, then the exact contents.
     expected_count = len(self.expected)
     observed_count = len(observed)
     self.assertEqual(
         expected_count, observed_count,
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
コード例 #11
0
ファイル: filter.py プロジェクト: EugeneEugene/SpamFilter
    def test(self, test_corpus_dir):
        """Classify every email and write ``!prediction.txt`` into the corpus.

        An email is "SPAM" when ``self.bayesian_combination`` of its body
        exceeds 0.9 or when its sender address is in ``self.black_list``.
        Every other email is "OK".

        :param test_corpus_dir: corpus directory to classify
        """
        test_corpus = Corpus(test_corpus_dir)
        prediction_path = os.path.join(test_corpus_dir, "!prediction.txt")
        # The file is only written, so plain "w" is sufficient.
        with open(prediction_path, "w") as a_file:
            for filename, body in test_corpus.emails():
                is_spam = (
                    self.bayesian_combination(body) > 0.9
                    or self.get_email_adress(body) in self.black_list)
                # NOTE(review): the previous code also looked the sender up
                # in self.white_list, but both outcomes produced "OK", so
                # the lookup had no effect and was dropped.  If white-listed
                # senders should override a SPAM verdict, that check has to
                # come before the spam test -- confirm the intent.
                decision = "SPAM" if is_spam else "OK"
                a_file.write(filename + " " + decision + "\n")
コード例 #12
0
ファイル: filter.py プロジェクト: pospisil98/CTUcodes
    def test(self, test_corpus_dir):
        '''
        Creates dict of classification and writes it to the file.

        For every email, the spaminess of each word is collected (words
        for which ``get_spaminnes_of_word`` returns None are skipped).
        The overall spaminess is ``p / (p + q)``, where ``p`` is
        ``self.prod(spaminesses)`` and ``q`` is
        ``self.one_without_spaminesses(spaminesses)`` (presumably the
        product of the complements -- confirm against that helper).
        An email is "SPAM" when the overall spaminess is at least 0.5.

        :param test_corpus_dir: path to test dir
        :return: None
        '''
        corpus = Corpus(test_corpus_dir)
        class_dict = {}

        # Iterate over emails with the generator in Corpus
        for email in corpus.emails():
            # Only the word-frequency part of the statistics is needed.
            word_freq = self.get_word_count_for_mail(email[1])[0]

            # Spaminess of every word for which a value is available
            spaminesses = []
            for word in word_freq:
                s = self.get_spaminnes_of_word(word)
                if s is not None:
                    spaminesses.append(s)

            product = self.prod(spaminesses)
            denominator = product + self.one_without_spaminesses(spaminesses)

            # A zero denominator cannot be divided by; treat it as "not spam".
            if denominator != 0:
                overall_spaminess = product / denominator
            else:
                overall_spaminess = 0

            # Final decision
            class_dict[email[0]] = "SPAM" if overall_spaminess >= 0.5 else "OK"

        # Creates !prediction.txt file
        utils.write_classification_to_file(
            test_corpus_dir + "/!prediction.txt", class_dict)
コード例 #13
0
ファイル: test_corpus.py プロジェクト: skalahonza/SpamFilter
 def test_corpusContainsOnlyEmails(self):
     """A corpus holding only emails must yield every one of its files."""
     reader = Corpus(CORPUS_DIR)
     # Exercise the SUT while open() is replaced.
     with replaced_open():
         observed = dict(
             (fname, contents) for fname, contents in reader.emails())
     # Verify the results: number of files first, then their contents.
     n_expected, n_observed = len(self.expected), len(observed)
     self.assertEqual(
         n_expected, n_observed,
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
コード例 #14
0
 def test_corpusContainsAlsoSpecialFiles(self):
     """A special file in the corpus dir must not show up in emails()."""
     # Put one special file next to the regular emails.
     save_file_to_corpus_dir(
         fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     for entry in Corpus(CORPUS_DIR).emails():
         fname, contents = entry
         observed[fname] = contents
     # Verify the results against the expected (email-only) mapping.
     self.assertEqual(
         len(self.expected), len(observed),
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
コード例 #15
0
 def train(self,
           file_path,
           batch_size=10,
           learning_rate=0.1,
           lr_decay=0.05,
           epochs=1000,
           momentum=0.0):
     """Train the per-subvector weights on the labelled corpus.

     Emails are read once and grouped into batches of ``batch_size`` (the
     last batch may be smaller).  Each epoch re-initialises the momentums
     and, for every batch, runs ``self.gradient_descent`` separately for
     each of the ``self.subvector_count`` feature subvectors.  After each
     epoch the learning rate is multiplied by ``1 / (1 + lr_decay * e)``.

     :param file_path: corpus directory containing ``!truth.txt``
     :param batch_size: number of emails per batch
     :param learning_rate: initial learning rate
     :param lr_decay: decay factor applied after each epoch
     :param epochs: number of passes over all batches
     :param momentum: passed through to ``self.gradient_descent``
     """
     corpus = Corpus(file_path)
     truth_dict = utils.read_classification_from_file(file_path +
                                                      "/!truth.txt")

     # Build (email, label) batches.  Only non-empty batches are kept: the
     # previous loop appended an empty trailing batch whenever the email
     # count was a multiple of batch_size (or the corpus was empty).
     batches = []
     batch = []
     for email in corpus.emails():
         label = 1 if truth_dict[email[0]] == self.pos_tag else 0
         batch.append((email[1], label))
         if len(batch) == batch_size:
             batches.append(batch)
             batch = []
     if batch:
         batches.append(batch)

     for e in range(epochs):  # trains multiple times on all batches
         self.init_momentums()
         for batch in batches:  # performs gradient descent on each batch
             feature_vectors = [m[0].get_feature_vector_plr() for m in batch]
             y = [m[1] for m in batch]  # truth vector of the batch
             # weights for each subvector are trained separately
             for i in range(self.subvector_count):
                 # isolates the i-th subvector from all feature vectors
                 subvector_batch = [v[i] for v in feature_vectors]
                 self.gradient_descent(i, y, subvector_batch, learning_rate,
                                       momentum)
             print(f"trained on epoch #{e +1}")
         learning_rate *= 1 / (1 + lr_decay * e)
コード例 #16
0
ファイル: test_corpus.py プロジェクト: skalahonza/SpamFilter
 def test_corpusContainsAlsoSpecialFiles(self):
     """Special files in the corpus dir must be skipped by emails()."""
     # Add a special file into the corpus dir
     save_file_to_corpus_dir(
         fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)
     reader = Corpus(CORPUS_DIR)
     # Exercise the SUT while open() is replaced.
     with replaced_open():
         observed = {
             fname: contents
             for fname, contents in reader.emails()
         }
     # Verify the results: number of files first, then their contents.
     n_expected, n_observed = len(self.expected), len(observed)
     self.assertEqual(
         n_expected, n_observed,
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
コード例 #17
0
    def train(self, file_path):
        """Estimate per-word likelihoods and class priors from a corpus.

        For every word seen in any email, ``self.content_spam_dict`` and
        ``self.content_ham_dict`` receive its add-one smoothed relative
        frequency in spam and in ham respectively.
        ``self.spam_probability`` and ``self.ham_probability`` receive the
        share of spam and ham emails.  ``self.trained`` is set afterwards.

        :param file_path: corpus directory containing ``!truth.txt``
        """
        self.content_spam_dict = {}
        self.content_ham_dict = {}
        class_dict = utils.read_classification_from_file(file_path +
                                                         '/!truth.txt')
        corpus = Corpus(file_path)
        content_counter_spam = Counter()
        content_counter_ham = Counter()
        content_wordcount_spam = 0
        content_wordcount_ham = 0

        spam_count = 0
        ham_count = 0
        every_word_content = set()

        for mail in corpus.emails():
            content_words = self.string_to_words(mail[1].content_no_html)
            every_word_content.update(content_words)

            if class_dict[mail[0]] == self.pos_tag:
                spam_count += 1
                content_counter_spam.update(content_words)
                content_wordcount_spam += len(content_words)
            else:
                ham_count += 1
                content_counter_ham.update(content_words)
                content_wordcount_ham += len(content_words)

        # Add-one (Laplace) smoothing: each word count is increased by one,
        # so the denominator must grow by the vocabulary size.  Without it
        # the likelihoods did not sum to 1, and a class with no words
        # caused a ZeroDivisionError.
        vocabulary_size = len(every_word_content)
        spam_denominator = content_wordcount_spam + vocabulary_size
        ham_denominator = content_wordcount_ham + vocabulary_size
        for word in every_word_content:
            self.content_spam_dict[word] = (
                (content_counter_spam[word] + 1) / spam_denominator)
            self.content_ham_dict[word] = (
                (content_counter_ham[word] + 1) / ham_denominator)

        total = spam_count + ham_count
        if total:
            self.spam_probability = spam_count / total
            self.ham_probability = ham_count / total
        else:
            # Empty corpus: no evidence either way, use uninformative priors
            # instead of dividing by zero.
            self.spam_probability = 0.5
            self.ham_probability = 0.5
        self.trained = True
コード例 #18
0
 def test(self, path):
     """
     Classify the emails in ``path`` as SPAM or OK.

     Each verdict is written as ``<filename> <label>`` to
     ``!prediction.txt`` in the same directory and also stored in
     ``self.pred_dict``.
     :param path: directory with emails
     """
     corpus = Corpus(path)
     with open(path + "/!prediction.txt", 'w', encoding="utf-8") as out:
         self.alpha = self.calculate_alpha(corpus)
         for filename, message in corpus.emails():
             words = raw_email_to_list_of_words(message)
             # Probability (plus its overflow value) under each hypothesis:
             # True requests the spam variant, False the ham variant.
             spam_p, spam_overflow = self.calculate_email_probability(
                 words, True)
             ham_p, ham_overflow = self.calculate_email_probability(
                 words, False)
             is_spam = decision(spam_overflow, spam_p, ham_overflow, ham_p)
             label = "SPAM" if is_spam else "OK"
             out.write(filename + " " + label + "\n")
             self.pred_dict[filename] = label
コード例 #19
0
ファイル: filter.py プロジェクト: Scytheroid/spam-filter
    def test(self, dir_path):
        """Classify every email in ``dir_path`` and write ``!prediction.txt``.

        The strong filters run first; any result other than -1 is taken as
        the final classification.  Otherwise the scores and test counts of
        the remaining filter passes are added up, and the email is POSITIVE
        when ``score / tests_done`` exceeds POSITIVITY_THRESHOLD.  It is
        NEGATIVE otherwise, including when no test was done.

        :param dir_path: corpus directory to classify
        """
        corpus = Corpus(dir_path)
        clasif = dict()

        for name, mail in corpus.emails():
            # Test strong filters
            result = self.test_strong_filters(name, mail)
            if result != -1:  # Strong filters were decisive
                clasif[name] = result
                continue

            score = 0
            tests_done = 0

            # NOTE(review): the first pass is labelled "normal filters" in
            # the original but calls test_word_filters as well, so the word
            # filters are counted twice.  Kept as is -- confirm whether a
            # separate normal-filter method was intended.
            result = self.test_word_filters(name, mail)
            score += result[0]
            tests_done += result[1]

            # Test word filters
            result = self.test_word_filters(name, mail)
            score += result[0]
            tests_done += result[1]

            # No tests done means no evidence: classify as NEGATIVE.
            if tests_done != 0 and score / tests_done > POSITIVITY_THRESHOLD:
                clasif[name] = POSITIVE
            else:
                clasif[name] = NEGATIVE

        utils.write_classification_to_file(clasif,
                                           dir_path + "/!prediction.txt")
コード例 #20
0
def set_truth(path, corpus_dir='/Users/eygene/Desktop/spam-data-12-s75-h25/3'):
    """Write a ``!truth.txt`` that labels every email of a corpus as SPAM.

    :param path: directory in which ``!truth.txt`` is created
    :param corpus_dir: directory whose emails are listed; defaults to the
        previously hard-coded location so existing callers are unaffected
    """
    # ``with`` closes the file; it was previously left open.
    with open(os.path.join(path, "!truth.txt"), 'wt') as f:
        corpus = Corpus(corpus_dir)
        for name, _body in corpus.emails():
            f.write(name + ' ' + 'SPAM' + '\n')