Example #1
 def test(self, test_corpus_dir):
     """
     Tests given emails for being SPAM or HAM
     :param test_corpus_dir: directory with emails
     """
     test_corpus = Corpus(test_corpus_dir)  # Corpus instance for walking emails
     # Load memory: works even without training; training makes it better
     self.load_from_memory()
     for name, msg in test_corpus.emails():  # file name and parsed email
         values = []
         for a in msg.keys():  # for all email header parts
             a = a.lower()
             if self.classification.get(a):
                 # Score of this part; values closer to 1 mean SPAM
                 values.append(self.get_stat(a, msg))
         values.append(self.get_stat_payload(msg))  # score of body/payload
         final_stat = sum(values) / len(values)  # average score
         if final_stat > 0.5:  # closer to SPAM
             self.predictions[name] = 'SPAM'
         else:  # closer to HAM
             self.predictions[name] = 'OK'
     # Save the created classification
     utils.write_classification_to_file(test_corpus_dir, self.predictions)
     # Save the memory in case something new was learned from the training data
     self.save_to_memory()
Example #2
 def test(self, dir_path):
     emails = self.get_email_files(dir_path)
     pred_dict = {}
     for mail in emails:
         pred_dict[mail] = self.analyse_email(emails[mail])
     write_classification_to_file(pred_dict,
                                  os.path.join(dir_path, "!prediction.txt"))
Example #3
 def test(self, directory):
     corp = Corpus(directory)
     result = {}
     if self.trained:
         for fname, body in corp.emails():
             spam_sum = 0
             ham_sum = 0
             examined = Email(fname, body)
             for spam in self.spams:
                 spam_sum += Email.compare_emails(examined, spam)
             for ham in self.hams:
                 ham_sum += Email.compare_emails(examined, ham)
             # Compare average similarity to known spam vs. known ham
             if spam_sum / len(self.spams) > ham_sum / len(self.hams):
                 result[fname] = 'SPAM'
             else:
                 result[fname] = 'OK'
     else:
         # Untrained fallback: count blacklisted words in the body
         for fname, body in corp.emails():
             counter = 0
             for word in self.blacklist:
                 if word in body:
                     counter += 1
             result[fname] = 'SPAM' if counter > 3 else 'OK'
     write_classification_to_file(
         os.path.join(directory, '!prediction.txt'), result)
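Example #3 scores each email by its summed compare_emails similarity to the stored spam and ham messages, but neither the Email class nor the metric is shown. A purely illustrative stand-in, assuming a Jaccard overlap of word sets (the real implementation may differ):

    class Email:
        def __init__(self, name, body):
            self.name = name
            self.words = set(body.lower().split())

        @staticmethod
        def compare_emails(first, second):
            # Hypothetical similarity: word-set overlap in [0, 1]
            union = first.words | second.words
            if not union:
                return 0.0
            return len(first.words & second.words) / len(union)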
Example #4
    def test(self, dir):
        EASING = 0.095
        SLICING = 38
        cls_dict = dict()
        file_name_with_data = dict()

        for filename in os.listdir(dir):
            if filename[0] == "!":
                continue
            # Use a context manager so every email file is closed again
            with open(dir + '/' + filename, 'r', encoding="utf8") as f:
                file_name_with_data[filename] = f.read()

        for file_name, email_content in file_name_with_data.items():
            a, b = 1.0, 1.0

            # Keep only the SLICING most "interesting" tokens: those whose
            # spamicity lies furthest from the neutral 0.5 (unknown words
            # default to 0.5)
            interesting = sorted(
                [(w, self.spamicity.get(w, 0.5))
                 for w in self.get_tokens(email_content)],
                key=lambda x: 0.5 - math.fabs(0.5 - x[1]))[:SLICING]
            for word, spamicity in interesting:
                a *= math.fabs(spamicity - EASING)
                b *= 1.0 - spamicity + EASING
            # a / (a + b) is a spam score in (0, 1); 0.5 is the decision boundary
            cls_dict[file_name] = "SPAM" if a / (a + b) >= 0.5 else "OK"
        utils.write_classification_to_file(cls_dict, dir + "/!prediction.txt")
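The self.spamicity table consulted above (with unknown tokens defaulting to the neutral 0.5) must be produced by a training pass that the snippet does not show. A hedged sketch of how such a table might be estimated from token counts, in the Graham style of relative frequencies (estimate_spamicity and its arguments are hypothetical names):

    def estimate_spamicity(spam_counts, ham_counts, n_spam, n_ham):
        # Hypothetical training step: P(spam | token) from the token's
        # relative frequency in the spam and ham corpora
        spamicity = {}
        for token in set(spam_counts) | set(ham_counts):
            s = spam_counts.get(token, 0) / max(n_spam, 1)
            h = ham_counts.get(token, 0) / max(n_ham, 1)
            if s + h > 0:
                spamicity[token] = s / (s + h)
        return spamicity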
Example #5
    def test(self, path_to_corpus_to_evaluate):
        # Random baseline: assign each file a coin-flip label
        files = os.listdir(path_to_corpus_to_evaluate)
        cls_dict = dict()
        type_list = ['SPAM', 'OK']

        for name in files:
            cls_dict[str(name)] = random.choice(type_list)
        utils.write_classification_to_file(
            cls_dict,
            os.path.join(path_to_corpus_to_evaluate, '!prediction.txt'))
Example #6
 def test(dir_path):
     # Baseline: label every email (skipping special '!' files) as OK
     result_dict = {}
     files = os.listdir(dir_path)
     for email in files:
         if email[0] != '!':
             result_dict[email] = "OK"
     utils.write_classification_to_file(
         os.path.join(dir_path, '!prediction.txt'), result_dict)
Example #7
 def test(dir_path):
     # Baseline: label every email (skipping special '!' files) at random
     result_dict = {}
     files = os.listdir(dir_path)
     values = ["SPAM", "OK"]
     for email in files:
         if email[0] != '!':
             result_dict[email] = random.choice(values)
     utils.write_classification_to_file(
         os.path.join(dir_path, '!prediction3.txt'), result_dict)
Example #8
    def test_correctlyFormattedDict(self):
        input = create_classification()
        save_classification_to_file(input, REFERENCENAME)

        with replaced_open():
            write_classification_to_file(input, FILENAME)

        # Validate results
        self.assertListEqual(list(io.open(REFERENCENAME)),
                             list(io.open(FILENAME)),
                             'Items in written files are not equal.')
Example #9
    def test_returnEmptyFile_forEmptyDict(self):
        input = dict()
        save_classification_to_file(input, REFERENCENAME)

        with replaced_open():
            write_classification_to_file(input, FILENAME)

        # Validate results
        self.assertListEqual(list(io.open(REFERENCENAME)),
                             list(io.open(FILENAME)),
                             'Items in written files are not equal.')
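Examples #8 and #9 compare write_classification_to_file against a reference writer; the create_classification and replaced_open fixtures are not shown. A hypothetical create_classification, given only to make the expected input shape concrete:

    import random

    def create_classification(n=20):
        # Hypothetical fixture: a {filename: label} dict shaped like the
        # predictions the filters above produce
        return {'%04d' % i: random.choice(['SPAM', 'OK']) for i in range(n)}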
Example #10
    def test(self, test_corpus_dir):
        '''
        Creates dict of classification and writes it to the file
        :param test_corpus_dir: path to test dir
        :return: None
        '''

        # Prepare "global" variables
        c = Corpus(test_corpus_dir)
        class_dict = {}

        # Iterate over emails with generator in Corpus
        for email in c.emails():
            # Get word statistics of email - word frequency and word count
            word_stats = self.get_word_count_for_mail(email[1])
            word_freq = word_stats[0]
            word_count = word_stats[1]

            # Compute spaminess of words
            spaminesses = []
            for word in word_freq:
                s = self.get_spaminnes_of_word(word)
                if s is not None:
                    spaminesses.append(s)

            # Calculate the parts needed for the Bayes combination:
            # the product of spaminesses and the product of their complements
            product = self.prod(spaminesses)
            one_without_spaminesses = self.one_without_spaminesses(spaminesses)

            lower = product + one_without_spaminesses

            # We cannot divide by zero
            if lower != 0:
                overall_spaminess = product / lower
            else:
                overall_spaminess = 0

            # Final decision
            if overall_spaminess >= 0.5:
                class_dict.update({email[0]: "SPAM"})
            else:
                class_dict.update({email[0]: "OK"})

        # Creates !prediction.txt file
        utils.write_classification_to_file(
            test_corpus_dir + "/!prediction.txt", class_dict)
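Example #10 applies the usual naive Bayes combination p = Πs / (Πs + Π(1 - s)) over the collected spaminesses; its prod and one_without_spaminesses helpers are not shown, but their roles follow from that formula. A sketch of what they presumably compute:

    def prod(self, values):
        # Product of all spaminess values: Π s_i
        result = 1.0
        for v in values:
            result *= v
        return result

    def one_without_spaminesses(self, values):
        # Product of the complements: Π (1 - s_i)
        result = 1.0
        for v in values:
            result *= 1.0 - v
        return result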
Example #11
    def test(self, test_dir):
        test_files = os.listdir(test_dir)
        for file in test_files:
            test_file_path = os.path.join(test_dir, file)
            mail = self.get_email(test_file_path)
            mail_words = self.get_email_message(mail)
            word_ratings = []
            """Setting word spam ratings"""
            for word in mail_words:
                if word in self.vocabulary:
                    word_ratings.append(
                        self.word_spaminess.get(word,
                                                self.init_spam_likelihood))
                else:
                    word_ratings.append(self.init_spam_likelihood)
            """Paul Graham - A Plan for Spam method."""
            if len(word_ratings) == 0:
                self.test_files_result_dict[file] = self.decision_table[1]
                continue
            elif len(word_ratings) >= 20:
                """To avoid rounding to zero"""
                word_ratings.sort()
                word_ratings = word_ratings[:10] + word_ratings[-10:]
            """Combining individual probabilities of that the message containing a spam word"""
            """I'm assuming that the words present in the message are independent events. 
            So that's why I'm multiplying all the word ratings."""
            """Product of all word_spaminess in the message."""
            spam_rating_product = reduce(lambda x, y: x * y, word_ratings)
            """Product of all word_haminess in the message."""
            ham_rating_product = reduce(lambda x, y: x * y,
                                        map(lambda x: 1.0 - x, word_ratings))
            result = spam_rating_product / (spam_rating_product +
                                            ham_rating_product)
            """After the email's spam probability is computed over all words in the email, 
            and if the total exceeds a certain threshold, the filter will mark the email as a spam."""
            if result >= 0.95:
                self.test_files_result_dict[file] = self.decision_table[1]
            else:
                self.test_files_result_dict[file] = self.decision_table[0]

        write_classification_to_file(self.test_files_result_dict,
                                     test_dir + '/!prediction.txt')
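Example #11 multiplies raw probabilities directly and truncates the rating list to the 10 lowest and 10 highest values, partly to keep the products from rounding to zero. An alternative sketch (not from the source) computes the same p = Πs / (Πs + Π(1 - s)) in log space, which stays numerically stable without truncation:

    import math

    def combined_spam_probability(word_ratings, eps=1e-12):
        # Sum logs instead of multiplying probabilities
        log_spam = sum(math.log(max(r, eps)) for r in word_ratings)
        log_ham = sum(math.log(max(1.0 - r, eps)) for r in word_ratings)
        # p = S / (S + H) = 1 / (1 + exp(log H - log S))
        diff = log_ham - log_spam
        if diff > 700:  # exp() would overflow; probability is effectively 0
            return 0.0
        return 1.0 / (1.0 + math.exp(diff))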
Example #12
    def test(self, dir_path):
        no_tests_done = 0
        rather_positive = 0
        corpus = Corpus(dir_path)
        clasif = dict()

        for name, mail in corpus.emails():
            # Test strong filters
            result = self.test_strong_filters(name, mail)
            if result != -1:  # Strong filters were decisive
                clasif[name] = result
                continue  # Skip to the next iteration

            score = 0
            tests_done = 0

            # Test word filters
            result = self.test_word_filters(name, mail)
            score += result[0]
            tests_done += result[1]

            if tests_done == 0:
                no_tests_done += 1
                # print("No tests were done for " + name)
                clasif[name] = NEGATIVE
            elif score / tests_done > POSITIVITY_THRESHOLD:
                clasif[name] = POSITIVE
            else:
                if score / tests_done > 0.50:
                    rather_positive += 1
                clasif[name] = NEGATIVE

        utils.write_classification_to_file(clasif,
                                           dir_path + "/!prediction.txt")
Example #13
 def test(self, path_to_corpus_to_evaluate):
     # Baseline: label everything as OK
     files = os.listdir(path_to_corpus_to_evaluate)
     cls_dict = dict()
     for name in files:
         cls_dict[str(name)] = 'OK'
     utils.write_classification_to_file(
         cls_dict, os.path.join(path_to_corpus_to_evaluate, '!prediction.txt'))
Example #14
 def test(self, prediction_corpus_path):
     write_classification_to_file(
         self.dictionary, prediction_corpus_path + '/!prediction.txt')