Exemple #1
0
def test_atom_filter(initialized_filter, train_dir, test_dir):
    """Train a filter on one corpus and score its predictions on another.

    The filter is trained on ``TrainingCorpus(train_dir)`` and then asked to
    classify every e-mail yielded by ``Corpus(test_dir).emails()``.  A result
    of ``-1`` leaves the e-mail out of the predictions, a result above
    ``POSITIVITY_THRESHOLD`` is recorded as ``POSITIVE`` and anything else as
    ``NEGATIVE``.  The predictions are compared with the classification read
    from the ``TRUTHFILE`` inside ``test_dir``.

    :param initialized_filter: object providing ``train(corpus)`` and
        ``test(mail)``
    :param train_dir: directory passed to ``TrainingCorpus``
    :param test_dir: directory passed to ``Corpus``; also holds the truth file
    :return: the value of ``quality_score`` for the tp/tn/fp/fn counts
    """
    training_set = TrainingCorpus(train_dir)
    evaluation_set = Corpus(test_dir)

    spam_filter = initialized_filter
    spam_filter.train(training_set)

    predicted = {}
    for mail_name, mail in evaluation_set.emails():
        verdict = spam_filter.test(mail)
        if verdict == -1:
            # The filter gave no usable answer for this e-mail; skip it.
            continue
        predicted[mail_name] = (
            POSITIVE if verdict > POSITIVITY_THRESHOLD else NEGATIVE
        )

    expected = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    confusion = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    confusion.compute_from_dicts(expected, predicted)

    counts = confusion.as_dict()
    # For testing purposes
    print(counts)

    return quality_score(counts['tp'], counts['tn'], counts['fp'], counts['fn'])
Exemple #2
0
    def train(self, train_corpus_dir):
        '''
        Collect word statistics and spam/ham totals from a training corpus.

        Stores the results on the instance as ``spam_word_count_dict``,
        ``ham_word_count_dict``, ``spam_word_count_avg``,
        ``ham_word_count_avg``, ``spam_count`` and ``ham_count``.

        :param train_corpus_dir: path to train dir
        :return: None
        '''
        corpus = TrainingCorpus(train_corpus_dir)
        # The result of this first call is discarded; the totals are read
        # from a second call at the end of the method.
        corpus.return_spam_ham_count()

        # Word frequencies: index 0 is stored as the word-count dict,
        # index 1 as the average.
        spam_stats = corpus.get_spam_word_count_dict_and_avg()
        ham_stats = corpus.get_ham_word_count_dict_and_avg()

        self.spam_word_count_dict, self.ham_word_count_dict = (
            spam_stats[0], ham_stats[0])
        self.spam_word_count_avg, self.ham_word_count_avg = (
            spam_stats[1], ham_stats[1])

        # Words present in both dictionaries are kept as they are; no
        # intersection is removed.

        # Total number of spam and ham e-mails reported by the corpus.
        totals = corpus.return_spam_ham_count()
        self.spam_count, self.ham_count = totals[0], totals[1]
 def setUp(self):
     """Create a fake corpus directory plus its truth file and load it.

     The generated e-mails are kept in ``self.email_dict``, their
     classification in ``self.true_class``, and the corpus object built
     from ``CORPUS_DIR`` in ``self.tc``.
     """
     corpus_emails = create_corpus_dictionary()
     classification = create_classification_for(corpus_emails.keys())
     self.email_dict = corpus_emails
     self.true_class = classification

     # Write the e-mails first, then the truth file next to them.
     create_corpus_dir_from_dictionary(corpus_emails)
     save_classification_to_file(
         classification,
         fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME),
     )

     # The corpus is constructed while replaced_open() is active.
     with replaced_open():
         self.tc = TrainingCorpus(CORPUS_DIR)
Exemple #4
0
    def train(self, path):
        """Fill ``self.bad_words`` with words that are likely to indicate spam.

        A word taken from the most common spam words is appended to
        ``self.bad_words`` when either

        * it does not occur anywhere (as a substring) in the concatenated
          ham bodies, or
        * it is longer than ``CHECKED_WORD_LEN`` characters, is also among
          the most common ham words, and its spam frequency exceeds
          ``FACTOR`` times its ham frequency.

        ``self.bad_words`` must already exist; this method only appends to it.

        :param path: directory handed to ``TrainingCorpus``
        :return: None
        """
        # These constants worked the best.
        MOST_COMMON_S = 600
        MOST_COMMON_H = 5000
        CHECKED_WORD_LEN = 12
        FACTOR = 20

        # Built once instead of once per e-mail.
        dot_to_space = str.maketrans('.', ' ')

        def split_words(body):
            # Dots become spaces, text is lower-cased, then split on single
            # spaces (so empty strings may appear among the words).
            return body.translate(dot_to_space).lower().split(' ')

        tc = TrainingCorpus(path)

        # Hams: collect the raw bodies for the substring test and count the
        # words incrementally rather than re-copying one ever-growing list.
        ham_bodies = []
        counter_ham = Counter()
        for _fname, body in tc.hams():
            ham_bodies.append(body)
            counter_ham.update(split_words(body))
        # Bodies are joined without a separator, exactly as repeated ``+=``
        # concatenation would produce.
        ham_string = ''.join(ham_bodies)
        ham_words_dict = dict(counter_ham.most_common(MOST_COMMON_H))

        # Spams: only the word frequencies are needed.
        counter_spam = Counter()
        for _fname, body in tc.spams():
            counter_spam.update(split_words(body))
        spam_words_dict = dict(counter_spam.most_common(MOST_COMMON_S))

        # Build the bad_words list, most common spam words first.
        for word, spam_freq in spam_words_dict.items():
            if word not in ham_string:
                self.bad_words.append(word)
            elif (len(word) > CHECKED_WORD_LEN
                  and word in ham_words_dict
                  and spam_freq > ham_words_dict[word] * FACTOR):
                self.bad_words.append(word)
Exemple #5
0
 def train(self, email_adress):
     """Compute word statistics for the corpus at *email_adress*.

     Results are written to the module-level globals ``all_words``,
     ``spam_words``, ``count_spams``, ``count_emails`` and
     ``probability_spam``; nothing is stored on the instance or returned.

     NOTE(review): a fresh ``TrainingCorpus`` is constructed for every
     step; whether a single instance could be shared is not visible here.

     :param email_adress: value passed to ``TrainingCorpus`` and to its
         ``count_spams`` / ``count_emails`` functions
     """
     global all_words, spam_words, probability_spam, count_spams, count_emails

     ham_mails = TrainingCorpus(email_adress).hams()
     spam_mails = TrainingCorpus(email_adress).spams()
     ham_tokens = TrainingCorpus(email_adress).get_words(ham_mails)
     spam_tokens = TrainingCorpus(email_adress).get_words(spam_mails)

     # all words with their count
     all_words = TrainingCorpus(email_adress).all_words(ham_tokens, spam_tokens)
     # spam words with their count
     spam_words = TrainingCorpus(email_adress).spam_words(spam_tokens)

     # count of all spam e-mails
     count_spams = TrainingCorpus.count_spams(email_adress)
     # count of all e-mails
     count_emails = TrainingCorpus.count_emails(email_adress)

     # probability that an e-mail is spam
     probability_spam = count_spams / count_emails
Exemple #6
0
    def train(self, path):
        """
        Gather spam/ham word statistics from the e-mails stored in a directory.

        Sets ``truth_dict``, ``portion_of_spam_emails``, ``all_words`` and
        the ``num_of_spam_words`` / ``num_of_ham_words`` /
        ``num_of_all_words`` counters on the instance.

        :param path: directory with emails (must contain ``!truth.txt``)
        """
        self.truth_dict = read_classification_from_file(path + "/!truth.txt")
        corpus = TrainingCorpus(path)

        # The boolean flag is True for the spam pass and False for the ham
        # pass; each call returns (words, number of e-mails).
        spam_words, spam_mail_total = self.list_spam_ham_words(corpus, True)
        ham_words, ham_mail_total = self.list_spam_ham_words(corpus, False)

        mail_total = spam_mail_total + ham_mail_total
        self.portion_of_spam_emails = spam_mail_total / mail_total

        merged_words = join_spam_and_ham_words(spam_words, ham_words)
        self.all_words = Counter(merged_words)

        self.num_of_spam_words = len(spam_words)
        self.num_of_ham_words = len(ham_words)
        # Number of distinct keys in the merged counter.
        self.num_of_all_words = len(self.all_words)
Exemple #7
0
 def train(self, dir_path):
     """Train every sub-filter on one shared corpus built from *dir_path*.

     Filters are trained in the order strong, normal, word.
     """
     corpus = TrainingCorpus(dir_path)
     all_filters = self.strong_filters + self.normal_filters + self.word_filters
     for sub_filter in all_filters:
         sub_filter.train(corpus)
Exemple #8
0
 def train(self, directory):
     """Bind this filter to the training corpus found in *directory*.

     The corpus' ``spams`` and ``hams`` attributes are stored on the
     instance as they are (they are not called here), and ``trained`` is
     set to ``True``.
     """
     corpus = TrainingCorpus(directory)
     self.spams, self.hams = corpus.spams, corpus.hams
     self.trained = True
Exemple #9
0
from basefilter import WordFilter
from trainingcorpus import TrainingCorpus
import inspect
import wordfilters

# Train every filter class defined in the ``wordfilters`` module on the
# corpus in ./1 and print its ``bayes_val``.
corpus = TrainingCorpus('./1')
for name, obj in inspect.getmembers(wordfilters, inspect.isclass):
    # Skip classes that wordfilters merely imported from elsewhere.
    if obj.__module__ != "wordfilters":
        continue
    word_filter = obj()
    word_filter.train(corpus)
    print(name, word_filter.bayes_val)