def setUp(self):
    """Create the fake corpus on disk, together with its !truth.txt file.

    Afterwards ``self.email_dict`` holds the generated e-mails,
    ``self.true_class`` their expected classes and ``self.tc`` the
    ``TrainingCorpus`` under test.
    """
    emails = self.email_dict = create_corpus_dictionary()
    expected = self.true_class = create_classification_for(emails.keys())
    create_corpus_dir_from_dictionary(emails)
    save_classification_to_file(
        expected,
        fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME),
    )
    # The corpus object is constructed while replaced_open() is active.
    with replaced_open():
        self.tc = TrainingCorpus(CORPUS_DIR)
def train(self, path):
    """Collect words that most likely indicate spam into ``self.bad_words``.

    A word taken from the MOST_COMMON_S most frequent spam words is appended
    to ``self.bad_words`` when either

    * it does not occur anywhere (substring match) in the concatenated,
      unmodified ham bodies, or
    * it is longer than CHECKED_WORD_LEN characters, is also one of the
      MOST_COMMON_H most frequent ham words, and its spam count is more
      than FACTOR times its ham count.

    Words are obtained by replacing '.' with a space, lower-casing and
    splitting on single spaces.

    :param path: handed to ``TrainingCorpus``
    :return: None
    """
    # These constants worked the best.
    MOST_COMMON_S = 600
    MOST_COMMON_H = 5000
    CHECKED_WORD_LEN = 12
    FACTOR = 20

    # Build the translation table once instead of once per e-mail.
    dots_to_spaces = str.maketrans('.', ' ')

    def split_words(body):
        return body.translate(dots_to_spaces).lower().split(' ')

    tc = TrainingCorpus(path)

    # Hams: keep the raw bodies for the substring test and count the words.
    # Counting incrementally avoids re-copying an ever-growing word list.
    ham_bodies = []
    ham_counter = Counter()
    for _fname, body in tc.hams():
        ham_bodies.append(body)
        ham_counter.update(split_words(body))
    ham_string = ''.join(ham_bodies)
    ham_words_dict = dict(ham_counter.most_common(MOST_COMMON_H))

    # Spams: only the word counts are needed.
    spam_counter = Counter()
    for _fname, body in tc.spams():
        spam_counter.update(split_words(body))
    spam_words_dict = dict(spam_counter.most_common(MOST_COMMON_S))

    # Select the bad words, in order of decreasing spam frequency.
    for word, spam_count in spam_words_dict.items():
        if word not in ham_string:
            self.bad_words.append(word)
        elif len(word) > CHECKED_WORD_LEN and word in ham_words_dict:
            if spam_count > ham_words_dict[word] * FACTOR:
                self.bad_words.append(word)
def test_atom_filter(initialized_filter, train_dir, test_dir):
    """Train a filter, classify a test corpus and return the quality score.

    The filter's ``train`` receives a ``TrainingCorpus`` built from
    ``train_dir``; its ``test`` is then called with every e-mail body of the
    ``Corpus`` built from ``test_dir``.  A result of -1 leaves the e-mail out
    of the prediction; a result above POSITIVITY_THRESHOLD is recorded as
    POSITIVE, anything else as NEGATIVE.  The prediction is compared with the
    truth file of ``test_dir`` and the confusion-matrix counts are printed
    and passed to ``quality_score``.

    :param initialized_filter: object providing ``train`` and ``test``
    :param train_dir: directory used to build the training corpus
    :param test_dir: directory with the e-mails to classify and the truth file
    :return: value returned by ``quality_score``
    """
    training_corpus = TrainingCorpus(train_dir)
    evaluated_corpus = Corpus(test_dir)
    atom_filter = initialized_filter
    atom_filter.train(training_corpus)

    prediction = {}
    for name, mail in evaluated_corpus.emails():
        result = atom_filter.test(mail)
        if result == -1:
            # No verdict is recorded for this e-mail.
            continue
        prediction[name] = POSITIVE if result > POSITIVITY_THRESHOLD else NEGATIVE

    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)
    counts = conf_matrix.as_dict()

    # For testing purposes
    print(counts)

    return quality_score(counts['tp'], counts['tn'], counts['fp'], counts['fn'])
def setUp(self):
    """Create the fake corpus on disk, together with its !truth.txt file.

    Afterwards ``self.email_dict`` holds the generated e-mails,
    ``self.true_class`` their expected classes and ``self.tc`` the
    ``TrainingCorpus`` under test.
    """
    emails = self.email_dict = create_corpus_dictionary()
    expected = self.true_class = create_classification_for(emails.keys())
    create_corpus_dir_from_dictionary(emails)
    save_classification_to_file(
        expected,
        fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME),
    )
    self.tc = TrainingCorpus(CORPUS_DIR)
def train(self, train_corpus_dir):
    '''
    Train the filter: store word statistics and e-mail counts on ``self``.

    Sets ``spam_word_count_dict`` / ``ham_word_count_dict`` (element 0 of the
    corpus getters' results), ``spam_word_count_avg`` / ``ham_word_count_avg``
    (element 1) and ``spam_count`` / ``ham_count``.

    :param train_corpus_dir: path to train dir
    :return: None
    '''
    corpus = TrainingCorpus(train_corpus_dir)
    # The result of this first call is not used; the call itself is kept
    # because it may prepare state inside the corpus -- TODO confirm.
    corpus.return_spam_ham_count()

    # Word frequencies: index 0 is the count dictionary, index 1 the average.
    spam_stats = corpus.get_spam_word_count_dict_and_avg()
    ham_stats = corpus.get_ham_word_count_dict_and_avg()
    self.spam_word_count_dict, self.ham_word_count_dict = spam_stats[0], ham_stats[0]
    self.spam_word_count_avg, self.ham_word_count_avg = spam_stats[1], ham_stats[1]

    # NOTE: a variant that removed the spam/ham intersection from both
    # dictionaries existed here as commented-out code; it is not active.

    # Total numbers of spam and ham e-mails reported by the corpus.
    totals = corpus.return_spam_ham_count()
    self.spam_count = totals[0]
    self.ham_count = totals[1]
def train(self, corpus_dir):
    """Learn the sender/subject headers of the e-mails listed in !truth.txt.

    ``corpus_dir`` is remembered in ``self.train_corpus_dir``.  When the
    directory contains no ``!truth.txt`` the method sets ``self.trained`` to
    False and returns without raising.  Otherwise each e-mail named in the
    corpus' ``truth_dict`` is parsed and its (sender, subject) pair is handed
    to ``save_ham_header`` when its tag equals ``self.neg_tag``, else to
    ``save_spam_header``.
    """
    self.train_corpus_dir = corpus_dir

    # Without the truth file there is nothing to learn from: leave quietly.
    if not os.path.exists(os.path.join(corpus_dir, '!truth.txt')):
        self.trained = False
        return

    corpus = TrainingCorpus(corpus_dir)
    for fname in corpus.truth_dict:
        sender, subject = corpus.parse_email(fname)
        is_ham = corpus.truth_dict[fname] == self.neg_tag
        save_header = self.save_ham_header if is_ham else self.save_spam_header
        save_header(sender, subject)
def train(self, email_adress):
    """Compute the word statistics used for classification.

    The results are stored in module-level globals:

    * ``all_words`` -- result of ``TrainingCorpus.all_words`` for the ham
      and spam words (all words with their count)
    * ``spam_words`` -- result of ``TrainingCorpus.spam_words`` (spam words
      with their count)
    * ``count_spams`` / ``count_emails`` -- number of spam e-mails / of all
      e-mails
    * ``probability_spam`` -- ``count_spams / count_emails``; 0 when the
      corpus contains no e-mails

    :param email_adress: path of the training directory (handed to
        ``TrainingCorpus``)
    :return: None
    """
    global all_words, spam_words, probability_spam, count_spams, count_emails

    # One corpus object serves all steps; it used to be rebuilt for each call.
    corpus = TrainingCorpus(email_adress)
    hwords = corpus.get_words(corpus.hams())
    swords = corpus.get_words(corpus.spams())

    all_words = corpus.all_words(hwords, swords)
    spam_words = corpus.spam_words(swords)

    # These two are invoked on the class with the directory as argument.
    count_spams = TrainingCorpus.count_spams(email_adress)
    count_emails = TrainingCorpus.count_emails(email_adress)

    # An empty corpus must not crash training with a ZeroDivisionError.
    probability_spam = count_spams / count_emails if count_emails else 0
def train(self, path):
    """
    Train the filter on the given e-mail dataset.

    Stores on ``self``: ``truth_dict`` (classification read from
    ``path + "/!truth.txt"``), ``portion_of_spam_emails``, ``all_words``
    (a ``Counter`` over the joined spam and ham words) and the lengths
    ``num_of_spam_words``, ``num_of_ham_words``, ``num_of_all_words``.

    :param path: directory with emails
    """
    self.truth_dict = read_classification_from_file(path + "/!truth.txt")
    corpus = TrainingCorpus(path)

    # The boolean flag selects spam (True) or ham (False) e-mails.
    spam_words, spam_email_count = self.list_spam_ham_words(corpus, True)
    ham_words, ham_email_count = self.list_spam_ham_words(corpus, False)

    email_total = spam_email_count + ham_email_count
    self.portion_of_spam_emails = spam_email_count / email_total

    joined_words = join_spam_and_ham_words(spam_words, ham_words)
    self.all_words = Counter(joined_words)

    self.num_of_spam_words = len(spam_words)
    self.num_of_ham_words = len(ham_words)
    self.num_of_all_words = len(self.all_words)
def test(self, email_adress):
    """Classify every e-mail in a directory and write the verdicts to a file.

    Exactly one line, ``<name> SPAM`` or ``<name> OK``, is written per e-mail
    to ``email_adress + '/!prediction.txt'``.

    Two modes, selected by the module-level ``probability_spam``:

    * equal to 0 (treated as "not trained"): an e-mail is SPAM when its body
      contains at least one of a fixed list of HTML tags;
    * otherwise: every word of the e-mail that is present in ``all_words``
      gets a Bayes estimate of "spam given this word" (0 for words absent
      from ``spam_words``); the e-mail is SPAM when the mean estimate
      exceeds 70 %.  An e-mail without any known word is reported as OK.

    :param email_adress: directory with the e-mails to classify
    :return: None
    """
    global all_words, spam_words, probability_spam, count_spams, count_emails

    html_words = ('<html>', '<p>', '</a>', '<br>', '<head>', '<meta>',
                  '<title>', '<body>')
    spam_threshold_percent = 70

    def spam_percentage(body):
        """Return the mean per-word spam estimate of one body, in percent."""
        estimates = []
        for word in TrainingCorpus.get_words_from_email(body):
            # Skip empty tokens and words never seen during training.
            if word == '' or word not in all_words:
                continue
            if word in spam_words:
                # Bayes' theorem:
                # P(spam | word) = P(word | spam) * P(spam) / P(word)
                estimate = (spam_words[word] / count_spams * probability_spam) / (
                    all_words[word] / count_emails)
            else:
                estimate = 0
            estimates.append(estimate)
        if not estimates:
            # No evidence at all; also avoids dividing by zero below.
            return 0
        return sum(estimates) / len(estimates) * 100

    trained = probability_spam != 0
    emails = Corpus(email_adress).emails()

    # The context manager closes the file even if classification fails.
    with open(email_adress + '/!prediction.txt', 'w', encoding="utf-8") as f:
        for fname, body in emails:
            if trained:
                is_spam = spam_percentage(body) > spam_threshold_percent
            else:
                is_spam = any(tag in body for tag in html_words)
            f.write(fname + (' SPAM\n' if is_spam else ' OK\n'))
def train(self, dir_path):
    """Train every sub-filter on one shared ``TrainingCorpus``.

    The filters are trained in the order strong, normal, word; each one's
    ``train`` receives the same corpus object.

    :param dir_path: handed to ``TrainingCorpus``
    """
    shared_corpus = TrainingCorpus(dir_path)
    sub_filters = self.strong_filters + self.normal_filters + self.word_filters
    for sub_filter in sub_filters:
        sub_filter.train(shared_corpus)
class TrainingCorpusTest(unittest.TestCase):
    """Tests of TrainingCorpus, run against a fake corpus created on disk."""

    def setUp(self):
        """Create the fake corpus on disk, together with its !truth.txt file."""
        emails = self.email_dict = create_corpus_dictionary()
        expected = self.true_class = create_classification_for(emails.keys())
        create_corpus_dir_from_dictionary(emails)
        save_classification_to_file(
            expected,
            fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME),
        )
        with replaced_open():
            self.tc = TrainingCorpus(CORPUS_DIR)

    def tearDown(self):
        """Remove the corpus directory again."""
        delete_corpus_directory()

    def _check_listing(self, listing, tag, wrong_class_msg, wrong_count_msg):
        """Check that ``listing()`` yields exactly the e-mails tagged ``tag``.

        Every yielded (name, contents) pair must carry the expected tag and
        the expected contents, and the number of pairs must equal the number
        of e-mails with that tag in ``self.true_class``.
        """
        observed = 0
        with replaced_open():
            for fname, contents in listing():
                observed += 1
                # Validate results
                self.assertEqual(self.true_class[fname], tag, wrong_class_msg)
                self.assertEqual(
                    self.email_dict[fname], contents,
                    'The read file contents are not equal to the expected contents.'
                )
        expected = Counter(self.true_class.values())[tag]
        self.assertEqual(expected, observed, wrong_count_msg)

    def test_getClass(self):
        """get_class() returns the class stored for each e-mail."""
        template = 'The expected class of email {} is {}, but {} was observed'
        for key, exp_class in self.true_class.items():
            with replaced_open():
                obs_class = self.tc.get_class(key)
            self.assertEqual(
                exp_class, obs_class,
                template.format(key, exp_class, obs_class))

    def test_isSpam(self):
        """is_spam() agrees with the expected classification."""
        template = 'The email {} spamminess: expected {}, observed {}.'
        for key, exp_class in self.true_class.items():
            exp_spam = (exp_class == SPAM_TAG)
            with replaced_open():
                obs_spam = self.tc.is_spam(key)
            self.assertEqual(
                exp_spam, obs_spam,
                template.format(key, str(exp_spam), str(obs_spam)))

    def test_isHam(self):
        """is_ham() agrees with the expected classification."""
        template = 'The email {} hamminess: expected {}, observed {}.'
        for key, exp_class in self.true_class.items():
            exp_ham = (exp_class == HAM_TAG)
            with replaced_open():
                obs_ham = self.tc.is_ham(key)
            self.assertEqual(
                exp_ham, obs_ham,
                template.format(key, str(exp_ham), str(obs_ham)))

    def test_spams(self):
        """spams() yields all spam e-mails and nothing else."""
        self._check_listing(
            self.tc.spams, SPAM_TAG,
            'Non-spam email returned by spams() method.',
            'The spams() method did not return the right number of spams.')

    def test_hams(self):
        """hams() yields all ham e-mails and nothing else."""
        self._check_listing(
            self.tc.hams, HAM_TAG,
            'Spam email returned by hams() method.',
            'The hams() method did not return the right number of hams.')
class TrainingCorpusTest(unittest.TestCase):
    """Tests of TrainingCorpus, run against a fake corpus created on disk."""

    def setUp(self):
        """Create the fake corpus on disk, together with its !truth.txt file."""
        emails = self.email_dict = create_corpus_dictionary()
        expected = self.true_class = create_classification_for(emails.keys())
        create_corpus_dir_from_dictionary(emails)
        save_classification_to_file(
            expected,
            fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME),
        )
        self.tc = TrainingCorpus(CORPUS_DIR)

    def tearDown(self):
        """Remove the corpus directory again."""
        delete_corpus_directory()

    def _check_listing(self, listing, tag, wrong_class_msg, wrong_count_msg):
        """Check that ``listing()`` yields exactly the e-mails tagged ``tag``.

        Every yielded (name, contents) pair must carry the expected tag and
        the expected contents, and the number of pairs must equal the number
        of e-mails with that tag in ``self.true_class``.
        """
        observed = 0
        for fname, contents in listing():
            observed += 1
            # Validate results
            self.assertEqual(self.true_class[fname], tag, wrong_class_msg)
            self.assertEqual(
                self.email_dict[fname],
                contents,
                "The read file contents are not equal to the expected contents.",
            )
        expected = Counter(self.true_class.values())[tag]
        self.assertEqual(expected, observed, wrong_count_msg)

    def test_getClass(self):
        """get_class() returns the class stored for each e-mail."""
        template = "The expected class of email {} is {}, but {} was observed"
        for key, exp_class in self.true_class.items():
            obs_class = self.tc.get_class(key)
            self.assertEqual(
                exp_class,
                obs_class,
                template.format(key, exp_class, obs_class),
            )

    def test_isSpam(self):
        """is_spam() agrees with the expected classification."""
        template = "The email {} spamminess: expected {}, observed {}."
        for key, exp_class in self.true_class.items():
            exp_spam = exp_class == SPAM_TAG
            obs_spam = self.tc.is_spam(key)
            self.assertEqual(
                exp_spam,
                obs_spam,
                template.format(key, str(exp_spam), str(obs_spam)),
            )

    def test_isHam(self):
        """is_ham() agrees with the expected classification."""
        template = "The email {} hamminess: expected {}, observed {}."
        for key, exp_class in self.true_class.items():
            exp_ham = exp_class == HAM_TAG
            obs_ham = self.tc.is_ham(key)
            self.assertEqual(
                exp_ham,
                obs_ham,
                template.format(key, str(exp_ham), str(obs_ham)),
            )

    def test_spams(self):
        """spams() yields all spam e-mails and nothing else."""
        self._check_listing(
            self.tc.spams,
            SPAM_TAG,
            "Non-spam email returned by spams() method.",
            "The spams() method did not return the right number of spams.",
        )

    def test_hams(self):
        """hams() yields all ham e-mails and nothing else."""
        self._check_listing(
            self.tc.hams,
            HAM_TAG,
            "Spam email returned by hams() method.",
            "The hams() method did not return the right number of hams.",
        )
def train(self, directory):
    """Remember the spam/ham accessors of the training corpus.

    A ``TrainingCorpus`` is built from ``directory``; its ``spams`` and
    ``hams`` attributes are stored on ``self`` (they are not called here)
    and ``self.trained`` is set to True.
    """
    corpus = TrainingCorpus(directory)
    self.spams, self.hams = corpus.spams, corpus.hams
    self.trained = True
from basefilter import WordFilter
from trainingcorpus import TrainingCorpus
import inspect
import wordfilters

# Train every class defined in the ``wordfilters`` module on the corpus in
# './1' and print the class name together with the resulting ``bayes_val``.
corpus = TrainingCorpus('./1')
for name, candidate in inspect.getmembers(wordfilters):
    # Skip non-classes and classes that wordfilters merely imported.
    if not inspect.isclass(candidate) or candidate.__module__ != "wordfilters":
        continue
    word_filter = candidate()
    word_filter.train(corpus)
    print(name, word_filter.bayes_val)