def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in a corpus directory.

    Reads ``!truth.txt`` and ``!prediction.txt`` from ``corpus_dir``, builds
    a SPAM/OK confusion matrix from the two classifications and hands its
    four counts to ``quality_score``.

    :param corpus_dir: directory containing both classification files
    :return: the value ``quality_score`` returns for the tp/tn/fp/fn counts
    """
    # Truth is read first, prediction second - the same order the matrix
    # expects its two arguments in.
    classifications = [
        utils.read_classification_from_file(os.path.join(corpus_dir, name))
        for name in ('!truth.txt', '!prediction.txt')
    ]
    matrix = confmat.BinaryConfusionMatrix('SPAM', 'OK')
    matrix.compute_from_dicts(*classifications)
    counts = matrix.as_dict()
    return quality_score(*(counts[key] for key in ('tp', 'tn', 'fp', 'fn')))
def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in a corpus directory.

    Reads ``!truth.txt`` and ``!prediction.txt`` from ``corpus_dir``, builds
    a SPAM/OK confusion matrix from them and passes the matrix counts to
    ``quality_score`` as keyword arguments.

    The files are addressed by joined path rather than by changing the
    working directory: ``os.chdir`` affects the whole process and would
    leave it in ``corpus_dir`` if reading either file raised.

    :param corpus_dir: directory containing both classification files
    :return: the value returned by ``quality_score``
    """
    truth_dict = read_classification_from_file(
        os.path.join(corpus_dir, '!truth.txt'))
    pred_dict = read_classification_from_file(
        os.path.join(corpus_dir, '!prediction.txt'))
    cm = BinaryConfusionMatrix(pos_tag='SPAM', neg_tag='OK')
    cm.compute_from_dicts(truth_dict, pred_dict)
    return quality_score(**cm.as_dict())
def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in a corpus directory.

    :param corpus_dir: directory (as a string) that holds ``!truth.txt``
        and ``!prediction.txt``
    :return: the value ``quality_score`` returns for the tp/tn/fp/fn counts
        of the SPAM/OK confusion matrix
    """
    separator = os.path.sep
    truth_path = separator.join((corpus_dir, "!truth.txt"))
    expected = utils.read_classification_from_file(truth_path)
    prediction_path = separator.join((corpus_dir, "!prediction.txt"))
    guessed = utils.read_classification_from_file(prediction_path)

    matrix = confmat.BinaryConfusionMatrix("SPAM", "OK")
    matrix.compute_from_dicts(expected, guessed)
    counts = matrix.as_dict()
    tp, tn, fp, fn = (counts[key] for key in ("tp", "tn", "fp", "fn"))
    return quality_score(tp, tn, fp, fn)
def compute_quality_for_corpus(corpus_dir):
    """Compute quality_score() for the predictions stored in a corpus.

    Both ``!truth.txt`` and ``!prediction.txt`` are read from ``corpus_dir``
    and copied into plain dicts before being compared in a SPAM/OK
    confusion matrix.

    :param corpus_dir: directory containing both classification files
    :return: the value returned by ``quality_score``
    """
    confusion = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")

    truth_path = os.path.join(corpus_dir, "!truth.txt")
    truth = dict(read_classification_from_file(truth_path))
    prediction_path = os.path.join(corpus_dir, "!prediction.txt")
    prediction = dict(read_classification_from_file(prediction_path))

    confusion.compute_from_dicts(truth, prediction)
    return quality_score(**confusion.as_dict())
def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in a corpus directory.

    :param corpus_dir: directory containing ``!truth.txt`` and
        ``!prediction.txt``
    :return: the value returned by ``quality_score`` when called with the
        confusion-matrix counts as keyword arguments
    """
    truth_path = os.path.join(corpus_dir, '!truth.txt')
    expected = utils.read_classification_from_file(truth_path)
    prediction_path = os.path.join(corpus_dir, '!prediction.txt')
    predicted = utils.read_classification_from_file(prediction_path)

    # NOTE(review): pos_tag / neg_tag are not defined in this function; they
    # are presumably module-level names - confirm they exist in this file.
    matrix = BinaryConfusionMatrix(pos_tag, neg_tag)
    matrix.compute_from_dicts(expected, predicted)
    return quality_score(**matrix.as_dict())
def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in a corpus directory.

    :param corpus_dir: directory (as a string) containing ``!truth.txt``
        and ``!prediction.txt``
    :return: the value ``quality_score`` returns for the ``tp``, ``tn``,
        ``fp`` and ``fn`` attributes of the computed confusion matrix
    """
    truth_path, prediction_path = (
        corpus_dir + suffix for suffix in ('/!truth.txt', '/!prediction.txt')
    )
    expected = utils.read_classification_from_file(truth_path)
    predicted = utils.read_classification_from_file(prediction_path)
    matrix = compute_confusion_matrix(expected, predicted)
    return quality_score(matrix.tp, matrix.tn, matrix.fp, matrix.fn)
def compute_quality_for_corpus(corpus_dir):
    """Compute the quality of a filter from its predictions for a corpus.

    :param corpus_dir: directory with the e-mails, ``!truth.txt`` and
        ``!prediction.txt``
    :return: the value ``quality_score`` returns for the tp/tn/fp/fn counts
    """
    def load(file_name):
        # Both classification files live directly inside the corpus directory.
        return utils.read_classification_from_file(
            os.path.join(corpus_dir, file_name))

    expected = load('!truth.txt')
    predicted = load('!prediction.txt')

    # "SPAM" is the positive class, "OK" the negative one.
    matrix = confmat.BinaryConfusionMatrix("SPAM", "OK")
    matrix.compute_from_dicts(expected, predicted)
    counts = matrix.as_dict()
    tp, tn, fp, fn = counts['tp'], counts['tn'], counts['fp'], counts['fn']
    return quality_score(tp, tn, fp, fn)
def compute_quality_for_corpus(corpus_dir,
                               fn_weight=1,
                               fp_weight=10,
                               pos_tag="SPAM",
                               neg_tag="OK"):
    """Compute a weighted quality value for the predictions in a corpus.

    The result is ``(tp + tn) / (fn_weight*fn + fp_weight*fp + tp + tn)``.
    The four confusion-matrix counts are also printed to stdout.

    :param corpus_dir: directory (as a string) containing ``!truth.txt``
        and ``!prediction.txt``
    :param fn_weight: multiplier applied to the false-negative count
    :param fp_weight: multiplier applied to the false-positive count
    :param pos_tag: label treated as the positive class
    :param neg_tag: label treated as the negative class
    :return: the weighted quality value
    :raises ZeroDivisionError: when the denominator evaluates to zero
    """
    confusion = confmat.BinaryConfusionMatrix(pos_tag, neg_tag)
    expected = read_classification_from_file(corpus_dir + "/!truth.txt")
    predicted = read_classification_from_file(corpus_dir + "/!prediction.txt")
    confusion.compute_from_dicts(expected, predicted)

    print(f"Fp: {confusion.fp}\nFn: {confusion.fn}\nTp: {confusion.tp}\nTn: {confusion.tn}")

    correct = confusion.tp + confusion.tn
    weighted_errors = fn_weight * confusion.fn + fp_weight * confusion.fp
    # Summed in the same left-to-right order as a flat expression would be.
    denominator = weighted_errors + confusion.tp + confusion.tn
    return correct / denominator
def compute_quality_for_corpus(corpus_dir):
    """Compute quality_score() for the predictions found in a corpus.

    :param corpus_dir: directory containing ``!truth.txt`` and
        ``!prediction.txt``
    :return: the value returned by ``quality_score`` when given the
        SPAM/OK confusion-matrix counts as keyword arguments
    """
    def load(file_name):
        # Each classification is copied into a plain dict after reading.
        full_path = os.path.join(corpus_dir, file_name)
        return dict(read_classification_from_file(full_path))

    matrix = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")
    matrix.compute_from_dicts(load("!truth.txt"), load("!prediction.txt"))
    counts = matrix.as_dict()
    return quality_score(**counts)
def compute_quality_for_corpus(corpus_dir):
    '''
    Calculate the quality of the predictions stored in a corpus.

    :param corpus_dir: path to the corpus (string); TRUTH_FILE and
        PREDICTION_FILE are looked up directly inside it
    :return: the value quality_score returns for the tp/tn/fp/fn counts
    '''
    truth_path = corpus_dir + "/" + TRUTH_FILE
    expected = utils.read_classification_from_file(truth_path)
    prediction_path = corpus_dir + "/" + PREDICTION_FILE
    predicted = utils.read_classification_from_file(prediction_path)

    matrix = confmat.BinaryConfusionMatrix(pos_tag=SPAM_TAG, neg_tag=HAM_TAG)
    matrix.compute_from_dicts(truth_dict=expected, pred_dict=predicted)
    return quality_score(matrix.tp, matrix.tn, matrix.fp, matrix.fn)
def train(self, dir):
    """Estimate a per-word spamicity from a labelled training directory.

    Every file in ``dir`` whose name does not start with ``!`` is read as
    an e-mail. For each mail the set of its tokens is counted once into
    ``self.spams`` or ``self.hams`` depending on the mail's label in
    ``!truth.txt``. Finally ``self.spamicity[word]`` is set to

        P(word|spam) * P(spam)
        / (P(word|spam) * P(spam) + P(word|ham) * P(ham))

    for every word seen in either class.

    :param dir: training directory (string) containing the mails and
        ``!truth.txt``
    :raises KeyError: if a mail file has no entry in ``!truth.txt``
    :raises ZeroDivisionError: if the directory lacks SPAM mails or lacks
        non-SPAM mails
    """
    classification = utils.read_classification_from_file(dir + "/!truth.txt")

    # Load all mails up front; each file is closed as soon as it is read
    # (handles used to be left open for the garbage collector to reclaim).
    mail_contents = {}
    for filename in os.listdir(dir):
        if filename.startswith("!"):
            continue  # "!"-prefixed files are metadata, not mails
        with open(dir + "/" + filename, 'r', encoding="utf8") as mail_file:
            mail_contents[filename] = mail_file.read()

    spam_total = 0
    ham_total = 0
    for file_name, email_content in mail_contents.items():
        is_spam = classification[file_name] == "SPAM"
        if is_spam:
            spam_total += 1
        else:
            ham_total += 1
        # NOTE(review): self.spams / self.hams are incremented for unseen
        # words, so they are presumably Counter/defaultdict(int) - confirm.
        word_counts = self.spams if is_spam else self.hams
        # set(): a word counts once per mail, however often it occurs in it.
        for word in set(self.get_tokens(email_content)):
            word_counts[word] += 1

    spam_probability = spam_total / (spam_total + ham_total)
    ham_probability = 1 - spam_probability

    for word in set(self.spams.keys()) | set(self.hams.keys()):
        spam_part = self.spams[word] / spam_total * spam_probability
        ham_part = self.hams[word] / ham_total * ham_probability
        self.spamicity[word] = spam_part / (spam_part + ham_part)
def is_tag(self, ename, whichtag):
    """Tell whether the truth file labels mail ``ename`` with ``whichtag``.

    The truth file (``TRUTHFILE`` inside ``self.path_to_mails``) is read
    anew on every call.

    :param ename: mail file name to look up
    :param whichtag: label to compare against
    :return: True if the recorded label equals ``whichtag``, else False
    :raises KeyError: if ``ename`` is not listed in the truth file
    """
    truth_path = self.path_to_mails + '/' + TRUTHFILE
    labels = read_classification_from_file(truth_path)
    return bool(labels[ename] == whichtag)
def test_atom_filter(initialized_filter, train_dir, test_dir):
    """Train a filter on one corpus and score it on another.

    The filter is trained on ``TrainingCorpus(train_dir)`` and asked for a
    result for every mail in ``Corpus(test_dir)``. A result of ``-1`` means
    no prediction is recorded for that mail; a result above
    ``POSITIVITY_THRESHOLD`` is recorded as ``POSITIVE``, anything else as
    ``NEGATIVE``. The confusion-matrix counts are printed.

    :param initialized_filter: object providing ``train(corpus)`` and
        ``test(mail)``
    :param train_dir: directory of the training corpus
    :param test_dir: directory (string) of the test corpus, containing
        ``TRUTHFILE``
    :return: the value ``quality_score`` returns for the tp/tn/fp/fn counts
    """
    training_corpus = TrainingCorpus(train_dir)
    testing_corpus = Corpus(test_dir)
    atom_filter = initialized_filter
    atom_filter.train(training_corpus)

    prediction = {}
    for name, mail in testing_corpus.emails():
        verdict = atom_filter.test(mail)
        if verdict == -1:
            continue
        prediction[name] = POSITIVE if verdict > POSITIVITY_THRESHOLD else NEGATIVE

    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)
    counts = conf_matrix.as_dict()

    # For testing purposes
    print(counts)

    return quality_score(counts['tp'], counts['tn'], counts['fp'], counts['fn'])
def train(self, file_path, batch_size=10, learning_rate=0.1, lr_decay=0.05,
          epochs=1000, momentum=0.0, tuning=False):
    """Train on a labelled corpus with mini-batch gradient descent.

    All mails of ``Corpus(file_path)`` are paired with a 0/1 target
    (1 when the truth label equals ``self.pos_tag``) and grouped into
    batches of ``batch_size``; the last batch may be smaller. Each epoch
    resets the momentums, runs ``self.gradient_descent`` once per batch
    and then multiplies the learning rate by ``1 / (1 + lr_decay * e)``.

    Analogous to the PLR filter's ``train``. With ``tuning=True`` the
    current learning rate is printed every epoch and a plot of the mean
    per-batch loss over the epochs is shown at the end.

    :param file_path: corpus directory (string) containing ``!truth.txt``
    :param batch_size: number of mails per batch, must be at least 1
    :param learning_rate: initial learning rate
    :param lr_decay: decay factor used in the per-epoch rate update
    :param epochs: number of passes over all batches
    :param momentum: momentum value forwarded to ``gradient_descent``
    :param tuning: plot the mean loss per epoch when true
    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        # A non-positive size can never consume the mail iterator.
        raise ValueError("batch_size must be at least 1")

    if tuning:
        og_lr = learning_rate  # original learning rate, shown in the plot title
        x_plt = []  # x-axis (epochs)
        y_plt = []  # y-axis (mean loss)

    corpus = Corpus(file_path)
    truth_dict = utils.read_classification_from_file(file_path + "/!truth.txt")

    # Group (mail, target) pairs into batches. Only non-empty batches are
    # kept, so no gradient step is ever taken on zero samples when the
    # corpus size is a multiple of batch_size (or the corpus is empty).
    batches = []
    batch = []
    for email in corpus.emails():
        target = 1 if truth_dict[email[0]] == self.pos_tag else 0
        batch.append((email[1], target))
        if len(batch) == batch_size:
            batches.append(batch)
            batch = []
    if batch:
        batches.append(batch)

    for e in range(epochs):
        if tuning:
            print(learning_rate)
        self.init_momentums()
        loss = 0
        for batch in batches:
            batch_vectors = [m[0].get_feature_vector_lr() for m in batch]
            y = [m[1] for m in batch]
            loss += self.gradient_descent(y, batch_vectors, learning_rate,
                                          momentum)
        print(f"trained on epoch #{e + 1}")
        learning_rate *= 1 / (1 + lr_decay * e)
        if tuning:
            # One step per batch; guard the empty-corpus case.
            y_plt.append(loss / len(batches) if batches else 0.0)
            x_plt.append(e)

    if tuning:
        plt.plot(x_plt, y_plt)
        plt.title(
            f"lr:{og_lr} lrd:{lr_decay} bs:{batch_size} m: {momentum} e:{epochs}"
        )
        plt.xlabel("epochs")
        plt.ylabel("mean loss")
        plt.show()
def count_spams(email_adress):
    """Count the entries labelled 'SPAM' in a corpus truth file.

    :param email_adress: corpus directory (string) containing ``!truth.txt``
    :return: number of entries whose label is exactly ``'SPAM'``
    """
    truth_path = str(email_adress + '/!truth.txt')
    labels = read_classification_from_file(truth_path)
    return sum(1 for tag in labels.values() if tag == 'SPAM')
def is_ham(self, email_soubor):
    """Report whether the truth file labels a mail as ham.

    The truth file (``!truth.txt`` inside ``self.email_adress``) is read
    on every call and the mail is looked up directly by name.

    :param email_soubor: file name of the mail to look up
    :return: ``True`` if the mail is labelled ``'OK'``, ``False`` if it is
        labelled ``'SPAM'``, ``None`` if it is not listed or carries any
        other label
    """
    truth = read_classification_from_file(
        str(self.email_adress + '/!truth.txt'))
    tag = truth.get(email_soubor)
    if tag == 'SPAM':
        return False
    if tag == 'OK':
        return True
    # Unknown mail or unexpected label: callers receive None, as before.
    return None
def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in a corpus directory.

    Reads ``TRUTHFILE`` and ``PREDFILE`` from ``corpus_dir``, builds a
    POSITIVE/NEGATIVE confusion matrix, prints its counts and returns the
    resulting quality score.

    :param corpus_dir: corpus directory (string)
    :return: the value ``quality_score`` returns for the tp/tn/fp/fn counts
    """
    expected = read_classification_from_file(corpus_dir + '/' + TRUTHFILE)
    predicted = read_classification_from_file(corpus_dir + '/' + PREDFILE)

    confusion = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    confusion.compute_from_dicts(expected, predicted)
    counts = confusion.as_dict()

    # Testing purposes
    print(counts)

    ordered = [counts[key] for key in ('tp', 'tn', 'fp', 'fn')]
    return quality_score(*ordered)
def assertPredictionFileExistsAndContainsClassificationFor(self, expected):
    """Assert that the prediction file exists and covers the expected mails.

    Checks that ``PREDICTION_FILENAME`` exists in ``CORPUS_DIR``, that it
    lists exactly the keys of ``expected``, and that every recorded value
    is either ``SPAM_TAG`` or ``HAM_TAG``.

    :param expected: mapping whose keys are the file names that must appear
        in the prediction file
    """
    fpath = os.path.join(CORPUS_DIR, PREDICTION_FILENAME)
    file_exists = os.path.isfile(fpath)
    self.assertTrue(
        file_exists,
        "The test() method did not create the !prediction.txt file.")

    observed = read_classification_from_file(fpath)
    expected_names = sorted(expected.keys())
    observed_names = sorted(observed.keys())
    self.assertEqual(
        expected_names,
        observed_names,
        'The !prediction.txt file does not contain decisions for the files it should.')

    allowed_tags = (SPAM_TAG, HAM_TAG)
    self.assertTrue(all(tag in allowed_tags for tag in observed.values()))
def test_correctlyFormattedFile(self):
    # Fixture: write a generated classification to FILENAME.
    written = create_classification()
    save_classification_to_file(written, FILENAME)

    # Exercise the SUT: read the same file back.
    loaded = read_classification_from_file(FILENAME)

    # Validate: the round trip must reproduce the classification exactly.
    self.assertDictEqual(
        written,
        loaded,
        msg='The read file contents are not equal to the expected contents.')
def test_returnEmptyDict_forEmptyFile(self):
    # Fixture: saving an empty classification to FILENAME.
    nothing = dict()
    save_classification_to_file(nothing, FILENAME)

    # Exercise the SUT: read the file back.
    loaded = read_classification_from_file(FILENAME)

    # Validate: the result must be an empty dict as well.
    self.assertDictEqual(
        nothing,
        loaded,
        msg='The read dictionary shall be empty for empty file.')
def test_correctlyFormattedFile(self):
    # Fixture: write a generated classification to FILENAME.
    written = create_classification()
    save_classification_to_file(written, FILENAME)

    # Exercise the SUT inside replaced_open().
    # NOTE(review): replaced_open() presumably swaps out the built-in open
    # for the duration of the read - confirm against its definition.
    with replaced_open():
        loaded = read_classification_from_file(FILENAME)

    # Validate: the round trip must reproduce the classification exactly.
    self.assertDictEqual(
        written,
        loaded,
        msg='The read file contents are not equal to the expected contents.')
def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in a corpus directory.

    Reads ``!truth.txt`` and ``!prediction.txt`` from ``corpus_dir``,
    builds a SPAM/OK confusion matrix and returns the quality score of
    its four counts.

    The two files are opened directly by name. Scanning the directory
    listing instead made the result depend on listing order: any other
    file with ``!`` in its name reset both classifications to ``None``,
    and a missing file left a variable unbound.

    :param corpus_dir: directory containing both classification files
    :return: the value ``quality_score`` returns for the tp/tn/fp/fn counts
    """
    truth_dict = read_classification_from_file(
        os.path.join(corpus_dir, "!truth.txt"))
    pred_dict = read_classification_from_file(
        os.path.join(corpus_dir, "!prediction.txt"))

    cm1 = BinaryConfusionMatrix(pos_tag="SPAM", neg_tag="OK")
    cm1.compute_from_dicts(truth_dict, pred_dict)
    final_dict = cm1.as_dict()
    return quality_score(final_dict['tp'], final_dict['tn'],
                         final_dict['fp'], final_dict['fn'])
def train(self, train_corpus_dir):
    """
    Train the filter on the given e-mail dataset.

    Stores the truth classification in ``self.truth`` and then runs the
    classification steps for every header named in ``email_keys`` and
    for the payload.

    :param train_corpus_dir: directory of the training corpus; it is
        passed unchanged both to the classification reader and to Corpus
    """
    # load truth
    self.truth = utils.read_classification_from_file(train_corpus_dir)
    corpus = Corpus(train_corpus_dir)

    self.get_SPAM_percentage(corpus)  # not in use now

    # get values for parts of email header
    for header_name in email_keys:
        self.classify_part(corpus, header_name.lower())

    # get values for email payload/body
    self.classify_payload(corpus)
def test_returnEmptyDict_forEmptyFile(self):
    # Fixture: saving an empty classification to FILENAME.
    nothing = dict()
    save_classification_to_file(nothing, FILENAME)

    # Exercise the SUT.
    with replaced_open():  # Insist on explicit use of encoding
        loaded = read_classification_from_file(FILENAME)

    # Validate: the result must be an empty dict as well.
    self.assertDictEqual(
        nothing,
        loaded,
        msg='The read dictionary shall be empty for empty file.')
def train(self, file_path, batch_size=10, learning_rate=0.1, lr_decay=0.05,
          epochs=1000, momentum=0.0):
    """Train on a labelled corpus with mini-batch gradient descent.

    All mails of ``Corpus(file_path)`` are paired with a 0/1 target
    (1 when the truth label equals ``self.pos_tag``) and grouped into
    batches of ``batch_size``; the last batch may be smaller. Each epoch
    resets the momentums and, for every batch, runs
    ``self.gradient_descent`` once per subvector index, so the weights of
    each subvector are trained separately. After every epoch the learning
    rate is multiplied by ``1 / (1 + lr_decay * e)``.

    :param file_path: corpus directory (string) containing ``!truth.txt``
    :param batch_size: number of mails per batch, must be at least 1
    :param learning_rate: initial learning rate
    :param lr_decay: decay factor used in the per-epoch rate update
    :param epochs: number of passes over all batches
    :param momentum: momentum value forwarded to ``gradient_descent``
    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        # A non-positive size can never consume the mail iterator.
        raise ValueError("batch_size must be at least 1")

    corpus = Corpus(file_path)
    truth_dict = utils.read_classification_from_file(file_path + "/!truth.txt")

    # Load all data from the directory in batches of the given size. Only
    # non-empty batches are kept, so no gradient step is taken on zero
    # samples when the corpus size is a multiple of batch_size.
    batches = []
    batch = []
    for email in corpus.emails():
        target = 1 if truth_dict[email[0]] == self.pos_tag else 0
        batch.append((email[1], target))
        if len(batch) == batch_size:
            batches.append(batch)
            batch = []
    if batch:
        batches.append(batch)

    for e in range(epochs):  # trains multiple times on all batches
        self.init_momentums()
        for batch in batches:
            # feature vectors and truth vector of the batch
            feature_vectors = [m[0].get_feature_vector_plr() for m in batch]
            y = [m[1] for m in batch]
            for i in range(self.subvector_count):
                # isolate the i-th subvector from all vectors
                subvector_batch = [v[i] for v in feature_vectors]
                self.gradient_descent(i, y, subvector_batch, learning_rate,
                                      momentum)
        print(f"trained on epoch #{e + 1}")
        learning_rate *= 1 / (1 + lr_decay * e)
def train(self, path):
    """
    Train the filter on the given e-mail dataset.

    Stores the truth classification, the share of spam mails, a Counter
    over the joined spam and ham words, and the lengths of the spam word
    collection, the ham word collection and the combined counter.

    :param path: directory (string) with the e-mails and ``!truth.txt``
    :raises ZeroDivisionError: if both reported mail counts are zero
    """
    self.truth_dict = read_classification_from_file(path + "/!truth.txt")
    corpus = TrainingCorpus(path)

    # The second argument selects spam (True) or ham (False).
    spam_words, spam_mail_count = self.list_spam_ham_words(corpus, True)
    ham_words, ham_mail_count = self.list_spam_ham_words(corpus, False)

    mail_count = spam_mail_count + ham_mail_count
    self.portion_of_spam_emails = spam_mail_count / mail_count

    joined_words = join_spam_and_ham_words(spam_words, ham_words)
    self.all_words = Counter(joined_words)

    self.num_of_spam_words = len(spam_words)
    self.num_of_ham_words = len(ham_words)
    self.num_of_all_words = len(self.all_words)
def train(self, file_path):
    """Build per-word relative frequencies for spam and ham mail bodies.

    For every mail of ``Corpus(file_path)`` the words of its
    ``content_no_html`` are counted into a spam or a ham counter,
    depending on whether the truth label equals ``self.pos_tag``.
    Afterwards every seen word gets one extra count in both counters and
    its count divided by the total word count of the respective class is
    stored in ``self.content_spam_dict`` / ``self.content_ham_dict``.
    The class priors are stored in ``self.spam_probability`` and
    ``self.ham_probability`` and ``self.trained`` is set.

    :param file_path: corpus directory (string) containing ``!truth.txt``
    :raises ZeroDivisionError: if a class has no words while the
        vocabulary is non-empty, or if the corpus has no mails
    """
    self.content_spam_dict = {}
    self.content_ham_dict = {}
    class_dict = utils.read_classification_from_file(file_path + '/!truth.txt')
    corpus = Corpus(file_path)
    mails = corpus.emails()

    spam_counts, ham_counts = Counter(), Counter()
    spam_word_total = ham_word_total = 0
    spam_mails = ham_mails = 0
    vocabulary = set()

    for mail in mails:
        words = self.string_to_words(mail[1].content_no_html)
        mail_counts = Counter(words)
        vocabulary.update(words)
        if class_dict[mail[0]] == self.pos_tag:
            spam_mails += 1
            spam_counts += mail_counts
            spam_word_total += len(words)
        else:
            ham_mails += 1
            ham_counts += mail_counts
            ham_word_total += len(words)

    for word in vocabulary:
        # One extra count per class so no seen word ends up with frequency
        # zero; the denominators are the raw word totals.
        ham_counts[word] += 1
        spam_counts[word] += 1
        self.content_spam_dict[word] = spam_counts[word] / spam_word_total
        self.content_ham_dict[word] = ham_counts[word] / ham_word_total

    mail_total = spam_mails + ham_mails
    self.spam_probability = spam_mails / mail_total
    self.ham_probability = ham_mails / mail_total
    self.trained = True
def train(self, train_dir):
    """Learn a per-word "spaminess" from a labelled training directory.

    Every file listed in ``!truth.txt`` is loaded and its words are
    counted into the spam or the ham counter, depending on whether its
    label equals ``self.decision_table[1]``. For every vocabulary word
    ``self.word_spaminess`` is then set to 0.99 (seen only in spam),
    0.01 (seen only in ham) or the ratio of its spam likelihood to the
    sum of both likelihoods, floored at 0.01.

    :param train_dir: training directory (string) containing the mails
        and ``!truth.txt``
    """
    self.train_files_dict = read_classification_from_file(train_dir + '/!truth.txt')
    total_emails = len(self.train_files_dict)

    # Counting spam and ham word appearances
    for file, label in self.train_files_dict.items():
        mail = self.get_email(train_dir + '/' + file)
        mail_words = self.get_email_message(mail)
        mail_unique_words = set(mail_words)
        if label == self.decision_table[1]:
            self.spam_words_counter.update(mail_words)
            self.total_spam_emails += 1
        else:
            self.ham_words_counter.update(mail_words)
        self.words_counter.update(mail_words)
        self.vocabulary.update(mail_unique_words)

    self.total_ham_emails = total_emails - self.total_spam_emails

    # Computing the probability that a message containing a given word is spam.
    for word in self.vocabulary:
        spam_hits = self.spam_words_counter.get(word, 0)
        ham_hits = self.ham_words_counter.get(word, 0)
        if ham_hits == 0 and spam_hits > 0:
            spaminess = 0.99
        elif ham_hits > 0 and spam_hits == 0:
            spaminess = 0.01
        else:
            spam_likelihood = spam_hits / self.total_spam_emails
            ham_likelihood = ham_hits / self.total_ham_emails
            spaminess = max(
                spam_likelihood / (spam_likelihood + ham_likelihood), 0.01)
        self.word_spaminess[word] = spaminess
def get_class(self, filename):
    """Return the truth label recorded for ``filename``.

    The truth file at ``self.path + TRUTH`` is read on every call.

    :param filename: mail file name to look up
    :return: the label stored for that file
    :raises KeyError: if the file is not listed in the truth file
    """
    truth_path = self.path + TRUTH
    return read_classification_from_file(truth_path)[filename]
def __init__(self, path_to_train):
    """Remember the training directory and load its truth classification.

    :param path_to_train: directory containing ``!truth.txt``
    """
    truth_path = os.path.join(path_to_train, '!truth.txt')
    self.path_to_train = path_to_train
    self.path_to_truth = truth_path
    self.truth_dic = utils.read_classification_from_file(truth_path)
def train(self, training_corpus_path):
    """Label every training mail with ``self.table[1]``.

    The file names come from the corpus's ``!truth.txt``; the labels
    stored there are replaced by ``self.table[1]`` for every file.

    ``dict.fromkeys`` builds and returns a *new* dict and leaves the dict
    it is called on untouched, so its result has to be assigned;
    otherwise ``self.dictionary`` would simply keep the truth labels.

    :param training_corpus_path: corpus directory (string) containing
        ``!truth.txt``
    """
    truth = read_classification_from_file(training_corpus_path + '/!truth.txt')
    self.dictionary = dict.fromkeys(truth, self.table[1])
def train(self, training_corpus_path):
    """Assign every training mail a label picked by ``choice(self.table)``.

    The file names come from the corpus's ``!truth.txt``; the labels
    stored there are discarded.

    :param training_corpus_path: corpus directory (string) containing
        ``!truth.txt``
    """
    truth_path = training_corpus_path + '/!truth.txt'
    self.dictionary = read_classification_from_file(truth_path)

    randomized = {}
    for file_name in self.dictionary:
        randomized[file_name] = choice(self.table)
    self.dictionary = randomized
def __init__(self, path):
    """Initialise the corpus and load its truth classification.

    :param path: corpus directory (string) containing ``!truth.txt``
    """
    Corpus.__init__(self, path)
    self.path = path
    truth_file = self.path + "/!truth.txt"
    self.truth_dict = read_classification_from_file(truth_file)
def get_class(self, filename):
    """Look up the truth label of a single mail.

    Reads the truth file at ``self.path + TRUTH`` each time it is called.

    :param filename: mail file name to look up
    :return: the label stored for that file
    :raises KeyError: if the file is not listed in the truth file
    """
    labels = read_classification_from_file(self.path + TRUTH)
    label = labels[filename]
    return label
def train(self, path_to_training_corpus):
    """Load the training classification and keep it on the instance.

    :param path_to_training_corpus: value passed unchanged to
        ``utils.read_classification_from_file``
    """
    loaded = utils.read_classification_from_file(path_to_training_corpus)
    self.trained_data_dict = loaded
def get_truth_class(self):
    """Yield ``(body, label)`` for every mail of the corpus.

    Labels are taken from the truth file ``TRUTH`` inside ``self.path``.
    As this is a generator, the existence check and the file read only
    run once iteration starts.

    :return: generator of ``(body, label)`` tuples
    :raises FileExistsError: if the truth file is missing
    :raises KeyError: if a mail has no entry in the truth file
    """
    truth_path = os.path.join(self.path, TRUTH)
    if not os.path.isfile(truth_path):
        # NOTE(review): FileNotFoundError would describe this condition
        # better; the type is kept because callers may catch it.
        raise FileExistsError("File " + truth_path + " does not exists!")

    labels = utils.read_classification_from_file(truth_path)
    for name, body in self.emails():
        yield (body, labels[name])
f.write(part + " " + key + " " + str(self.classification[part][key]) + "\n")


def convert(file, out):
    """
    Swap the two columns of a truth file.

    I used this to convert the truth file for emails I found on internet,
    where the SPAM or HAM was before file name. Every non-blank input line
    must consist of exactly two whitespace-separated fields; the output
    has one line per distinct second field, written as
    "<second> <first>".

    :param file: path of the input file (label first, file name second)
    :param out: path of the output file (file name first, label second)
    :raises ValueError: if a non-blank line does not have exactly two fields
    """
    dic = {}
    with open(file, 'r', encoding="utf-8") as f:
        # Iterate lazily instead of loading the whole file via readlines().
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no entry and
                # would otherwise break the two-field unpacking below.
                continue
            key, val = line.split()
            dic[val] = key
    with open(out, 'w', encoding="utf-8") as f:
        f.writelines(key + " " + dic[key] + "\n" for key in dic)


if __name__ == "__main__":  # used for testing and debugging
    spam_filter = MyFilter()
    spam_filter.test("SPAM-data/2/")
    confusion_matrix = BinaryConfusionMatrix('SPAM', 'OK')
    confusion_matrix.compute_from_dicts(
        utils.read_classification_from_file("SPAM-data/2/"),
        spam_filter.predictions)
    print("Quality: %.2f%%" % (confusion_matrix.quality_score() * 100))
    utils.clean_up("SPAM-data/2/")  # clean !truth