def compute_quality_for_corpus(corpus_dir):
    """Score the predictions stored in *corpus_dir* against its truth file.

    Reads ``!truth.txt`` and ``!prediction.txt`` from *corpus_dir*, builds a
    ``BinaryConfusionMatrix`` with 'SPAM' and 'OK' as its two labels, and
    returns the tuple ``(quality_score(tp, tn, fp, fn), tp, tn, fp, fn)``.
    """
    # Truth is read first, then the predictions.
    expected, predicted = (
        methods.read_classification_from_file(
            methods.add_slash(corpus_dir) + file_name
        )
        for file_name in ("!truth.txt", "!prediction.txt")
    )

    matrix = BinaryConfusionMatrix('SPAM', 'OK')
    matrix.compute_from_dicts(expected, predicted)
    counts = matrix.as_dict()

    # Pull the four confusion-matrix cells out of the dict by key.
    fn, tn, fp, tp = (counts[key] for key in ('fn', 'tn', 'fp', 'tp'))
    return quality_score(tp, tn, fp, fn), tp, tn, fp, fn
def train(self, path_to_truth_dir):
    """Build the sender and subject lists from a labelled corpus.

    Reads ``!truth.txt`` from *path_to_truth_dir* and stores it on
    ``self.truth``. Every email in the corpus is then parsed with the
    ``email`` package and handed to ``extract_senders_list`` and
    ``check_subject``. Finally the black list, white list, spam-subject
    list and ham-subject list are written to ``self.path_bl``,
    ``self.path_wl``, ``self.path_ssl`` and ``self.path_hsl``.

    Args:
        path_to_truth_dir: Directory holding the emails and ``!truth.txt``.
    """
    corpus = Corpus(path_to_truth_dir)

    # Read truth file
    truth = methods.read_classification_from_file(
        methods.add_slash(path_to_truth_dir) + "!truth.txt")
    # Expose the truth on the instance (presumably read by the helper
    # methods called below -- they are not given it as an argument).
    self.truth = truth

    # Only the file names are needed here; each email is re-read from disk
    # so it can be parsed from a file object.
    for fname, _body in corpus.emails_as_string():
        email_path = methods.add_slash(path_to_truth_dir) + fname
        # message_from_file consumes the whole file, so the handle can be
        # released immediately instead of leaking one per email.
        # NOTE(review): this strict utf-8 open has no errors='ignore',
        # unlike Corpus.emails_as_string -- confirm that is intended.
        with open(email_path, 'r', encoding='utf-8') as email_as_file:
            # Read email with EMAIL parser
            msg = email.message_from_file(email_as_file)
        self.extract_senders_list(msg, fname)
        self.check_subject(msg, fname)

    # Persist the collected dicts
    methods.generate_file_from_dict(self.path_bl, self.black_list)
    methods.generate_file_from_dict(self.path_wl, self.white_list)
    methods.generate_file_from_dict(self.path_ssl, self.spam_subject_list)
    methods.generate_file_from_dict(self.path_hsl, self.ham_subject_list)
def test(self, path_to_test_dir):
    """Classify every email in *path_to_test_dir* and write the predictions.

    Each email is labelled 'SPAM' or 'OK' by the first rule that matches:
    sender in the black list, sender in the white list, the two
    subject-list lookups, and otherwise the Bayesian score compared with
    0.485. The resulting ``{fname: prediction}`` dict is handed to
    ``BaseFilter(...).generate_prediction_file()``.

    Args:
        path_to_test_dir: Directory holding the emails to classify.
    """
    predictions = {}  # Predictions dict {fname: prediction}
    bs = Bayesian.Bayesian()
    corpus = Corpus(path_to_test_dir)

    # Load the lists from their files rather than from instance attributes
    # (the original note ties this to test/train call ordering).
    black_list_dict = methods.read_dict_from_file(self.path_bl)
    white_list_dict = methods.read_dict_from_file(self.path_wl)
    spam_subject_dict = methods.read_dict_from_file(self.path_ssl)
    ham_subject_dict = methods.read_dict_from_file(self.path_hsl)

    for fname, _body in corpus.emails_as_string():
        # Open email with parser; the handle is closed as soon as the
        # message has been parsed.
        email_path = methods.add_slash(path_to_test_dir) + fname
        with open(email_path, 'r', encoding='utf-8') as email_as_file:
            msg = email.message_from_file(email_as_file)

        # Extract the sender once instead of once per branch.
        sender = self.extract_email_adress_from_text(msg['From'])

        if sender in black_list_dict:
            # Sender is in the black list
            predictions[fname] = 'SPAM'
        elif sender in white_list_dict:
            # Sender is in the white list
            predictions[fname] = 'OK'
        # NOTE(review): the next two branches are meant to check the
        # subject, yet they look up the sender address in the subject
        # dicts. The key format written by check_subject is not visible
        # here, so the lookup is left unchanged -- confirm and fix.
        elif sender in spam_subject_dict:
            predictions[fname] = 'SPAM'
        elif sender in ham_subject_dict:
            predictions[fname] = 'OK'
        # Run Bayesian checker: scores above 0.485 are labelled SPAM.
        elif bs.bayesian_prediction(methods.get_text(msg)) > 0.485:
            predictions[fname] = 'SPAM'
        else:
            predictions[fname] = 'OK'

    # Generate prediction file
    bf = BaseFilter(path_to_test_dir, predictions)
    bf.generate_prediction_file()
def generate_prediction_file(self):
    """Write ``self.pred_dict`` to ``!prediction.txt`` in ``self.path_to_dir``.

    The file is truncated if it exists. Each entry becomes one line of the
    form ``<key> <value>``.
    """
    path_to_prediction_file = (
        methods.add_slash(self.path_to_dir) + '!prediction.txt')
    # The context manager guarantees the file is flushed and closed, even
    # if a write raises part-way through.
    with open(path_to_prediction_file, 'w+') as prediction_file:
        for name, label in self.pred_dict.items():
            prediction_file.write(name + " " + label + "\n")
def hams_as_string(self):
    """Yield ``[file_name, text]`` for every ham email in ``self.path_to_dir``.

    Files whose name starts with ``!`` are skipped, as are files for which
    ``is_ham(file_name)`` is false. Each remaining file is read as UTF-8.

    Yields:
        A two-element list ``[file_name, file_contents]``.
    """
    for file_name in os.listdir(self.path_to_dir):
        # Names starting with "!" are never yielded as emails.
        if file_name.startswith("!"):
            continue
        # is_ham is resolved from the enclosing scope, as in the original.
        if not is_ham(file_name):
            continue
        email_path = methods.add_slash(self.path_to_dir) + file_name
        # Read fully and close before yielding, so no handle stays open
        # while the generator is suspended.
        with io.open(email_path, 'r', encoding='utf-8') as body:
            text = body.read()
        yield [file_name, text]
def emails_as_string(self):
    """Yield ``[file_name, text]`` for each email file in ``self.path_to_dir``.

    Directory entries whose name starts with ``!`` are skipped. Files are
    decoded as UTF-8 with ``errors='ignore'``, so undecodable bytes are
    dropped rather than raising.

    Yields:
        A two-element list ``[file_name, file_contents]``.
    """
    for entry in os.listdir(self.path_to_dir):
        # Names starting with "!" are never yielded as emails.
        if entry.startswith("!"):
            continue
        email_path = methods.add_slash(self.path_to_dir) + entry
        with io.open(email_path, 'r', encoding='utf-8',
                     errors='ignore') as handle:
            yield [entry, handle.read()]