Example #1
0
def compute_quality_for_corpus(corpus_dir):
        """Score the predictions stored in a corpus directory.

        Reads "!truth.txt" and "!prediction.txt" from corpus_dir, feeds both
        classifications into a SPAM/OK BinaryConfusionMatrix and returns the
        tuple (quality, tp, tn, fp, fn), where quality is the result of
        quality_score(tp, tn, fp, fn).
        """
        def _load(file_name):
                # Both classification files live directly inside corpus_dir.
                return methods.read_classification_from_file(
                        methods.add_slash(corpus_dir) + file_name)

        expected = _load("!truth.txt")
        predicted = _load("!prediction.txt")

        matrix = BinaryConfusionMatrix('SPAM', 'OK')
        matrix.compute_from_dicts(expected, predicted)
        counts = matrix.as_dict()

        fn, tn, fp, tp = (counts[key] for key in ('fn', 'tn', 'fp', 'tp'))
        return quality_score(tp, tn, fp, fn), tp, tn, fp, fn
Example #2
0
        def train(self,path_to_truth_dir):
                """Train the filter on a labelled corpus directory.

                Loads the classification from "!truth.txt" in path_to_truth_dir
                into self.truth, parses every email of the corpus and hands it
                to extract_senders_list and check_subject (which presumably
                fill the sender and subject lists -- confirm in those methods).
                Finally writes the four lists to the files configured on the
                instance (path_bl, path_wl, path_ssl, path_hsl).
                """
                corpus = Corpus(path_to_truth_dir)
                #Read truth file and keep it on the instance for the helpers
                self.truth = methods.read_classification_from_file(methods.add_slash(path_to_truth_dir)+"!truth.txt")
                # Only the file name is needed: the email is re-read through the
                # EMAIL parser instead of using the body yielded by the corpus.
                for fname, _body in corpus.emails_as_string():
                        # The context manager closes each file as soon as it has
                        # been parsed, so handles do not pile up on large corpora.
                        with open(methods.add_slash(path_to_truth_dir) + fname,'r',encoding = 'utf-8') as email_as_file:
                                #Read email with EMAIL parser
                                msg = email.message_from_file(email_as_file)
                        self.extract_senders_list(msg,fname)
                        self.check_subject(msg,fname)

                #Generate dict's
                methods.generate_file_from_dict(self.path_bl , self.black_list)
                methods.generate_file_from_dict(self.path_wl ,self.white_list)
                methods.generate_file_from_dict(self.path_ssl , self.spam_subject_list)
                methods.generate_file_from_dict(self.path_hsl ,self.ham_subject_list)
Example #3
0
        def test(self, path_to_test_dir):
                """Classify every email in path_to_test_dir and write the predictions.

                For each email the sender address is looked up, in order, in the
                black list, the white list, the spam-subject dict and the
                ham-subject dict; the first hit decides 'SPAM' or 'OK'. When
                nothing matches, the Bayesian score of the message text is
                compared against 0.485. The resulting {fname: prediction} dict
                is written out through BaseFilter.generate_prediction_file().
                """
                predictions = {} #Predictions dict {fname:prediction}
                bs = Bayesian.Bayesian()
                corpus = Corpus(path_to_test_dir)
                #Read dict's (if test called before train)
                black_list_dict = methods.read_dict_from_file(self.path_bl)
                white_list_dict = methods.read_dict_from_file(self.path_wl)
                spam_subject_dict = methods.read_dict_from_file(self.path_ssl)
                ham_subject_dict = methods.read_dict_from_file(self.path_hsl)

                for fname, _body in corpus.emails_as_string():
                        #Open email with parser; close the file right after parsing
                        with open(methods.add_slash(path_to_test_dir) + fname,'r',encoding = 'utf-8') as email_as_file:
                                msg = email.message_from_file(email_as_file)

                        # Extract the sender once instead of once per branch.
                        sender = self.extract_email_adress_from_text(msg['From'])

                        #Check if sender in a black list
                        if sender in black_list_dict:
                                predictions[fname] = 'SPAM'
                        #Check if sender in a white list
                        elif sender in white_list_dict:
                                predictions[fname] = 'OK'
                        # NOTE(review): the two subject dicts are queried with the
                        # sender address, not the subject -- confirm against
                        # check_subject whether msg['Subject'] was intended.
                        elif sender in spam_subject_dict:
                                predictions[fname] = 'SPAM'
                        elif sender in ham_subject_dict:
                                predictions[fname] = 'OK'
                        #Run Bayesian checker
                        elif bs.bayesian_prediction(methods.get_text(msg)) > 0.485:
                                predictions[fname] = 'SPAM'
                        else:
                                predictions[fname] = 'OK'

                #Generate prediction file
                bf = BaseFilter(path_to_test_dir,predictions)
                bf.generate_prediction_file()
Example #4
0
 def generate_prediction_file(self):
     """Write self.pred_dict to "!prediction.txt" inside self.path_to_dir.

     Each entry becomes one line of the form "<key> <value>". The file is
     opened in 'w+' mode, so an existing prediction file is overwritten.
     """
     path_to_prediction_file = methods.add_slash(self.path_to_dir)+'!prediction.txt'
     # The context manager guarantees the file is flushed and closed, even
     # if a write fails part-way through.
     with open(path_to_prediction_file, 'w+') as prediction_file:
         for fname, prediction in self.pred_dict.items():
             prediction_file.write(fname + " " + prediction + "\n")
 def hams_as_string(self):
         """Yield [file_name, content] for every ham email in self.path_to_dir.

         Files whose name starts with "!" are skipped, and so are files for
         which is_ham(file_name) is falsy. Each remaining file is read as
         UTF-8 text.
         """
         # `self` was missing from the signature although the body reads
         # self.path_to_dir, so the method could not work on an instance.
         for file_name in os.listdir(self.path_to_dir):
                 if not file_name.startswith("!"):
                         # NOTE(review): is_ham is not defined in the visible code;
                         # presumably a module-level helper -- confirm.
                         if (is_ham(file_name)):
                                 with io.open(methods.add_slash(self.path_to_dir)+file_name,'r', encoding ='utf-8') as body:
                                         yield[file_name,body.read()]
Example #6
0
 def emails_as_string(self):
         """Yield [file_name, content] for each email file in self.path_to_dir.

         Entries whose name starts with "!" are skipped. Files are decoded as
         UTF-8 and undecodable bytes are dropped (errors='ignore').
         """
         for entry in os.listdir(self.path_to_dir):
                 if entry.startswith("!"):
                         continue
                 full_path = methods.add_slash(self.path_to_dir) + entry
                 with io.open(full_path, 'r', encoding='utf-8', errors='ignore') as handle:
                         contents = handle.read()
                         yield [entry, contents]