def setUp(self):
    """Create the analyzer under test and the canned data fed to the mocks."""
    self.subject = "dummySubject"
    self.body = "dummyBody"
    self.analyzer = EmailAnalyzer()

    # Token lists used as the mocked return value of "clean_text".
    self.clean_subject = ["best", "quick", "netco"]
    self.clean_body = ["prescription", "drug", "overview", "operations"]

    # Placeholder pairs used as the mocked return values of
    # "spam_ham_body_prob" and "subject_spam_ham_prob".
    zero_pair = (0, 0)
    self.spam_ham_body_prob_true = zero_pair
    self.subject_spam_ham_prob_true = zero_pair
    self.spam_ham_body_prob_false = zero_pair
    self.subject_spam_ham_prob_false = zero_pair

    # Vocabulary with per-word probabilities, used as the mocked return
    # value of "load_dict".
    self.vocab = {
        "spam_sub": dict.fromkeys(
            ("best", "online", "medicine", "here"), 1 / 4),
        "ham_sub": dict.fromkeys(("netco", "due", "diligence"), 1 / 3),
        "spam_body": dict.fromkeys(
            ("prescription", "drug", "simple", "quick", "affordable"), 1 / 5),
        "ham_body": dict.fromkeys(
            ("big", "pig", "met", "today", "overview", "operations"), 1 / 6),
    }

    # Expected probability values.
    self.spam_ham_body_prob_expected = zero_pair
    self.subject_spam_ham_prob_expected = zero_pair
def test_body_spam_ham_prob_Returns_expected_probability(
        self, mock_load_vocab):
    """Check the probability computed for the body against the expected value.

    "load_dict" must be mocked; ``mock_load_vocab`` is the injected mock
    (its patch decorator is not visible in this excerpt).  The expected
    probabilities are the ones defined by the TP1 assignment statement.
    """
    analyzer_under_test = EmailAnalyzer()
    mock_load_vocab.return_value = self.vocab
    computed = analyzer_under_test.body_spam_ham_prob(self.body_true)
    self.assertEqual(computed, self.body_spam_ham_prob_expected)
def test_is_spam_Returns_True_if_spam_prob_is_higher(
        self, mock_subject_spam_ham_prob, mock_body_spam_ham_prob,
        mock_clean_text):
    """``is_spam`` must return True when the spam probability is higher.

    "spam_ham_body_prob" and "subject_spam_ham_prob" are mocked; the mocked
    probabilities follow the TP1 assignment statement.
    """
    mock_subject_spam_ham_prob.return_value = self.subject_spam_ham_prob_true
    mock_body_spam_ham_prob.return_value = self.body_spam_ham_prob_true
    # ``return_value`` is a single slot shared by every call to the mock.
    # Assigning the subject tokens and then the body tokens back-to-back
    # only ever left the body tokens in place, so it is set exactly once.
    mock_clean_text.return_value = self.clean_body_true

    email_analyzer = EmailAnalyzer()
    self.assertEqual(
        email_analyzer.is_spam(self.subject_true, self.body_true), True)
def evaluate():
    """Score ``EmailAnalyzer.is_spam`` against the labelled mails in "200-mails.json".

    Prints a running counter for every e-mail, then the accuracy, precision
    and recall.  A metric whose denominator is zero is reported as 0.0
    instead of raising ``ZeroDivisionError``.

    Returns:
        bool: always True.
    """
    tp = tn = fp = fn = 0
    analyzer = EmailAnalyzer()
    with open("200-mails.json") as email_file:
        new_emails = json.load(email_file)

    for counter, e_mail in enumerate(new_emails["dataset"], start=1):
        print(counter)
        new_email = e_mail["mail"]
        subject = new_email["Subject"]
        body = new_email["Body"]
        spam = new_email["Spam"]

        # Classify once per e-mail; the result feeds exactly one counter.
        predicted_spam = bool(analyzer.is_spam(subject, body))
        if spam == "true":
            if predicted_spam:
                tp += 1
            else:
                fn += 1
        elif spam == "false":
            if predicted_spam:
                fp += 1
            else:
                tn += 1

    classified = tp + tn + fp + fn
    print("Accuracy: ", (tp + tn) / classified if classified else 0.0)
    print("Precision: ", tp / (tp + fp) if (tp + fp) else 0.0)
    print("Recall: ", tp / (tp + fn) if (tp + fn) else 0.0)
    return True
def evaluate(is_log_estimation, is_log_combination, clean_text_mode, k):
    """Score ``EmailAnalyzer.is_spam`` against the labelled mails in "test_set.json".

    All four arguments are forwarded unchanged to ``is_spam`` for every
    e-mail.  Progress is printed on a single line, followed by accuracy,
    precision and recall rounded to two decimals.  A metric whose
    denominator is zero is reported as 0.

    Returns:
        bool: always True.
    """
    tp = tn = fp = fn = 0
    analyzer = EmailAnalyzer()
    with open("test_set.json") as email_file:
        new_emails = json.load(email_file)

    dataset = new_emails["dataset"]
    email_count = len(dataset)
    print("Evaluating emails ")
    for i, e_mail in enumerate(dataset, start=1):
        print(f"\rEmail {i}/{email_count}", end="")
        new_email = e_mail["mail"]
        subject = new_email["Subject"]
        body = new_email["Body"]
        spam = new_email["Spam"]

        # Classify once per e-mail; the result feeds exactly one counter.
        predicted_spam = bool(analyzer.is_spam(
            subject, body, is_log_estimation, is_log_combination,
            clean_text_mode, k))
        if spam == "true":
            if predicted_spam:
                tp += 1
            else:
                fn += 1
        elif spam == "false":
            if predicted_spam:
                fp += 1
            else:
                tn += 1

    print("")
    classified = tp + tn + fp + fn
    if classified == 0:
        print("\nAccuracy: ", 0)
    else:
        print("\nAccuracy: ", round((tp + tn) / classified, 2))
    if tp + fp == 0:
        print("Precision: ", 0)
    else:
        print("Precision: ", round(tp / (tp + fp), 2))
    if tp + fn == 0:
        print("Recall: ", 0)
    else:
        print("Recall: ", round(tp / (tp + fn), 2))
    return True
def evaluate(is_log_estimation, is_log_combo, calculation_mode):
    """Score ``EmailAnalyzer.is_spam`` against the labelled mails in "200-mails.json".

    Args:
        is_log_estimation: forwarded to ``is_spam`` as its third argument.
        is_log_combo: forwarded to ``is_spam`` as its fourth argument.
        calculation_mode: accepted for interface compatibility; it is not
            forwarded, the last ``is_spam`` argument stays the literal 0.

    Returns:
        tuple: ``(accuracy, precision, recall)``; a metric whose
        denominator is zero is reported as 0.0.
    """
    tp = tn = fp = fn = 0
    analyzer = EmailAnalyzer()
    with open("200-mails.json") as email_file:
        new_emails = json.load(email_file)

    for counter, e_mail in enumerate(new_emails["dataset"], start=1):
        print(counter)
        new_email = e_mail["mail"]
        subject = new_email["Subject"]
        body = new_email["Body"]
        spam = new_email["Spam"]

        # One classification per e-mail with one consistent set of options,
        # so all four counters are derived from the same prediction.
        predicted_spam = bool(analyzer.is_spam(
            subject, body, is_log_estimation, is_log_combo, 0))
        if spam == "true":
            if predicted_spam:
                tp += 1
            else:
                fn += 1
        elif spam == "false":
            if predicted_spam:
                fp += 1
            else:
                tn += 1

    classified = tp + tn + fp + fn
    accuracy = (tp + tn) / classified if classified else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    return accuracy, precision, recall
def evaluate(estimation_option, combination_option):
    """Score ``EmailAnalyzer.is_spam_with_params`` on "test-emails.json".

    ``estimation_option`` and ``combination_option`` are forwarded to
    ``is_spam_with_params`` together with ``cleaning_mode`` and the
    constant 0.3.  Prints accuracy, precision and recall; a metric whose
    denominator is zero is reported as 0.0.

    NOTE(review): ``cleaning_mode`` is not a parameter of this function; it
    is presumably a module-level name defined elsewhere - confirm.

    Returns:
        bool: always True.
    """
    tp = tn = fp = fn = 0
    analyzer = EmailAnalyzer()
    with open("test-emails.json") as email_file:
        new_emails = json.load(email_file)

    for e_mail in new_emails["dataset"]:
        new_email = e_mail["mail"]
        subject = new_email["Subject"]
        body = new_email["Body"]
        spam = new_email["Spam"]

        # Classify once per e-mail; the result feeds exactly one counter.
        predicted_spam = bool(analyzer.is_spam_with_params(
            subject, body, estimation_option, combination_option,
            cleaning_mode, 0.3))
        if spam == "true":
            if predicted_spam:
                tp += 1
            else:
                fn += 1
        elif spam == "false":
            if predicted_spam:
                fp += 1
            else:
                tn += 1

    classified = tp + tn + fp + fn
    print("Accuracy: ", (tp + tn) / classified if classified else 0.0)
    print("Precision: ", tp / (tp + fp) if (tp + fp) else 0.0)
    print("Recall: ", tp / (tp + fn) if (tp + fn) else 0.0)
    return True
def test_is_spam_function_two_returns_true_vns_test_four(self):
    """VNS case 4: ``is_spam_function_two(False, 20, 65)`` must be truthy."""
    self.assertTrue(EmailAnalyzer.is_spam_function_two(False, 20, 65))
def test_is_spam_function_two_returns_true_vns_test_three(self):
    """VNS case 3: ``is_spam_function_two(True, 20, 80)`` must be truthy."""
    self.assertTrue(EmailAnalyzer.is_spam_function_two(True, 20, 80))
def test_is_spam_function_two_returns_false_pic_test_one(self):
    """PIC case 1: ``is_spam_function_two(False, 80, 65)`` must be falsy."""
    self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 80, 65))
def test_is_spam_function_one_returns_false_icc_test_twenty(self):
    """ICC case 20: checks ``is_spam_function_one(True, 35, 80, 60)``."""
    # NOTE(review): the name says "returns_false" but the assertion expects
    # a truthy result - confirm which one is intended.
    self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 35, 80, 60))
def test_is_spam_function_one_returns_true_icc_test_nineteen(self):
    """ICC case 19: ``is_spam_function_one(True, 35, 65, 60)`` must be truthy."""
    self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 35, 65, 60))
def test_is_spam_function_one_returns_false_icc_test_seventeen(self):
    """ICC case 17: ``is_spam_function_one(False, 35, 65, 80)`` must be falsy."""
    self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 65, 80))
class TestEmailAnalyzer(unittest.TestCase): def setUp(self): self.analyzer = EmailAnalyzer() self.subject_true = " no more outdated software ! upgrade !" self.body_true = "we get you the best deal ! skip the retail box and save !\namazing special # 1 :\nadobe - photoshop 7 premiere 7 illustrator 10 = only $ 120\namazing special # 2 :\nwindows xp professional + microsoft office xp professional = only $ 80\namazing special # 3 :\nadobe photoshop cs + adobe illustrator cs + adobe indesign cs\namazing special # 4 :\n" self.clean_subject_true = [ 'more', 'oudat', 'software', 'upgrade' ] # données pour mocker "return_value" du "clean_text" self.clean_body_true = [ 'get', 'best', 'deal', 'skip', 'retail', 'box', 'sav', 'amaz', 'special', 'adobe', 'photoshop', 'premiere', 'illustrator', 'only', 'windows', 'xp', 'professional', 'microsoft', 'office', 'cs', 'indesign' ] # données pour mocker "return_value" du "clean_text" self.subject_false = "re :" self.body_false = "we are using it for other things . 
mary joyce and robert have discussed with mcmahon and bowen .\n- - - - - original message - - - - -\nfrom : kitchen louise\nsent : monday december 10 2001 8 : 26 am\nto : oxley david\nsubject :\nwhat happens to the money in wachovia ?\nlouise kitchen\nchief operating \n" self.clean_subject_false = [ 're' ] # données pour mocker "return_value" du "clean_text" self.clean_body_false = [ 'us', 'other', 'thing', 'mary', 'joyce', 'robert', 'discuss', 'mcmahon', 'bowen', 'original', 'message', 'kitchen', 'louise', 'sent', 'monday', "december", 'oxley', 'david', 'subject', 'happen', 'money', 'wachovia' ] self.spam_ham_body_prob_true = ( 1, (1 / 6), ) # données pour mocker "return_value" du "spam_ham_body_prob" self.subject_spam_ham_prob_true = ( (2 / 3), (1 / 6), ) # données pour mocker "return_value" du "subject_spam_ham_prob" self.spam_ham_body_prob_false = ( (1 / 4), (2 / 6), ) # données pour mocker "return_value" du "spam_ham_body_prob" self.subject_spam_ham_prob_false = ( 0, (1 / 2), ) # données pour mocker "return_value" du "subject_spam_ham_prob" self.vocab = ( { "p_sub_spam": { "upgrade": 1 / 3, "software": 1 / 3 }, "p_sub_ham": { "re": 1 / 2, "annoucement": 1 / 6, "more": 1 / 6 }, "p_body_spam": { "best": 1 / 4, "deal": 1 / 4, "skip": 1 / 4, "special": 1 / 4, "money": 1 / 4 }, "p_body_ham": { "today": 1 / 6, "professional": 1 / 6, "meet": 1 / 6, "discuss": 1 / 6, "sent": 1 / 6 } } ) # vocabulaire avec les valeurs de la probabilité pour mocker "return_value" du "load_dict" # valeurs de la probabilité attendus : (0.5925*1/(256*pow(6,17))), (0.4075*1/pow(6,21)) self.spam_ham_body_prob_expected = (1.3673419333309543e-16, 1.8575963755415577e-17) # valeurs de la probabilité attendus : (0.5925*1/81, 0.4075*1/6*1/4*1/4*1/4) self.subject_spam_ham_prob_expected = (0.007314814814814815, 0.0010611979166666665) def tearDown(self): pass @patch("email_analyzer.EmailAnalyzer.clean_text") @patch("email_analyzer.EmailAnalyzer.spam_ham_body_prob") 
@patch("email_analyzer.EmailAnalyzer.spam_ham_subject_prob") def test_is_spam_Returns_True_if_spam_prob_is_higher( self, mock_subject_spam_ham_prob, mock_spam_ham_body_prob, mock_clean_text): mock_subject_spam_ham_prob.return_value = self.subject_spam_ham_prob_true mock_spam_ham_body_prob.return_value = self.spam_ham_body_prob_true return_val = self.analyzer.is_spam(self.subject_true, self.body_true) self.assertTrue(return_val) """ Il faut mocker les fonctions "spam_ham_body_prob" et "subject_spam_ham_prob". La sortie de la fonction doit être True si probabilité spam > probabilité ham (ces probabilites devron etre calcule selon l'enonce dans le TP1 ) """ @patch("email_analyzer.EmailAnalyzer.clean_text") @patch("email_analyzer.EmailAnalyzer.spam_ham_body_prob") @patch("email_analyzer.EmailAnalyzer.spam_ham_subject_prob") def test_is_spam_Returns_False_if_spam_prob_is_lower( self, mock_subject_spam_ham_prob, mock_spam_ham_body_prob, mock_clean_text): mock_subject_spam_ham_prob.return_value = self.subject_spam_ham_prob_false mock_spam_ham_body_prob.return_value = self.spam_ham_body_prob_false return_val = self.analyzer.is_spam(self.subject_false, self.body_false) self.assertFalse(return_val) """ Il faut mocker les fonctions "spam_ham_body_prob" et "subject_spam_ham_prob". 
La sortie de la fonction doit être False si probabilité spam probabilité ham (ces probabilites devron etre calcule selon l'enonce dans le TP1 ) """ @patch("email_analyzer.EmailAnalyzer.load_dict") def test_spam_ham_body_prob_Returns_expected_probability( self, mock_load_dict): mock_load_dict.return_value = self.vocab self.assertEqual( self.analyzer.spam_ham_body_prob(self.clean_body_true), self.spam_ham_body_prob_expected) """ Il faut mocker la fonction "load_dict" Il faut vérifier que probabilité est calculée correctement donné le "body" à l'entrée (ces probabilites devron etre calcule selon l'enonce dans le TP1 ) """ @patch("email_analyzer.EmailAnalyzer.load_dict") def test_subject_spam_ham_prob_Returns_expected_probability( self, mock_load_dict): mock_load_dict.return_value = self.vocab self.assertEqual( self.analyzer.spam_ham_subject_prob(self.clean_subject_true), self.subject_spam_ham_prob_expected) """
def test_is_spam_function_one_returns_false_acc_test_eight(self):
    """ACC case 8: ``is_spam_function_one(False, 20, 70, 80)`` must be falsy."""
    self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 20, 70, 80))
def test_is_spam_function_one_returns_true_acc_test_five(self):
    """ACC case 5: ``is_spam_function_one(True, 20, 50, 80)`` must be truthy."""
    self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 20, 50, 80))
class TestEmailAnalyzer(unittest.TestCase):
    """Tests for ``EmailAnalyzer``.

    Covers the logic-coverage cases written for ``is_spam_function_one`` and
    ``is_spam_function_two`` and the mocked tests of ``is_spam``,
    ``spam_ham_body_prob`` and ``subject_spam_ham_prob``.
    """

    def setUp(self):
        """Create the analyzer under test and the canned data fed to the mocks."""
        self.subject = "dummySubject"
        self.body = "dummyBody"
        self.analyzer = EmailAnalyzer()
        # Token lists used as the mocked return value of "clean_text".
        self.clean_subject = ["best", "quick", "netco"]
        self.clean_body = ["prescription", "drug", "overview", "operations"]
        # Placeholder pairs for the mocked "spam_ham_body_prob" and
        # "subject_spam_ham_prob" return values.
        self.spam_ham_body_prob_true = (0, 0)
        self.subject_spam_ham_prob_true = (0, 0)
        self.spam_ham_body_prob_false = (0, 0)
        self.subject_spam_ham_prob_false = (0, 0)
        # Vocabulary with per-word probabilities, used as the mocked return
        # value of "load_dict".
        self.vocab = {
            "spam_sub": {
                "best": 1 / 4,
                "online": 1 / 4,
                "medicine": 1 / 4,
                "here": 1 / 4,
            },
            "ham_sub": {
                "netco": 1 / 3,
                "due": 1 / 3,
                "diligence": 1 / 3,
            },
            "spam_body": {
                "prescription": 1 / 5,
                "drug": 1 / 5,
                "simple": 1 / 5,
                "quick": 1 / 5,
                "affordable": 1 / 5,
            },
            "ham_body": {
                "big": 1 / 6,
                "pig": 1 / 6,
                "met": 1 / 6,
                "today": 1 / 6,
                "overview": 1 / 6,
                "operations": 1 / 6,
            },
        }
        # Expected probability values.
        self.spam_ham_body_prob_expected = 0, 0
        self.subject_spam_ham_prob_expected = 0, 0

    def tearDown(self):
        pass

    # --- Active clause coverage (ACC) tests -------------------------------

    def test_is_spam_function_one_returns_true_acc_test_one(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 35, 65, 60))

    def test_is_spam_function_one_returns_false_acc_test_two(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(True, 35, 65, 80))

    def test_is_spam_function_one_returns_true_acc_test_three(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 20, 50, 50))

    def test_is_spam_function_one_returns_false_acc_test_four(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 50, 50))

    def test_is_spam_function_one_returns_true_acc_test_five(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 20, 50, 80))

    def test_is_spam_function_one_returns_false_acc_test_six(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(True, 20, 70, 80))

    def test_is_spam_function_one_returns_true_acc_test_seven(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 20, 70, 50))

    def test_is_spam_function_one_returns_false_acc_test_eight(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 20, 70, 80))

    def test_is_spam_function_one_returns_false_acc_test_nine(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 20, 76, 50))

    # --- Inactive clause coverage (ICC) tests -----------------------------
    # NOTE(review): four ICC tests below are named "returns_false" but
    # assert a truthy result; each is flagged where it occurs.

    # P is the major clause.

    def test_is_spam_function_one_returns_false_icc_test_one(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(True, 35, 65, 80))

    def test_is_spam_function_one_returns_false_icc_test_two(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 65, 80))

    def test_is_spam_function_one_returns_true_icc_test_three(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 15, 65, 50))

    def test_is_spam_function_one_returns_false_icc_test_four(self):
        # NOTE(review): name says "returns_false", assertion expects truthy.
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 15, 65, 50))

    # H is the major clause.

    def test_is_spam_function_one_returns_true_icc_test_five(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 20, 65, 60))

    def test_is_spam_function_one_returns_true_icc_test_six(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 15, 65, 60))

    def test_is_spam_function_one_returns_false_icc_test_seven(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 40, 65, 80))

    def test_is_spam_function_one_returns_false_icc_test_eight(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 15, 65, 80))

    # T1 is the major clause.

    def test_is_spam_function_one_returns_false_icc_test_nine(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 65, 80))

    def test_is_spam_function_one_returns_false_icc_test_ten(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 50, 80))

    def test_is_spam_function_one_returns_true_icc_test_eleven(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 20, 50, 60))

    def test_is_spam_function_one_returns_false_icc_test_twelve(self):
        # NOTE(review): name says "returns_false", assertion expects truthy.
        self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 20, 65, 60))

    # T2 is the major clause.

    def test_is_spam_function_one_returns_false_icc_test_thirteen(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 65, 80))

    def test_is_spam_function_one_returns_false_icc_test_fourteen(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 65, 50))

    def test_is_spam_function_one_returns_true_icc_test_fifteen(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 20, 50, 80))

    def test_is_spam_function_one_returns_false_icc_test_sixteen(self):
        # NOTE(review): name says "returns_false", assertion expects truthy.
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 20, 50, 50))

    # T3 is the major clause.

    def test_is_spam_function_one_returns_false_icc_test_seventeen(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 65, 80))

    def test_is_spam_function_one_returns_false_icc_test_eighteen(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 80, 80))

    def test_is_spam_function_one_returns_true_icc_test_nineteen(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 35, 65, 60))

    def test_is_spam_function_one_returns_false_icc_test_twenty(self):
        # NOTE(review): name says "returns_false", assertion expects truthy.
        self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 35, 80, 60))

    # --- IC criterion -----------------------------------------------------

    def test_is_spam_function_two_returns_false_ic_test_one(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 80, 80))

    def test_is_spam_function_two_returns_true_ic_test_two(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_two(True, 20, 65))

    # --- PIC criterion ----------------------------------------------------

    def test_is_spam_function_two_returns_false_pic_test_one(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 80, 65))

    def test_is_spam_function_two_returns_false_pic_test_two(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 20, 80))

    def test_is_spam_function_two_returns_true_pic_test_three(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_two(True, 80, 65))

    def test_is_spam_function_two_returns_true_pic_test_four(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_two(False, 20, 50))

    # --- VNS criterion ----------------------------------------------------

    def test_is_spam_function_two_returns_true_vns_test_one(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_two(True, 80, 80))

    def test_is_spam_function_two_returns_true_vns_test_two(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_two(True, 80, 65))

    def test_is_spam_function_two_returns_true_vns_test_three(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_two(True, 20, 80))

    def test_is_spam_function_two_returns_true_vns_test_four(self):
        self.assertTrue(EmailAnalyzer.is_spam_function_two(False, 20, 65))

    def test_is_spam_function_two_returns_false_vns_test_five(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 20, 80))

    def test_is_spam_function_two_returns_false_vns_test_six(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 80, 60))

    def test_is_spam_function_two_returns_false_vns_test_seven(self):
        self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 80, 80))

    # --- Mocked tests of is_spam and the probability functions ------------

    # Patches are applied bottom-up: the decorator closest to ``def``
    # supplies the first mock argument, so the order below matches the
    # parameter order.
    @patch("email_analyzer.EmailAnalyzer.spam_ham_body_prob")
    @patch("email_analyzer.EmailAnalyzer.subject_spam_ham_prob")
    def test_is_spam_Returns_True_if_spam_prob_is_higher(
        self, mock_subject_spam_ham_prob, mock_spam_ham_body_prob
    ):
        """``is_spam`` must return True when spam probability > ham probability.

        "spam_ham_body_prob" and "subject_spam_ham_prob" are mocked (the
        probabilities themselves are defined by the TP1 assignment statement).
        """
        mock_subject_spam_ham_prob.return_value = (10, 0)
        mock_spam_ham_body_prob.return_value = (10, 0)
        self.assertTrue(self.analyzer.is_spam("dummySubject", "dummyBody"))

    @patch("email_analyzer.EmailAnalyzer.spam_ham_body_prob")
    @patch("email_analyzer.EmailAnalyzer.subject_spam_ham_prob")
    def test_is_spam_Returns_False_if_spam_prob_is_lower(
        self, mock_subject_spam_ham_prob, mock_spam_ham_body_prob
    ):
        """``is_spam`` must return False when spam probability < ham probability.

        "spam_ham_body_prob" and "subject_spam_ham_prob" are mocked (the
        probabilities themselves are defined by the TP1 assignment statement).
        """
        mock_subject_spam_ham_prob.return_value = (0, 10)
        mock_spam_ham_body_prob.return_value = (0, 10)
        self.assertFalse(self.analyzer.is_spam("dummySubject", "dummyBody"))

    @patch("email_analyzer.EmailAnalyzer.clean_text")
    @patch("email_analyzer.EmailAnalyzer.calculate_ham_divided_by_email")
    @patch("email_analyzer.EmailAnalyzer.calculate_spam_divided_by_email")
    @patch("email_analyzer.EmailAnalyzer.load_dict")
    def test_spam_ham_body_prob_Returns_expected_probability(
        self,
        mock_load_dict,
        mock_calculate_spam_divided_by_email,
        mock_calculate_ham_divided_by_email,
        mock_clean_text,
    ):
        """The body probabilities must match the values derived from the vocabulary.

        "load_dict" is mocked; the expected probabilities follow the TP1
        assignment statement.
        """
        mock_load_dict.return_value = self.vocab
        mock_calculate_ham_divided_by_email.return_value = 1 / 2
        mock_calculate_spam_divided_by_email.return_value = 1 / 2
        mock_clean_text.return_value = self.clean_body

        spam_prob, ham_prob = self.analyzer.spam_ham_body_prob(self.body)
        # Products of floats: compare approximately so the test does not
        # depend on the order in which the implementation multiplies.
        self.assertAlmostEqual(spam_prob, 0.5 * 0.2 * 0.2)
        self.assertAlmostEqual(ham_prob, 1 / 2 * 1 / 6 * 1 / 6)

    @patch("email_analyzer.EmailAnalyzer.clean_text")
    @patch("email_analyzer.EmailAnalyzer.calculate_ham_divided_by_email")
    @patch("email_analyzer.EmailAnalyzer.calculate_spam_divided_by_email")
    @patch("email_analyzer.EmailAnalyzer.load_dict")
    def test_subject_spam_ham_prob_Returns_expected_probability(
        self,
        mock_load_dict,
        mock_calculate_spam_divided_by_email,
        mock_calculate_ham_divided_by_email,
        mock_clean_text,
    ):
        """The subject probabilities must match the values derived from the vocabulary.

        "load_dict" is mocked; the expected probabilities follow the TP1
        assignment statement.
        """
        mock_load_dict.return_value = self.vocab
        mock_calculate_ham_divided_by_email.return_value = 1 / 2
        mock_calculate_spam_divided_by_email.return_value = 1 / 2
        mock_clean_text.return_value = self.clean_subject

        spam_prob, ham_prob = self.analyzer.subject_spam_ham_prob(self.subject)
        self.assertAlmostEqual(spam_prob, 0.5 * 0.25)
        self.assertAlmostEqual(ham_prob, 0.5 * 1 / 3)
def test_is_spam_function_two_returns_false_vns_test_seven(self):
    """VNS case 7: ``is_spam_function_two(False, 80, 80)`` must be falsy."""
    self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 80, 80))
def __init__(self):
    """Record the e-mail data file name and create the CRUD and EmailAnalyzer helpers."""
    self.email_file = "train_set.json"
    # Constructed left to right: CRUD first, then EmailAnalyzer.
    self.crud, self.e_mail = CRUD(), EmailAnalyzer()
# Collect the path of every non-hidden file found below ``rootdir``.
for dirpath, subdirs, files in os.walk(rootdir):
    for file in files:
        if not file.startswith('.'):  # Ignore hidden files
            email_filenames.append(os.path.join(dirpath, file))

# Call the parser on each file and print the normalized result, to check
# whether the number of elements is the same.
for filename in email_filenames:
    with open(filename, "r") as f:
        data = f.read()
    email_df = json_normalize(json.loads(EmailAnalyzer().parse(data)))
    print(email_df)

# Iterate through the files again and populate a larger dict keyed by
# file path.
dic_tmp = {}
for key_name in email_filenames:
    with open(key_name, "r") as f:
        data2 = f.read()
    data2_parsed = EmailAnalyzer().parse(data2)
    dic_tmp[key_name] = data2_parsed
def setUp(self):
    """Create the analyzer under test and the canned spam/ham fixtures."""
    self.analyzer = EmailAnalyzer()

    # Spam example: raw subject/body and the token lists used as the mocked
    # return value of "clean_text".
    self.subject_true = " no more outdated software ! upgrade !"
    self.body_true = (
        "we get you the best deal ! skip the retail box and save !\n"
        "amazing special # 1 :\n"
        "adobe - photoshop 7 premiere 7 illustrator 10 = only $ 120\n"
        "amazing special # 2 :\n"
        "windows xp professional + microsoft office xp professional = only $ 80\n"
        "amazing special # 3 :\n"
        "adobe photoshop cs + adobe illustrator cs + adobe indesign cs\n"
        "amazing special # 4 :\n"
    )
    self.clean_subject_true = "more oudat software upgrade".split()
    self.clean_body_true = (
        "get best deal skip retail box sav amaz special adobe photoshop "
        "premiere illustrator only windows xp professional microsoft "
        "office cs indesign"
    ).split()

    # Ham example: raw subject/body and the matching token lists.
    self.subject_false = "re :"
    self.body_false = (
        "we are using it for other things . mary joyce and robert have discussed with mcmahon and bowen .\n"
        "- - - - - original message - - - - -\n"
        "from : kitchen louise\n"
        "sent : monday december 10 2001 8 : 26 am\n"
        "to : oxley david\n"
        "subject :\n"
        "what happens to the money in wachovia ?\n"
        "louise kitchen\n"
        "chief operating \n"
    )
    self.clean_subject_false = ["re"]
    self.clean_body_false = (
        "us other thing mary joyce robert discuss mcmahon bowen original "
        "message kitchen louise sent monday december oxley david subject "
        "happen money wachovia"
    ).split()

    # Pairs used as the mocked return values of "spam_ham_body_prob" and
    # "subject_spam_ham_prob".
    self.spam_ham_body_prob_true = (1, 1 / 6)
    self.subject_spam_ham_prob_true = (2 / 3, 1 / 6)
    self.spam_ham_body_prob_false = (1 / 4, 2 / 6)
    self.subject_spam_ham_prob_false = (0, 1 / 2)

    # Vocabulary with per-word probabilities, used as the mocked return
    # value of "load_dict".
    self.vocab = {
        "p_sub_spam": {"upgrade": 1 / 3, "software": 1 / 3},
        "p_sub_ham": {"re": 1 / 2, "annoucement": 1 / 6, "more": 1 / 6},
        "p_body_spam": dict.fromkeys(
            ("best", "deal", "skip", "special", "money"), 1 / 4),
        "p_body_ham": dict.fromkeys(
            ("today", "professional", "meet", "discuss", "sent"), 1 / 6),
    }

    # Expected probabilities:
    # (0.5925*1/(256*pow(6,17))), (0.4075*1/pow(6,21))
    self.spam_ham_body_prob_expected = (
        1.3673419333309543e-16, 1.8575963755415577e-17)
    # Expected probabilities: (0.5925*1/81, 0.4075*1/6*1/4*1/4*1/4)
    self.subject_spam_ham_prob_expected = (
        0.007314814814814815, 0.0010611979166666665)
class RENEGE:
    """Spam filtering driver built on the CRUD store and EmailAnalyzer."""

    def __init__(self):
        self.email_file = "800-mails.json"
        self.crud = CRUD()
        self.e_mail = EmailAnalyzer()

    def calculate_user_trust(self, user_id):
        """Compute the trust value of a user, clamped to the range [0, 100].

        ``trust1`` is derived from the user's first/last seen message dates
        and ham/spam counts, ``trust2`` is the mean "Trust" of the user's
        groups, and the result is their average unless one of the two
        overrides below applies.

        NOTE(review): a user with no groups, no messages counted, or a zero
        first-seen date makes a division below raise ZeroDivisionError -
        confirm whether callers guarantee this cannot happen.
        """
        # Read the user's record from the CRUD store.
        first_seen = self.crud.get_user_data(
            user_id, "Date_of_first_seen_message")
        last_seen = self.crud.get_user_data(
            user_id, "Date_of_last_seen_message")
        n_ham = self.crud.get_user_data(user_id, "HamN")
        n_spam = self.crud.get_user_data(user_id, "SpamN")
        groups = self.crud.get_user_data(user_id, "Groups")

        # Sum of the trust values of all the user's groups.
        sum_trust = 0
        for group in groups:
            group_id = self.crud.get_group_id(group)
            sum_trust += self.crud.get_group_data(group_id, 'Trust')

        trust1 = (last_seen * n_ham) / (first_seen * (n_ham + n_spam))
        trust2 = sum_trust / len(groups)
        trust = (trust1 + trust2) / 2

        # Overrides: a low group trust wins over the average, and a very
        # high trust1 saturates the result.
        if trust2 < 50:
            trust = trust2
        if trust1 > 100:
            trust = 100

        # Keep the returned value between 0 and 100.
        if trust < 0:
            trust = 0
        elif trust > 100:
            trust = 100
        return trust

    def classify_emails(self, calculation_mode, is_log_est, is_log_combo):
        """Start the analysis of the e-mails returned by ``self.get_email()``.

        Returns:
            bool: True on success.

        Raises:
            Exception: if processing fails; the original error is attached
            as ``__cause__`` so the failure reason is not lost.
        """
        try:
            self.process_email(
                self.get_email(), calculation_mode, is_log_est, is_log_combo)
            return True
        except Exception as err:
            raise Exception(f"classify_emails failed: {err}") from err

    def process_email(self, new_emails, calculation_mode, is_log_est,
                      is_log_combo):
        """Analyse every e-mail of ``new_emails["dataset"]``.

        Adds unknown senders, updates known ones, classifies the e-mail with
        the function selected by ``calculation_mode`` (1, 2 or 0) and stores
        the resulting spam flag on the user.

        Returns:
            bool: True once every e-mail has been processed.
        """
        for email in new_emails["dataset"]:
            mail = email['mail']
            email_adr = mail['From']
            mail_date = mail['Date']
            spam = mail['Spam'] == 'true'
            subject = mail['Subject']
            body = mail['Body']

            user_id = self.crud.get_user_id(email_adr)
            if user_id:
                self.update_user_info(email_adr, mail_date, spam)
            else:
                self.add_user_info(email_adr, mail_date)
                # The sender was just created: fetch the new id so the
                # lookups below do not use the stale "not found" value.
                user_id = self.crud.get_user_id(email_adr)

            # Trust of the user.
            trust = self.crud.get_user_data(user_id, "Trust")

            # Average trust over the groups.
            # NOTE(review): this compares each group's member list with the
            # user's group list and divides by the number of all groups -
            # confirm this is the intended average.
            user_group = self.crud.get_user_data(user_id, "Groups")
            sum_trust = 0
            groups = self.crud.read_groups_file()
            for group in groups:
                if group['List_of_members'] == user_group:
                    sum_trust += group['Trust']
            avg_group_trust = sum_trust / len(groups)

            # Days between the last and the first seen message; looked up
            # by user id like every other get_user_data call.
            user_activity = self.substract_dates(
                self.crud.get_user_data(user_id, "Date_of_last_seen_message"),
                self.crud.get_user_data(user_id, "Date_of_first_seen_message"))

            if calculation_mode == 1:
                spam = self.e_mail.is_spam_function_one(
                    spam, user_activity, trust, avg_group_trust)
            elif calculation_mode == 2:
                spam = self.e_mail.is_spam_function_two(
                    spam, trust, avg_group_trust)
            elif calculation_mode == 0:
                spam = self.e_mail.is_spam(
                    subject, body, is_log_est, is_log_combo, 0)

            # Update the user with the new spam value.
            self.update_user_info(email_adr, mail_date, spam)
        return True

    def substract_dates(self, last_seen_msg, first_seen_msg):
        """Return the number of days from ``first_seen_msg`` to ``last_seen_msg``.

        Both arguments are converted with ``str()`` and parsed as
        "year-month-day".  The result is negative when the last date
        precedes the first, and 0 when both dates are equal.
        """
        def _to_date(value):
            parts = str(value).split("-")
            return date(int(parts[0]), int(parts[1]), int(parts[2]))

        # ``timedelta.days`` is already an int; parsing str(timedelta)
        # breaks on a zero-day difference ("0:00:00").
        return (_to_date(last_seen_msg) - _to_date(first_seen_msg)).days
def test_is_spam_function_one_returns_false_acc_test_two(self):
    """ACC case 2: ``is_spam_function_one(True, 35, 65, 80)`` must be falsy."""
    self.assertFalse(EmailAnalyzer.is_spam_function_one(True, 35, 65, 80))
def test_is_spam_function_one_returns_true_icc_test_three(self):
    """ICC case 3: ``is_spam_function_one(False, 15, 65, 50)`` must be truthy."""
    self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 15, 65, 50))
def test_is_spam_function_one_returns_false_acc_test_four(self):
    """ACC case 4: ``is_spam_function_one(False, 35, 50, 50)`` must be falsy."""
    self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 50, 50))
def test_is_spam_function_one_returns_false_icc_test_four(self):
    """ICC case 4: checks ``is_spam_function_one(True, 15, 65, 50)``."""
    # NOTE(review): the name says "returns_false" but the assertion expects
    # a truthy result - confirm which one is intended.
    self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 15, 65, 50))
def test_is_spam_function_one_returns_true_acc_test_seven(self):
    """ACC case 7: ``is_spam_function_one(False, 20, 70, 50)`` must be truthy."""
    self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 20, 70, 50))
def test_is_spam_function_one_returns_false_icc_test_twelve(self):
    """ICC case 12: checks ``is_spam_function_one(False, 20, 65, 60)``."""
    # NOTE(review): the name says "returns_false" but the assertion expects
    # a truthy result - confirm which one is intended.
    self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 20, 65, 60))
def __init__(self):
    """Record the e-mail data file name and create the CRUD and EmailAnalyzer helpers."""
    self.email_file = "800-mails.json"
    # Constructed left to right: CRUD first, then EmailAnalyzer.
    self.crud, self.e_mail = CRUD(), EmailAnalyzer()
def test_is_spam_function_one_returns_false_icc_test_sixteen(self):
    """ICC case 16: checks ``is_spam_function_one(True, 20, 50, 50)``."""
    # NOTE(review): the name says "returns_false" but the assertion expects
    # a truthy result - confirm which one is intended.
    self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 20, 50, 50))