class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    def __init__(self):
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.words = VocabularyCreator()

    @staticmethod
    def is_spam_function_one(is_msg_spam, user_historic_in_days, user_trust,
                             user_group_trust):
        """Combine the message verdict with user history and trust levels.

        Returns the value of ``P and (H and T1 or T2) or H and T2 and not T3``
        where P is ``is_msg_spam``, H is "history shorter than 30 days",
        T1 is "user trust below 60", T2 is "group trust below 70" and
        T3 is "user trust above 75".
        """
        p = is_msg_spam
        h = user_historic_in_days < 30
        t1 = user_trust < 60
        t2 = user_group_trust < 70
        t3 = user_trust > 75
        # Parentheses only spell out Python's and/or precedence.
        return (p and ((h and t1) or t2)) or (h and t2 and not t3)

    @staticmethod
    def is_spam_function_two(is_msg_spam, user_trust, user_group_trust):
        """Return ``P or (not T3 and T2)``.

        P is ``is_msg_spam``, T2 is "group trust below 70" and T3 is
        "user trust above 75".
        """
        p = is_msg_spam
        t2 = user_group_trust < 70
        t3 = user_trust > 75
        return p or (not t3 and t2)

    def is_spam(self, subject_orig, body_orig, isLogEstimation,
                isLogCombination, k):
        """Decide whether an e-mail is spam.

        Args:
            subject_orig: raw subject text.
            body_orig: raw body text.
            isLogEstimation: when true, use the log10 based estimators.
            isLogCombination: when true, apply log10 to every strictly
                positive estimate before combining them.
            k: weight of the subject in the combination; the body gets
                ``1 - k``. Values above 1 are scaled down and negative
                values are replaced by 0.

        Returns:
            True when the combined spam score is strictly greater than the
            combined ham score, False otherwise.
        """
        # Share of spam / ham e-mails in the training set.
        pSpam = self.calculate_spam_divided_by_email()
        pHam = self.calculate_ham_divided_by_email()

        if isLogEstimation:
            pSpamSubject, pHamSubject = self.subject_spam_ham_log_prob(
                subject_orig, pSpam, pHam)
            # The body must go through the body estimator (the subject
            # estimator was called here by mistake).
            pSpamBody, pHamBody = self.spam_ham_body_log_prob(
                body_orig, pSpam, pHam)
            estimates = [
                math.log10(pSpam) + pSpamSubject,
                math.log10(pHam) + pHamSubject,
                math.log10(pSpam) + pSpamBody,
                math.log10(pHam) + pHamBody,
            ]
        else:
            pSpamSubject, pHamSubject = self.subject_spam_ham_prob(
                subject_orig)
            pSpamBody, pHamBody = self.spam_ham_body_prob(body_orig)
            estimates = [
                pSpam * pSpamSubject,
                pHam * pHamSubject,
                pSpam * pSpamBody,
                pHam * pHamBody,
            ]

        if isLogCombination:
            # log10 is only defined for strictly positive values; any other
            # estimate is kept unchanged.
            estimates = [
                math.log10(value) if value > 0 else value
                for value in estimates
            ]

        # Bring k back into [0, 1]: a value above 1 is divided by 10 raised
        # to the length of its string representation.
        if k > 1:
            k = k / math.pow(10, len(str(k)))
        elif k < 0:
            k = 0

        spam_subject, ham_subject, spam_body, ham_body = estimates
        # The combination formula is the same with or without logarithms.
        combinationpSpam = k * spam_subject + (1 - k) * spam_body
        combinationpHam = k * ham_subject + (1 - k) * ham_body
        return combinationpSpam > combinationpHam

    def _section_log_prob(self, text, pSpam, pHam, spam_key, ham_key):
        """Shared implementation of the two log10 estimators.

        Starting from the priors, adds the vocabulary value of every
        cleaned word found in the ``spam_key`` / ``ham_key`` tables, then
        takes log10 of both totals.
        """
        vocabulary = self.load_dict()
        # Build the lookup tables once instead of once per word.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])

        spam_total = pSpam
        ham_total = pHam
        for word in self.clean_text(text):
            if word in spam_words:
                spam_total += spam_words[word]
            if word in ham_words:
                ham_total += ham_words[word]

        ham_log = math.log10(ham_total)
        spam_log = math.log10(spam_total)

        # NOTE(review): the priors are compared with the values obtained
        # AFTER log10, as the body estimator has always done. This looks
        # like it was meant to detect "no word matched" - confirm.
        if pSpam == spam_log:
            spam_log = 0
        elif pHam == ham_log:
            ham_log = 0
        return spam_log, ham_log

    def subject_spam_ham_log_prob(self, subject, pSpam, pHam):
        """Return the (spam, ham) log10 estimates for the subject.

        Uses the ``spam_sub`` / ``ham_sub`` tables, like
        ``subject_spam_ham_prob``. Previously this method read the body
        tables and returned undefined variables.
        """
        return self._section_log_prob(subject, pSpam, pHam,
                                      'spam_sub', 'ham_sub')

    def spam_ham_body_log_prob(self, body, pSpam, pHam):
        """Return the (spam, ham) log10 estimates for the body."""
        return self._section_log_prob(body, pSpam, pHam,
                                      'spam_body', 'ham_body')

    def _section_prob(self, text, spam_key, ham_key):
        """Shared implementation of the two multiplicative estimators.

        Starting from the priors, multiplies by the vocabulary value of
        every cleaned word found in the ``spam_key`` / ``ham_key`` tables.
        """
        vocabulary = self.load_dict()
        pSpam = self.calculate_spam_divided_by_email()
        pHam = self.calculate_ham_divided_by_email()
        # Build the lookup tables once instead of once per word.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])

        spam_prob = pSpam
        ham_prob = pHam
        for word in self.clean_text(text):
            if word in spam_words:
                spam_prob *= spam_words[word]
            if word in ham_words:
                ham_prob *= ham_words[word]

        # A score still equal to its prior is reset to 0. Because of the
        # elif, only the spam score is reset when both are unchanged.
        if pSpam == spam_prob:
            spam_prob = 0
        elif pHam == ham_prob:
            ham_prob = 0
        return spam_prob, ham_prob

    def spam_ham_body_prob(self, body):
        """Return the (spam, ham) probability estimates for the body."""
        return self._section_prob(body, 'spam_body', 'ham_body')

    def subject_spam_ham_prob(self, subject):
        """Return the (spam, ham) probability estimates for the subject."""
        return self._section_prob(subject, 'spam_sub', 'ham_sub')

    def calculate_spam_divided_by_email(self):  # pragma: no cover
        """Return the number of spam e-mails divided by all e-mails."""
        return self.words.count_spam() / self.words.count_emails()

    def calculate_ham_divided_by_email(self):  # pragma: no cover
        """Return the number of ham e-mails divided by all e-mails."""
        return self.words.count_ham() / self.words.count_emails()

    def load_dict(self):  # pragma: no cover
        """Load and return the vocabulary stored in ``self.vocab``."""
        with open(self.vocab) as file:
            vocabulary = json.load(file)
        return vocabulary

    def clean_text(self, text):  # pragma: no cover
        """Clean ``text`` with the text cleaner, using mode 0."""
        return self.cleaning.clean_text(text, 0)
class VocabularyCreator:
    """Class for creating vocabulary of spam and non-spam messages"""

    def __init__(self):
        self.train_set = "train_set.json"
        self.cleaning = TextCleaning()
        self.vocabulary = "vocabulary.json"
        self.voc_data = {}

    def compute_proba(self, data, word_freq, total):
        """Compute the probability of the words of an occurrence table.

        Args:
            data: mapping word -> number of occurrences.
            word_freq: occurrence count a word must have to be kept.
            total: total number of words, used as the denominator.

        Returns:
            dict mapping each kept word to ``occurrences / total``.
        """
        proba_dict = {}
        for wd, occurrences in data.items():
            # Counts outside the 1..4 range are ignored.
            if occurrences < 1 or occurrences > 4:
                continue
            # NOTE(review): only words whose count is exactly word_freq are
            # kept; a minimum threshold (>=) may have been intended - confirm.
            # The original test read the undefined name `w` (NameError).
            if occurrences == word_freq:
                proba_dict[wd] = occurrences / total
        return proba_dict

    @staticmethod
    def _tally(words, counts):
        """Add every word of ``words`` to ``counts``; return how many were seen."""
        seen = 0
        for wd in words:
            counts[wd] = counts.get(wd, 0) + 1
            seen += 1
        return seen

    def create_vocab(self, word_freq, mode):
        """Build the vocabulary of the training e-mails and save it.

        Counts the words of the cleaned subjects and bodies of the spam and
        ham e-mails, converts the counts to probabilities with
        ``compute_proba`` and writes the result to ``self.vocabulary``.

        Args:
            word_freq: forwarded to ``compute_proba``.
            mode: cleaning mode forwarded to the text cleaner.

        Returns:
            True once the vocabulary file has been written. I/O and JSON
            errors are not caught here.
        """
        print("Creating vocabulary")
        dataset = self.load_dict()

        # Occurrence tables and word totals, keyed by (is_spam, section).
        occurrences = {
            (True, "Subject"): {},
            (False, "Subject"): {},
            (True, "Body"): {},
            (False, "Body"): {},
        }
        totals = dict.fromkeys(occurrences, 0)

        emails = dataset["dataset"]
        email_count = len(emails)
        for i, email in enumerate(emails, start=1):
            print("\rEmail " + str(i) + "/" + str(email_count), end="")

            data = email["mail"]
            subject = data["Subject"]
            body = data["Body"]
            is_spam = data["Spam"] == "true"

            for section, text in (("Subject", subject), ("Body", body)):
                words = self.clean_text(text, mode)
                key = (is_spam, section)
                totals[key] += self._tally(words, occurrences[key])

        def proba(is_spam, section):
            key = (is_spam, section)
            return self.compute_proba(occurrences[key], word_freq,
                                      totals[key])

        self.voc_data = {
            "p_sub_spam": proba(True, "Subject"),
            "p_sub_ham": proba(False, "Subject"),
            "p_body_spam": proba(True, "Body"),
            "p_body_ham": proba(False, "Body"),
        }

        with open(self.vocabulary, "w") as outfile:
            json.dump(self.voc_data, outfile, indent=4)

        print("\n")
        return True

    def load_dict(self):
        """Load and return the training set stored in ``self.train_set``."""
        with open(self.train_set) as json_data:
            data_dict = json.load(json_data)
        return data_dict

    def write_data_to_vocab_file(self, vocab):
        """Write ``vocab`` as JSON to ``self.vocabulary``.

        Returns:
            True on success, False when the file cannot be written or the
            data cannot be serialized.
        """
        try:
            with open(self.vocabulary, "w") as outfile:
                json.dump(vocab, outfile)
            print("Vocab created")
            return True
        except (OSError, TypeError, ValueError):
            return False

    def clean_text(self, text, mode):
        """Clean ``text`` with the text cleaner using ``mode``."""
        return self.cleaning.clean_text(text, mode)
class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    def __init__(self):
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.words = VocabularyCreator()

    def is_spam(self, subject_orig, body_orig):
        """Decide whether an e-mail is spam.

        The spam and ham scores are the averages of the subject and body
        estimates.

        Returns:
            True when the spam score is strictly greater than the ham
            score, False otherwise.
        """
        pSpamSubject, pHamSubject = self.subject_spam_ham_prob(subject_orig)
        pSpamBody, pHamBody = self.spam_ham_body_prob(body_orig)

        pSpam = (pSpamSubject + pSpamBody) / 2
        pHam = (pHamSubject + pHamBody) / 2
        return pSpam > pHam

    def _section_prob(self, text, spam_key, ham_key):
        """Shared implementation of the subject and body estimators.

        Starting from the spam / ham priors, multiplies by the vocabulary
        value of every cleaned word found in the ``spam_key`` / ``ham_key``
        tables.
        """
        vocabulary = self.load_dict()
        # Share of spam / ham e-mails in the training set.
        pSpam = self.calculate_spam_divided_by_email()
        pHam = self.calculate_ham_divided_by_email()
        # Build the lookup tables once instead of once per word.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])

        spam_prob = pSpam
        ham_prob = pHam
        for word in self.clean_text(text):
            if word in spam_words:
                spam_prob *= spam_words[word]
            if word in ham_words:
                ham_prob *= ham_words[word]

        # A score still equal to its prior is reset to 0. Because of the
        # elif, only the spam score is reset when both are unchanged.
        if pSpam == spam_prob:
            spam_prob = 0
        elif pHam == ham_prob:
            ham_prob = 0
        return spam_prob, ham_prob

    def spam_ham_body_prob(self, body):
        """Return the (spam, ham) probability estimates for the body."""
        return self._section_prob(body, 'spam_body', 'ham_body')

    def subject_spam_ham_prob(self, subject):
        """Return the (spam, ham) probability estimates for the subject."""
        return self._section_prob(subject, 'spam_sub', 'ham_sub')

    def calculate_spam_divided_by_email(self):  # pragma: no cover
        """Return the number of spam e-mails divided by all e-mails."""
        return self.words.count_spam() / self.words.count_emails()

    def calculate_ham_divided_by_email(self):  # pragma: no cover
        """Return the number of ham e-mails divided by all e-mails."""
        return self.words.count_ham() / self.words.count_emails()

    def load_dict(self):  # pragma: no cover
        """Load and return the vocabulary stored in ``self.vocab``."""
        with open(self.vocab) as file:
            vocabulary = json.load(file)
        return vocabulary

    def clean_text(self, text):  # pragma: no cover
        """Clean ``text`` with the text cleaner."""
        return self.cleaning.clean_text(text)
class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    # Names of the four word tables read from the vocabulary file.
    _SECTIONS = ("spam_sub", "ham_sub", "spam_body", "ham_body")

    def __init__(self):
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()

    def clean_text(self, text, mode):  # pragma: no cover
        """Clean ``text`` with the text cleaner using ``mode``."""
        return self.cleaning.clean_text(text, mode)

    def load_vocab(self):  # pragma: no cover
        """Load and return the vocabulary stored in ``self.vocab``."""
        with open(self.vocab) as json_data:
            vocab = json.load(json_data)
        return vocab

    @staticmethod
    def _safe_log10(value):
        """Return log10(value) for positive values, else ``value`` unchanged."""
        return math.log10(value) if value > 0 else value

    def _word_factors(self, vocab, words, own_section):
        """Return the per-word factors of ``words`` for one vocabulary table.

        A word present in ``own_section`` contributes its stored value. A
        word missing there but present in any other table contributes the
        smoothing value ``1 / (len(own_section) + 1)``. A word unknown to
        every table contributes nothing.
        """
        own = vocab[own_section]
        smoothing = 1 / (len(own) + 1)
        factors = []
        for word in words:
            if word in own:
                factors.append(own[word])
            elif any(word in vocab[name] for name in self._SECTIONS):
                factors.append(smoothing)
        return factors

    def _section_prob(self, text, cleaning_mode, spam_section, ham_section,
                      use_log):
        """Shared implementation of the four public estimators.

        Returns (spam, ham): the prior times the product of the word factors,
        or, when ``use_log`` is true, the log10 prior plus the sum of the
        log10 word factors (non-positive values are added unchanged).
        """
        prior_spam, prior_ham = self.probability_email_type(
            'train-emails.json')
        vocab = self.load_vocab()
        # Materialized because the words are walked once per class.
        words = list(self.clean_text(text, cleaning_mode))

        scores = []
        for prior, section in ((prior_spam, spam_section),
                               (prior_ham, ham_section)):
            factors = self._word_factors(vocab, words, section)
            if use_log:
                total = 0
                for factor in factors:
                    total += self._safe_log10(factor)
                scores.append(self._safe_log10(prior) + total)
            else:
                product = 1
                for factor in factors:
                    product *= factor
                scores.append(prior * product)
        return scores[0], scores[1]

    def is_spam_with_params(self, subject_orig, body_orig,
                            is_normal_estimation, is_normal_combination,
                            cleaning_mode, k):
        """Decide whether an e-mail is spam using configurable formulas.

        Args:
            subject_orig: raw subject text.
            body_orig: raw body text.
            is_normal_estimation: when true use the product estimators,
                otherwise the log10 estimators (converted back with 10**x).
            is_normal_combination: when true combine the raw estimates,
                otherwise combine their log10 (non-positive estimates are
                used unchanged because log10 is undefined for them).
            cleaning_mode: mode forwarded to the text cleaner.
            k: weight of the subject; the body gets ``1 - k``.

        Returns:
            True when the spam score is greater than or equal to the ham
            score, False otherwise.
        """
        if is_normal_estimation:
            subject_spam, subject_ham = self.subject_spam_ham_prob(
                subject_orig, cleaning_mode)
            body_spam, body_ham = self.body_spam_ham_prob(
                body_orig, cleaning_mode)
        else:
            log_subject = self.subject_spam_ham_prob_log(
                subject_orig, cleaning_mode)
            # The body estimator must receive the body (it used to receive
            # the subject).
            log_body = self.body_spam_ham_prob_log(body_orig, cleaning_mode)
            # Inverse of log10 is 10**x (the code used x**10).
            subject_spam = math.pow(10, log_subject[0])
            body_spam = math.pow(10, log_body[0])
            subject_ham = math.pow(10, log_subject[1])
            body_ham = math.pow(10, log_body[1])

        if is_normal_combination:
            prob_message_spam = k * subject_spam + (1 - k) * body_spam
            prob_message_ham = k * subject_ham + (1 - k) * body_ham
        else:
            # Same rule for spam and ham; the ham branch used to drop the
            # k weight and could call log10 on a non-positive value.
            log = self._safe_log10
            prob_message_spam = k * log(subject_spam) + (1 - k) * log(
                body_spam)
            prob_message_ham = k * log(subject_ham) + (1 - k) * log(
                body_ham)

        # Ties are classified as spam, as max() == spam always did.
        return prob_message_spam >= prob_message_ham

    def is_spam(self, subject_orig, body_orig, cleaning_mode=0):
        """Decide whether an e-mail is spam with the fixed-weight formula.

        Args:
            subject_orig: raw subject text.
            body_orig: raw body text.
            cleaning_mode: mode forwarded to the text cleaner. The method
                used to call the estimators without it (TypeError).
                NOTE(review): the default 0 is an assumption - confirm
                against TextCleaning.

        Returns:
            True when the spam score is greater than or equal to the ham
            score, False otherwise.
        """
        prob_message_subject = self.subject_spam_ham_prob(
            subject_orig, cleaning_mode)
        prob_message_body = self.body_spam_ham_prob(body_orig, cleaning_mode)

        # Equation 7 of the assignment.
        # NOTE(review): spam uses weights 0.2/0.8 while ham uses 0.4/0.6 -
        # confirm this asymmetry is intended.
        prob_message_spam = (2 / 10 * prob_message_subject[0]) + (
            8 / 10 * prob_message_body[0])
        prob_message_ham = (4 / 10 * prob_message_subject[1]) + (
            6 / 10 * prob_message_body[1])

        # Equation 3 of the assignment: keep the most probable class.
        return prob_message_spam >= prob_message_ham

    def total_emails(self, file):
        """Return the number of e-mails in the ``"dataset"`` list of ``file``."""
        with open(file) as f:
            input_file = json.load(f)
        return len(input_file["dataset"])

    def probability_email_type(self, file):
        """Return (P(spam), P(ham)) measured on the e-mails of ``file``.

        Equation 5 of the assignment: P(spam) is the number of spam
        messages divided by the total number of messages. Raises
        ZeroDivisionError when the dataset is empty.
        """
        with open(file) as f:
            dataset = json.load(f)["dataset"]

        total = len(dataset)
        spam_counter = sum(
            1 for email in dataset if email["mail"]["Spam"] == "true")
        ham_counter = total - spam_counter
        return spam_counter / total, ham_counter / total

    def body_spam_ham_prob(self, body, cleaning_mode):
        """Return the (spam, ham) probability estimates for the body."""
        return self._section_prob(body, cleaning_mode, "spam_body",
                                  "ham_body", use_log=False)

    def body_spam_ham_prob_log(self, body, cleaning_mode):
        """Return the (spam, ham) log10 estimates for the body."""
        return self._section_prob(body, cleaning_mode, "spam_body",
                                  "ham_body", use_log=True)

    def subject_spam_ham_prob(self, subject, cleaning_mode):
        """Return the (spam, ham) probability estimates for the subject."""
        return self._section_prob(subject, cleaning_mode, "spam_sub",
                                  "ham_sub", use_log=False)

    def subject_spam_ham_prob_log(self, subject, cleaning_mode):
        """Return the (spam, ham) log10 estimates for the subject."""
        return self._section_prob(subject, cleaning_mode, "spam_sub",
                                  "ham_sub", use_log=True)
class VocabularyCreator:
    """Class for creating vocabulary of spam and non-spam messages"""

    def __init__(self):
        self.train_set = "800-mails.json"
        self.cleaning = TextCleaning()
        self.vocabulary = "vocabulary.json"

    @staticmethod
    def _word_probabilities(words):
        """Map each distinct word of the list ``words`` to its relative frequency.

        Words keep the order of their first occurrence. Counting is done in
        a single pass instead of one ``list.count`` call per word.
        """
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        total = len(words)
        return {word: count / total for word, count in counts.items()}

    def create_vocab(self):
        """Build the vocabulary of the training e-mails and save it.

        Reads ``self.train_set``, collects the cleaned words of the spam and
        ham subjects and bodies, computes the relative frequency of each
        word and writes the four tables to ``self.vocabulary``.

        Returns:
            True when the vocabulary file was written, False when writing
            failed.

        Raises:
            Exception: when the training set cannot be read or parsed.
        """
        try:
            with open(self.train_set, "r") as read_file:
                emails = json.load(read_file)
        except (OSError, ValueError) as err:
            print("ERROR IN READING FILE")
            raise Exception("ERROR IN READING FILE") from err

        # Cleaned words of every e-mail, grouped by vocabulary table.
        words = {"spam_sub": [], "ham_sub": [], "spam_body": [],
                 "ham_body": []}
        for email in emails["dataset"]:
            mail = email["mail"]
            kind = "spam" if mail["Spam"] == "true" else "ham"
            words[kind + "_sub"].extend(
                self.cleaning.clean_text(mail["Subject"]))
            words[kind + "_body"].extend(
                self.cleaning.clean_text(mail["Body"]))

        vocabulary = {
            "spam_sub": self._word_probabilities(words["spam_sub"]),
            "ham_sub": self._word_probabilities(words["ham_sub"]),
            "spam_body": self._word_probabilities(words["spam_body"]),
            "ham_body": self._word_probabilities(words["ham_body"]),
        }

        try:
            with open(self.vocabulary, "w") as write_file:
                json.dump(vocabulary, write_file, indent=1)
            return True
        except (OSError, TypeError, ValueError):
            return False

    def count_spam(self):
        """Return the number of spam e-mails in the training set."""
        with open(self.train_set) as read_file:
            emails = json.load(read_file)
        return sum(1 for email in emails["dataset"]
                   if email["mail"]["Spam"] == "true")

    def count_emails(self):
        """Return the total number of e-mails in the training set."""
        with open(self.train_set) as read_file:
            emails = json.load(read_file)
        return len(emails["dataset"])

    def count_ham(self):
        """Return the number of ham e-mails in the training set.

        Ham is everything that is not spam: total minus spam (the operands
        were swapped, which produced a negative count).
        """
        return self.count_emails() - self.count_spam()
class VocabularyCreator:
    """Class for creating vocabulary of spam and non-spam messages"""

    def __init__(self):
        self.train_set = "800-mails.json"
        self.cleaning = TextCleaning()
        self.vocabulary = "vocabulary.json"

    @staticmethod
    def _word_probabilities(words, min_count):
        """Map each word seen at least ``min_count`` times to its frequency.

        ``words`` is a list; the frequency is the word count divided by
        ``len(words)``. Words keep the order of their first occurrence.
        Counting is done in one pass instead of two ``list.count`` calls
        per word.
        """
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        total = len(words)
        return {
            word: count / total
            for word, count in counts.items()
            if count >= min_count
        }

    def create_vocab(self, minWordFreq, clean_text_mode):
        """Build the vocabulary of the training e-mails and save it.

        Args:
            minWordFreq: minimum number of occurrences a word needs to be
                kept; clamped to the range 1..4.
            clean_text_mode: mode forwarded to the text cleaner.

        Returns:
            The vocabulary dict with the tables ``spam_sub``, ``ham_sub``,
            ``spam_body`` and ``ham_body``. The result of writing the file
            is not reflected in the return value.
        """
        emails = self.load_dict()

        # Cleaned words of every e-mail, grouped by vocabulary table.
        words = {"spam_sub": [], "ham_sub": [], "spam_body": [],
                 "ham_body": []}
        for email in emails["dataset"]:
            mail = email["mail"]
            kind = "spam" if mail["Spam"] == "true" else "ham"
            words[kind + "_sub"].extend(
                self.clean_text(mail["Subject"], clean_text_mode))
            words[kind + "_body"].extend(
                self.clean_text(mail["Body"], clean_text_mode))

        # Clamp the threshold to 1..4.
        if minWordFreq > 4:
            minWordFreq = 4
        elif minWordFreq < 1:
            minWordFreq = 1

        vocabulary = {
            "spam_sub": self._word_probabilities(words["spam_sub"],
                                                 minWordFreq),
            "ham_sub": self._word_probabilities(words["ham_sub"],
                                                minWordFreq),
            "spam_body": self._word_probabilities(words["spam_body"],
                                                  minWordFreq),
            "ham_body": self._word_probabilities(words["ham_body"],
                                                 minWordFreq),
        }

        self.write_data_to_vocab_file(vocabulary)
        return vocabulary

    def count_spam(self):
        """Return the number of spam e-mails in the training set."""
        emails = self.load_dict()
        return sum(1 for email in emails["dataset"]
                   if email["mail"]["Spam"] == "true")

    def count_emails(self):
        """Return the total number of e-mails in the training set."""
        emails = self.load_dict()
        return len(emails["dataset"])

    def count_ham(self):
        """Return the number of ham (non-spam) e-mails in the training set."""
        return self.count_emails() - self.count_spam()

    def load_dict(self):  # pragma: no cover
        """Load and return the training set stored in ``self.train_set``."""
        with open(self.train_set) as json_data:
            data_dict = json.load(json_data)
        return data_dict

    def write_data_to_vocab_file(self, vocab):  # pragma: no cover
        """Write ``vocab`` as JSON to ``self.vocabulary``.

        Returns:
            True on success, False when the file cannot be written or the
            data cannot be serialized.
        """
        try:
            with open(self.vocabulary, 'w') as outfile:
                json.dump(vocab, outfile)
            print('vocabulary created......')
            return True
        except (OSError, TypeError, ValueError):
            return False

    def clean_text(self, text, cleaning_mode):  # pragma: no cover
        """Clean ``text`` with the text cleaner using ``cleaning_mode``."""
        return self.cleaning.clean_text(text, cleaning_mode)
class VocabularyCreator:
    """Class for creating vocabulary of spam and non-spam messages"""

    def __init__(self):
        self.train_set = "1000-mails.json"
        self.cleaning = TextCleaning()
        self.vocabulary = "vocabulary.json"

    def load_dict(self, file):  # pragma: no cover
        """Load and return the JSON content of ``file``."""
        with open(file) as json_data:
            data_dict = json.load(json_data)
        return data_dict

    def write_data_to_vocab_file(self, vocab):  # pragma: no cover
        """Write ``vocab`` as JSON to ``self.vocabulary``.

        Returns:
            True on success, False when the file cannot be written or the
            data cannot be serialized.
        """
        try:
            with open(self.vocabulary, "w") as outfile:
                json.dump(vocab, outfile)
            print("Vocabulary created...")
            return True
        except (OSError, TypeError, ValueError):
            return False

    def clean_text(self, text, option):  # pragma: no cover
        """Clean ``text`` with the text cleaner using ``option``."""
        return self.cleaning.clean_text(text, option)

    def total_words_spam_ham_section(self, file, section, cleaning_option):
        """Collect the words of one section of the spam and ham e-mails.

        Args:
            file: path of the JSON data set.
            section: key of the e-mail section to read, ``"Subject"`` or
                ``"Body"`` as used by ``create_vocab``.
            cleaning_option: option forwarded to the text cleaner.

        Returns:
            A 4-tuple: number of spam words, number of ham words, list of
            spam words, list of ham words.
        """
        spam_section_words = []
        ham_section_words = []

        input_file = self.load_dict(file)
        for email in input_file["dataset"]:
            individual_email = email["mail"]
            spam_bool = individual_email["Spam"]
            clean_words = self.clean_text(individual_email[section],
                                          cleaning_option)
            if spam_bool == "true":
                spam_section_words += clean_words
            else:
                ham_section_words += clean_words

        return (len(spam_section_words), len(ham_section_words),
                spam_section_words, ham_section_words)

    def probability_email_type_section_words(self, file, section, email_type,
                                             word_frequency, cleaning_option):
        """Compute the word probabilities of one vocabulary table.

        Args:
            file: path of the JSON data set.
            section: e-mail section to read.
            email_type: ``"Spam"`` selects the spam e-mails, any other
                value selects the ham e-mails.
            word_frequency: minimum number of occurrences for a word to be
                kept.
            cleaning_option: option forwarded to the text cleaner.

        Returns:
            dict mapping each kept word to its relative frequency rounded
            to 4 decimals, in order of first occurrence.
        """
        spam_total, ham_total, spam_words, ham_words = (
            self.total_words_spam_ham_section(file, section, cleaning_option))
        if email_type == "Spam":
            total_words, words = spam_total, spam_words
        else:
            total_words, words = ham_total, ham_words

        # Single counting pass instead of rescanning the whole list for
        # every word occurrence.
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1

        return {
            word: round(count / total_words, 4)
            for word, count in counts.items()
            if count >= word_frequency
        }

    def create_vocab(self, word_frequency, cleaning_option):
        """Build the vocabulary of the training e-mails and save it.

        Args:
            word_frequency: minimum number of occurrences for a word to be
                kept; must be between 1 and 4.
            cleaning_option: option forwarded to the text cleaner.

        Returns:
            The vocabulary dict with the tables ``spam_sub``, ``ham_sub``,
            ``spam_body`` and ``ham_body``, or False when
            ``word_frequency`` is out of range or the training set file is
            not found.
        """
        try:
            if word_frequency < 1 or word_frequency > 4:
                return False

            tables = (
                ("spam_sub", "Subject", "Spam"),
                ("ham_sub", "Subject", "Ham"),
                ("spam_body", "Body", "Spam"),
                ("ham_body", "Body", "Ham"),
            )
            vocab_dict = {}
            for name, section, email_type in tables:
                vocab_dict[name] = self.probability_email_type_section_words(
                    self.train_set, section, email_type, word_frequency,
                    cleaning_option)

            self.write_data_to_vocab_file(vocab_dict)
            return vocab_dict
        except FileNotFoundError as e:
            print("Error!", e.__class__, "occurred.")
            print("File", e.filename, "was not found")
            return False
class EmailAnalyzer:
    """Classify e-mails as spam or ham (non-spam) from a JSON vocabulary."""

    def __init__(self, vocab_file):
        self.vocab = vocab_file
        self.cleaning = TextCleaning()
        self.voc_data = {}

    def is_spam(self, subject_orig, body_orig):
        """Decide whether an e-mail is spam from its subject and body.

        The spam and ham scores of the subject and of the body are averaged
        with equal weight; the e-mail is spam when the spam average is
        strictly larger.

        Returns:
            True when the e-mail is classified as spam, False otherwise.
        """
        subject_words = self.clean_text(subject_orig)
        body_words = self.clean_text(body_orig)

        subject_spam, subject_ham = self.spam_ham_subject_prob(subject_words)
        body_spam, body_ham = self.spam_ham_body_prob(body_words)

        spam_score = 0.5 * (subject_spam + body_spam)
        ham_score = 0.5 * (subject_ham + body_ham)
        return spam_score > ham_score

    def _spam_ham_prob(self, words, spam_key, ham_key):
        """Return ``(spam score, ham score)`` of ``words`` for two vocabularies.

        Each word multiplies a running product by its vocabulary probability,
        or by ``1 / (vocabulary size + 1)`` when the word is absent. The
        products are finally scaled by the fixed priors 0.5925 (spam) and
        0.4075 (ham).
        """
        vocabulary = self.load_dict()
        spam_score = 1.0
        ham_score = 1.0

        for word in words:
            # Tables are looked up per word so an empty text never touches
            # the vocabulary keys.
            spam_table = vocabulary[spam_key]
            if word in spam_table:
                spam_score *= spam_table[word]
            else:
                spam_score *= 1.0 / (len(spam_table) + 1.0)

            ham_table = vocabulary[ham_key]
            if word in ham_table:
                ham_score *= ham_table[word]
            else:
                ham_score *= 1.0 / (len(ham_table) + 1.0)

        spam_score *= 0.5925
        ham_score *= 0.4075
        return (spam_score, ham_score)

    def spam_ham_body_prob(self, body):
        """Score the e-mail body.

        Args:
            body: iterable of already-cleaned words.

        Returns:
            ``(spam score, ham score)`` of the body.
        """
        return self._spam_ham_prob(body, "p_body_spam", "p_body_ham")

    def spam_ham_subject_prob(self, subject):
        """Score the e-mail subject.

        Args:
            subject: iterable of already-cleaned words.

        Returns:
            ``(spam score, ham score)`` of the subject.
        """
        return self._spam_ham_prob(subject, "p_sub_spam", "p_sub_ham")

    def clean_text(self, text):
        """Delegate the cleaning of ``text`` to the ``TextCleaning`` helper."""
        return self.cleaning.clean_text(text)

    def load_dict(self):
        """Read and return the vocabulary stored in the ``self.vocab`` file."""
        with open(self.vocab) as json_data:
            return json.load(json_data)
class EmailAnalyzer:
    """Classify e-mails as spam or ham (non-spam) with a naive Bayes model.

    Word probabilities are read from the JSON vocabulary file ``self.vocab``
    (tables ``p_sub_spam``, ``p_sub_ham``, ``p_body_spam``, ``p_body_ham``);
    the priors P(spam) and P(ham) are estimated from ``train_set``.
    """

    # Training set used to estimate the priors; may be overridden per instance.
    train_set = "train_set.json"

    # Every vocabulary table: a word present in any of them is "known".
    _VOCAB_KEYS = ("p_sub_spam", "p_sub_ham", "p_body_spam", "p_body_ham")

    def __init__(self):
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.voc_data = {}

    def is_spam(self, subject_orig, body_orig, is_log_estimation,
                is_log_combination, clean_text_mode, k):
        """Decide whether an e-mail is spam from its subject and body.

        Args:
            subject_orig: raw subject text.
            body_orig: raw body text.
            is_log_estimation: when true, the section scores are computed as
                sums of log10 values and converted back with ``10 ** score``;
                otherwise they are computed as plain products.
            is_log_combination: when true, log10 is applied to every strictly
                positive section score before the scores are combined.
            clean_text_mode: forwarded unchanged to the text cleaner.
            k: weight of the subject score; the body score weighs ``1 - k``.

        Returns:
            True when the combined spam score is strictly larger than the
            combined ham score, False otherwise.
        """
        if is_log_estimation:
            log_subject = self.log_p_spam_ham_subject(subject_orig,
                                                      clean_text_mode)
            log_body = self.log_p_spam_ham_body(body_orig, clean_text_mode)
            # The estimators return log10 scores: 10 ** score inverts them.
            p_spam_subject, p_ham_subject = (
                math.pow(10, score) for score in log_subject)
            p_spam_body, p_ham_body = (
                math.pow(10, score) for score in log_body)
        else:
            p_spam_subject, p_ham_subject = self.subject_spam_ham_prob(
                subject_orig, clean_text_mode)
            p_spam_body, p_ham_body = self.body_spam_ham_prob(
                body_orig, clean_text_mode)

        if is_log_combination:
            # Spam and ham are treated identically: log10 where it is defined,
            # the raw value otherwise.
            p_spam_subject = self._log10_if_positive(p_spam_subject)
            p_spam_body = self._log10_if_positive(p_spam_body)
            p_ham_subject = self._log10_if_positive(p_ham_subject)
            p_ham_body = self._log10_if_positive(p_ham_body)

        p_spam = k * p_spam_subject + (1 - k) * p_spam_body
        p_ham = k * p_ham_subject + (1 - k) * p_ham_body
        return p_spam > p_ham

    @staticmethod
    def _log10_if_positive(value):
        """Return ``log10(value)`` when it is defined, else ``value`` itself."""
        return math.log10(value) if value > 0 else value

    def _legacy_spam_ham_prob(self, words, spam_key, ham_key):
        """Score already-cleaned ``words`` with the fixed 0.5925 / 0.4075 priors.

        A word absent from a table contributes ``1 / (table size + 1)``.
        """
        vocabulary = self.load_dict()
        spam_table = vocabulary[spam_key]
        ham_table = vocabulary[ham_key]
        p_spam = 1.0
        p_ham = 1.0
        for word in words:
            if word in spam_table:
                p_spam *= spam_table[word]
            else:
                p_spam *= 1.0 / (len(spam_table) + 1.0)
            if word in ham_table:
                p_ham *= ham_table[word]
            else:
                p_ham *= 1.0 / (len(ham_table) + 1.0)
        p_spam *= 0.5925
        p_ham *= 0.4075
        return (p_spam, p_ham)

    def spam_ham_body_prob(self, body):
        """Return ``(spam score, ham score)`` of an already-cleaned body.

        Uses the fixed priors 0.5925 (spam) and 0.4075 (ham).
        """
        return self._legacy_spam_ham_prob(body, "p_body_spam", "p_body_ham")

    def spam_ham_subject_prob(self, subject):
        """Return ``(spam score, ham score)`` of an already-cleaned subject.

        Uses the fixed priors 0.5925 (spam) and 0.4075 (ham).
        """
        return self._legacy_spam_ham_prob(subject, "p_sub_spam", "p_sub_ham")

    @staticmethod
    def _linear_score(words, table, known_words, prior):
        """Return ``prior`` times the product of the word probabilities.

        Words absent from ``table`` but present in ``known_words`` contribute
        ``1 / (table size + 1)``; words unknown everywhere are ignored.
        """
        smoothing = 1 / (len(table) + 1)
        likelihood = 1
        for word in words:
            if word in table:
                likelihood *= table[word]
            elif word in known_words:
                likelihood *= smoothing
        return prior * likelihood

    def _log_score(self, words, table, known_words, prior):
        """Return the log10 counterpart of ``_linear_score``.

        Non-positive priors and word probabilities are added as-is because
        their logarithm is undefined.
        """
        smoothing = math.log10(1 / (len(table) + 1))
        total = 0
        for word in words:
            if word in table:
                total += self._log10_if_positive(table[word])
            elif word in known_words:
                total += smoothing
        return self._log10_if_positive(prior) + total

    def _section_scores(self, text, clean_text_mode, spam_key, ham_key,
                        use_log):
        """Clean ``text`` and score it against one spam / ham table pair."""
        prior_spam, prior_ham = self.calculate_pspam_pham(self.train_set)
        vocabulary = self.load_dict()
        # Materialized because the words are walked once per e-mail type.
        words = list(self.clean_text(text, clean_text_mode))
        known_words = set().union(
            *(vocabulary[key] for key in self._VOCAB_KEYS))
        score = self._log_score if use_log else self._linear_score
        return (score(words, vocabulary[spam_key], known_words, prior_spam),
                score(words, vocabulary[ham_key], known_words, prior_ham))

    def body_spam_ham_prob(self, body, clean_text_mode):
        """Compute the spam and ham scores of the raw e-mail body.

        Returns:
            ``(P(spam) * product of body word probabilities,
            P(ham) * product of body word probabilities)``.
        """
        return self._section_scores(body, clean_text_mode,
                                    "p_body_spam", "p_body_ham", False)

    def log_p_spam_ham_body(self, body, clean_text_mode):
        """Compute the log10 spam and ham scores of the raw e-mail body.

        Returns:
            ``(log10 spam score, log10 ham score)`` of the body.
        """
        return self._section_scores(body, clean_text_mode,
                                    "p_body_spam", "p_body_ham", True)

    def subject_spam_ham_prob(self, subject, clean_text_mode):
        """Compute the spam and ham scores of the raw e-mail subject.

        Returns:
            ``(P(spam) * product of subject word probabilities,
            P(ham) * product of subject word probabilities)``.
        """
        return self._section_scores(subject, clean_text_mode,
                                    "p_sub_spam", "p_sub_ham", False)

    def log_p_spam_ham_subject(self, subject, clean_text_mode):
        """Compute the log10 spam and ham scores of the raw e-mail subject.

        Returns:
            ``(log10 spam score, log10 ham score)`` of the subject.
        """
        return self._section_scores(subject, clean_text_mode,
                                    "p_sub_spam", "p_sub_ham", True)

    def calculate_pspam_pham(self, file):
        """Estimate the priors P(spam) and P(ham) from a training set.

        Args:
            file: path of a JSON file whose ``"dataset"`` list holds items
                with a ``"mail"`` mapping carrying a ``"Spam"`` flag.

        Returns:
            ``(spam e-mails / all e-mails, ham e-mails / all e-mails)``.

        Raises:
            ZeroDivisionError: when the dataset is empty.
        """
        with open(file) as data:
            dataset = json.load(data)["dataset"]
        n_emails = len(dataset)
        # The flag is stored as the string "true"; anything else is ham.
        n_spam = sum(1 for email in dataset
                     if email["mail"]["Spam"] == "true")
        return n_spam / n_emails, (n_emails - n_spam) / n_emails

    def clean_text(self, text, mode):
        """Delegate the cleaning of ``text`` to the ``TextCleaning`` helper."""
        return self.cleaning.clean_text(text, mode)

    def load_dict(self):
        """Read and return the vocabulary stored in the ``self.vocab`` file."""
        with open(self.vocab) as json_data:
            vocabu = json.load(json_data)
        return vocabu