Beispiel #1
0
 def __init__(self):
     """Store the JSON file names and create the text-cleaning helper."""
     # Helper object used to clean raw e-mail text.
     self.cleaning = TextCleaning()
     # File names kept as plain strings; nothing is opened here.
     self.train_set, self.vocabulary = "train_set.json", "vocabulary.json"
     # Starts empty; presumably filled when the vocabulary is built.
     self.voc_data = dict()
Beispiel #2
0
 def __init__(self):
     """Store the vocabulary file name and create the helper objects."""
     vocabulary_file = "vocabulary.json"
     self.vocab = vocabulary_file
     # Helpers are created in this order: text cleaner first, then the
     # vocabulary creator.
     cleaner = TextCleaning()
     self.cleaning = cleaner
     creator = VocabularyCreator()
     self.words = creator
Beispiel #3
0
class VocabularyCreator:
    """Build the word-probability vocabulary of spam and non-spam messages."""

    def __init__(self):
        self.train_set = "train_set.json"
        self.cleaning = TextCleaning()
        self.vocabulary = "vocabulary.json"
        self.voc_data = {}

    def compute_proba(self, data, word_freq, total):
        """Compute the probability of every word occurring `word_freq` times.

        Args:
            data: mapping word -> occurrence count.
            word_freq: occurrence count a word must have to be kept.
            total: total number of word occurrences, used as denominator.

        Returns:
            dict mapping each kept word to ``count / total``. Words whose
            count is outside the 1..4 range are always skipped.
        """
        # NOTE(review): the filter keeps only words whose count is exactly
        # `word_freq`; confirm whether a minimum-frequency threshold
        # (count >= word_freq) was intended instead.
        return {
            wd: count / total
            for wd, count in data.items()
            if 1 <= count <= 4 and count == word_freq
        }

    @staticmethod
    def _count_words(words, occurrences):
        """Add each word of `words` to `occurrences`; return the word count."""
        seen = 0
        for wd in words:
            seen += 1
            occurrences[wd] = occurrences.get(wd, 0) + 1
        return seen

    def create_vocab(self, word_freq, mode):
        """Create the spam/ham vocabulary and save it to the vocabulary file.

        Every e-mail of the training set is cleaned and its subject and body
        words are counted separately for spam and ham. The counts are turned
        into probabilities by `compute_proba`, stored in `self.voc_data` and
        dumped as JSON to `self.vocabulary`.

        Args:
            word_freq: forwarded to `compute_proba`.
            mode: forwarded to the text cleaner's `clean_text`.

        Returns:
            True once the file has been written. Errors (missing file,
            malformed data) are not caught and propagate to the caller.
        """
        print("Creating vocabulary")

        emails = self.load_dict()["dataset"]
        email_count = len(emails)

        # One occurrence table and one running total per (is_spam, field).
        occurrences = {
            (True, "Subject"): {},
            (False, "Subject"): {},
            (True, "Body"): {},
            (False, "Body"): {},
        }
        totals = dict.fromkeys(occurrences, 0)

        for i, email in enumerate(emails, start=1):
            print("\rEmail " + str(i) + "/" + str(email_count), end="")

            data = email["mail"]
            # The flag is stored as the string "true", not as a JSON boolean.
            is_spam = data["Spam"] == "true"

            for field in ("Subject", "Body"):
                words = self.cleaning.clean_text(data[field], mode)
                totals[is_spam, field] += self._count_words(
                    words, occurrences[is_spam, field])

        self.voc_data = {
            "p_sub_spam": self.compute_proba(
                occurrences[True, "Subject"], word_freq,
                totals[True, "Subject"]),
            "p_sub_ham": self.compute_proba(
                occurrences[False, "Subject"], word_freq,
                totals[False, "Subject"]),
            "p_body_spam": self.compute_proba(
                occurrences[True, "Body"], word_freq,
                totals[True, "Body"]),
            "p_body_ham": self.compute_proba(
                occurrences[False, "Body"], word_freq,
                totals[False, "Body"]),
        }

        # Save data
        with open(self.vocabulary, "w") as outfile:
            json.dump(self.voc_data, outfile, indent=4)

        print("\n")
        return True

    def load_dict(self):
        """Load and return the training set stored in `self.train_set`."""
        with open(self.train_set) as json_data:
            return json.load(json_data)

    def write_data_to_vocab_file(self, vocab):
        """Write `vocab` as JSON to the vocabulary file.

        Returns:
            True on success, False when the file cannot be written or
            `vocab` cannot be serialised.
        """
        try:
            with open(self.vocabulary, "w") as outfile:
                json.dump(vocab, outfile)
                print("Vocab created")
                return True
        except (OSError, TypeError, ValueError):
            # Best-effort contract: report failure instead of raising, but
            # no longer swallow KeyboardInterrupt/SystemExit.
            return False

    def clean_text(self, text, mode):
        """Clean `text` with the text cleaner using the given `mode`."""
        return self.cleaning.clean_text(text, mode)
Beispiel #4
0
 def __init__(self):
     """Store the vocabulary file name and create the text-cleaning helper."""
     vocabulary_file = "vocabulary.json"
     self.vocab = vocabulary_file
     # Helper object used to clean raw e-mail text.
     cleaner = TextCleaning()
     self.cleaning = cleaner
Beispiel #5
0
class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    def __init__(self):
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.words = VocabularyCreator()

    @staticmethod
    def is_spam_function_one(is_msg_spam, user_historic_in_days, user_trust, user_group_trust):
        """Combine the message verdict with user history and trust levels.

        Returns the value of ``P and (H and T1 or T2) or H and T2 and not T3``
        where P is `is_msg_spam`, H is "history shorter than 30 days",
        T1 is "user trust below 60", T2 is "group trust below 70" and
        T3 is "user trust above 75".
        """
        p = is_msg_spam
        h = user_historic_in_days < 30
        t1 = user_trust < 60
        t2 = user_group_trust < 70
        t3 = user_trust > 75
        # Parentheses only spell out Python's and/or precedence.
        return (p and ((h and t1) or t2)) or (h and t2 and not t3)

    @staticmethod
    def is_spam_function_two(is_msg_spam, user_trust, user_group_trust):
        """Return ``P or (not T3 and T2)``.

        P is `is_msg_spam`, T2 is "group trust below 70" and T3 is
        "user trust above 75".
        """
        p = is_msg_spam
        t2 = user_group_trust < 70
        t3 = user_trust > 75
        return p or (not t3 and t2)

    def is_spam(self, subject_orig, body_orig, isLogEstimation, isLogCombination, k):
        """Decide whether an e-mail is spam from its subject and body.

        Args:
            subject_orig: raw subject text.
            body_orig: raw body text.
            isLogEstimation: when true, use the logarithmic estimators and
                add the log10 priors; otherwise multiply by the priors.
            isLogCombination: when true, replace every strictly positive
                estimation by its log10 before combining.
            k: weight of the subject; the body gets ``1 - k``. Negative
                values are clamped to 0 and values above 1 are scaled down.

        Returns:
            True when the combined spam score is strictly greater than the
            combined ham score, False otherwise.
        """
        # Priors returned by the (externally provided) counting helpers.
        pSpam = self.calculate_spam_divided_by_email()
        pHam = self.calculate_ham_divided_by_email()

        if isLogEstimation:
            pSpamSubject, pHamSubject = self.subject_spam_ham_log_prob(subject_orig, pSpam, pHam)
            # The body must be scored with the body estimator (the subject
            # one was called here before).
            pSpamBody, pHamBody = self.spam_ham_body_log_prob(body_orig, pSpam, pHam)
            estimationpSpamSubject = math.log10(pSpam) + pSpamSubject
            estimationpHamSubject = math.log10(pHam) + pHamSubject
            estimationpSpamBody = math.log10(pSpam) + pSpamBody
            estimationpHamBody = math.log10(pHam) + pHamBody
        else:
            pSpamSubject, pHamSubject = self.subject_spam_ham_prob(subject_orig)
            pSpamBody, pHamBody = self.spam_ham_body_prob(body_orig)
            estimationpSpamSubject = pSpam * pSpamSubject
            estimationpHamSubject = pHam * pHamSubject
            estimationpSpamBody = pSpam * pSpamBody
            estimationpHamBody = pHam * pHamBody

        if isLogCombination:
            # log10 is only defined for strictly positive values; other
            # estimations are left untouched.
            if estimationpSpamSubject > 0:
                estimationpSpamSubject = math.log10(estimationpSpamSubject)
            if estimationpHamSubject > 0:
                estimationpHamSubject = math.log10(estimationpHamSubject)
            if estimationpSpamBody > 0:
                estimationpSpamBody = math.log10(estimationpSpamBody)
            if estimationpHamBody > 0:
                estimationpHamBody = math.log10(estimationpHamBody)

        # Keep k between 0 and 1.
        # NOTE(review): the rescaling divides by 10**len(str(k)), so a float
        # such as 2.5 becomes 0.0025; confirm this is the wanted mapping.
        if k > 1:
            k = k / math.pow(10, len(str(k)))
        elif k < 0:
            k = 0

        # Same weighted sum for both options; only the estimations differ.
        combinationpSpam = k * estimationpSpamSubject + (1 - k) * estimationpSpamBody
        combinationpHam = k * estimationpHamSubject + (1 - k) * estimationpHamBody

        return combinationpSpam > combinationpHam

    def _log_prob(self, text, pSpam, pHam, spam_key, ham_key):
        """Shared implementation of the two logarithmic estimators.

        Starts from the priors, adds the vocabulary value of every known
        word and returns the log10 of both totals.
        """
        vocabulary = self.load_dict()

        spam_total = pSpam
        ham_total = pHam

        words = self.clean_text(text)
        # Build the lookup tables once instead of once per word.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])
        for word in words:
            if word in spam_words:
                spam_total += spam_words[word]
            if word in ham_words:
                ham_total += ham_words[word]

        # NOTE(review): this is the log of a sum of probabilities, not a sum
        # of logs; kept as written, confirm against the lab statement.
        ham_total = math.log10(ham_total)
        spam_total = math.log10(spam_total)

        # NOTE(review): compares the prior with the value after log10, as
        # the original did; this almost never matches.
        if pSpam == spam_total:
            spam_total = 0
        elif pHam == ham_total:
            ham_total = 0

        return spam_total, ham_total

    def subject_spam_ham_log_prob(self, subject, pSpam, pHam):
        """Return the (spam, ham) logarithmic estimation for the subject.

        Uses the 'spam_sub' and 'ham_sub' sections of the vocabulary.
        """
        return self._log_prob(subject, pSpam, pHam, 'spam_sub', 'ham_sub')

    def spam_ham_body_log_prob(self, body, pSpam, pHam):
        """Return the (spam, ham) logarithmic estimation for the body.

        Uses the 'spam_body' and 'ham_body' sections of the vocabulary.
        """
        return self._log_prob(body, pSpam, pHam, 'spam_body', 'ham_body')

    def _prob(self, text, spam_key, ham_key):
        """Shared implementation of the two plain (product) estimators."""
        vocabulary = self.load_dict()

        pSpam = self.calculate_spam_divided_by_email()
        pHam = self.calculate_ham_divided_by_email()

        spam_product = pSpam
        ham_product = pHam

        words = self.clean_text(text)
        # Build the lookup tables once instead of once per word.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])
        for word in words:
            if word in spam_words:
                spam_product *= spam_words[word]
            if word in ham_words:
                ham_product *= ham_words[word]

        # A product still equal to the prior is reset to 0.
        # NOTE(review): because of the elif, the ham value is only checked
        # when the spam value was not reset; kept as written.
        if pSpam == spam_product:
            spam_product = 0
        elif pHam == ham_product:
            ham_product = 0

        return spam_product, ham_product

    def spam_ham_body_prob(self, body):
        """Return the (spam, ham) probabilities computed from the body.

        Each value is the class prior multiplied by the vocabulary
        probability of every known body word.
        """
        return self._prob(body, 'spam_body', 'ham_body')

    def subject_spam_ham_prob(self, subject):
        """Return the (spam, ham) probabilities computed from the subject.

        Each value is the class prior multiplied by the vocabulary
        probability of every known subject word.
        """
        return self._prob(subject, 'spam_sub', 'ham_sub')

    def calculate_spam_divided_by_email(self):  # pragma: no cover
        return self.words.count_spam() / self.words.count_emails()

    def calculate_ham_divided_by_email(self):  # pragma: no cover
        return self.words.count_ham() / self.words.count_emails()

    def load_dict(self):  # pragma: no cover
        with open(self.vocab) as file:
            vocabulary = json.load(file)

        return vocabulary

    def clean_text(self, text):  # pragma: no cover
        return self.cleaning.clean_text(text, 0)
Beispiel #6
0
class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    def __init__(self):
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.words = VocabularyCreator()

    def is_spam(self, subject_orig, body_orig):
        """Decide whether an e-mail is spam from its subject and body.

        The spam and ham scores are the averages of the subject and body
        probabilities.

        Returns:
            True when the spam score is strictly greater than the ham
            score, False otherwise.
        """
        pSpamSubject, pHamSubject = self.subject_spam_ham_prob(subject_orig)
        pSpamBody, pHamBody = self.spam_ham_body_prob(body_orig)

        # Average the subject and body values of each class.
        pSpam = (pSpamSubject + pSpamBody) / 2
        pHam = (pHamSubject + pHamBody) / 2

        return pSpam > pHam

    def _prob(self, text, spam_key, ham_key):
        """Shared implementation of the subject and body estimators.

        Starts from the class priors and multiplies them by the vocabulary
        probability of every cleaned word found in the given sections.
        """
        vocabulary = self.load_dict()

        pSpam = self.calculate_spam_divided_by_email()
        pHam = self.calculate_ham_divided_by_email()

        spam_product = pSpam
        ham_product = pHam

        words = self.clean_text(text)
        # Build the lookup tables once; they were copied for every word in
        # the body estimator before.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])
        for word in words:
            if word in spam_words:
                spam_product *= spam_words[word]
            if word in ham_words:
                ham_product *= ham_words[word]

        # A product still equal to the prior is reset to 0.
        # NOTE(review): because of the elif, the ham value is only checked
        # when the spam value was not reset; kept as written.
        if pSpam == spam_product:
            spam_product = 0
        elif pHam == ham_product:
            ham_product = 0

        return spam_product, ham_product

    def spam_ham_body_prob(self, body):
        """Return the (spam, ham) probabilities computed from the body.

        Uses the 'spam_body' and 'ham_body' sections of the vocabulary.
        """
        return self._prob(body, 'spam_body', 'ham_body')

    def subject_spam_ham_prob(self, subject):
        """Return the (spam, ham) probabilities computed from the subject.

        Uses the 'spam_sub' and 'ham_sub' sections of the vocabulary.
        """
        return self._prob(subject, 'spam_sub', 'ham_sub')

    def calculate_spam_divided_by_email(self):  # pragma: no cover
        return self.words.count_spam() / self.words.count_emails()

    def calculate_ham_divided_by_email(self):  # pragma: no cover
        return self.words.count_ham() / self.words.count_emails()

    def load_dict(self):  # pragma: no cover
        with open(self.vocab) as file:
            vocabulary = json.load(file)

        return vocabulary

    def clean_text(self, text):  # pragma: no cover
        return self.cleaning.clean_text(text)
Beispiel #7
0
class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham) from word probabilities."""

    # Training set used to compute the class priors P(spam) and P(ham).
    TRAIN_SET = 'train-emails.json'

    # Sections of the vocabulary file.
    _VOCAB_SECTIONS = ("spam_sub", "ham_sub", "spam_body", "ham_body")

    def __init__(self):
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()

    def clean_text(self, text, mode):  # pragma: no cover
        return self.cleaning.clean_text(text, mode)

    def load_vocab(self):  # pragma: no cover
        with open(self.vocab) as json_data:
            vocab = json.load(json_data)
        return vocab

    @staticmethod
    def _safe_log10(value):
        """Return log10(value) when value > 0, else value unchanged.

        log10 is undefined for non-positive numbers; the fallback keeps the
        raw value, which is what the unguarded code paths did before.
        """
        return math.log10(value) if value > 0 else value

    def is_spam_with_params(self, subject_orig, body_orig,
                            is_normal_estimation, is_normal_combination,
                            cleaning_mode, k):
        """Decide whether an e-mail is spam with configurable estimators.

        Args:
            subject_orig: raw subject text.
            body_orig: raw body text.
            is_normal_estimation: True to use the product estimators,
                False to use the logarithmic ones (converted back with
                10**x before combining).
            is_normal_combination: True for a weighted sum of the
                probabilities, False for a weighted sum of their log10.
            cleaning_mode: forwarded to `clean_text`.
            k: weight of the subject; the body gets ``1 - k``.

        Returns:
            True when the spam score is greater than or equal to the ham
            score, False otherwise.
        """
        if is_normal_estimation:
            subject_spam, subject_ham = self.subject_spam_ham_prob(
                subject_orig, cleaning_mode)
            body_spam, body_ham = self.body_spam_ham_prob(
                body_orig, cleaning_mode)
        else:
            log_subject = self.subject_spam_ham_prob_log(
                subject_orig, cleaning_mode)
            # The body estimate must come from the body text.
            log_body = self.body_spam_ham_prob_log(body_orig, cleaning_mode)
            # Undo the base-10 logarithm: 10 ** log10(p) == p.
            subject_spam = math.pow(10, log_subject[0])
            subject_ham = math.pow(10, log_subject[1])
            body_spam = math.pow(10, log_body[0])
            body_ham = math.pow(10, log_body[1])

        if is_normal_combination:
            prob_message_spam = k * subject_spam + (1 - k) * body_spam
            prob_message_ham = k * subject_ham + (1 - k) * body_ham
        else:
            # Values that are 0 or negative cannot go through log10 and are
            # used as they are; spam and ham follow the same rule.
            prob_message_spam = (k * self._safe_log10(subject_spam)
                                 + (1 - k) * self._safe_log10(body_spam))
            prob_message_ham = (k * self._safe_log10(subject_ham)
                                + (1 - k) * self._safe_log10(body_ham))

        # Ties are classified as spam, as before.
        return prob_message_spam >= prob_message_ham

    def is_spam(self, subject_orig, body_orig, cleaning_mode=0):
        """Decide whether an e-mail is spam with the fixed-weight formula.

        Args:
            subject_orig: raw subject text.
            body_orig: raw body text.
            cleaning_mode: forwarded to `clean_text`.
                NOTE(review): the default 0 is an assumption; confirm it is
                a valid mode of the text cleaner.

        Returns:
            True when the spam score is greater than or equal to the ham
            score, False otherwise.
        """
        subject_spam, subject_ham = self.subject_spam_ham_prob(
            subject_orig, cleaning_mode)
        body_spam, body_ham = self.body_spam_ham_prob(
            body_orig, cleaning_mode)

        # Equation 7 of the assignment statement.
        prob_message_spam = (2 / 10 * subject_spam) + (8 / 10 * body_spam)
        prob_message_ham = (4 / 10 * subject_ham) + (6 / 10 * body_ham)

        # Equation 3 of the assignment statement: pick the larger score.
        return prob_message_spam >= prob_message_ham

    def total_emails(self, file):
        """Return the number of e-mails in the "dataset" list of `file`."""
        with open(file) as f:
            input_file = json.load(f)

        return len(input_file["dataset"])

    def probability_email_type(self, file):
        """Return (P(spam), P(ham)) for the e-mails stored in `file`.

        An e-mail counts as spam when its "Spam" field is the string
        "true"; everything else counts as ham.

        Raises:
            ZeroDivisionError: when the dataset is empty.
        """
        with open(file) as f:
            emails = json.load(f)["dataset"]

        spam_counter = sum(
            1 for email in emails if email["mail"]["Spam"] == "true")
        ham_counter = len(emails) - spam_counter
        total = len(emails)

        # Equation 5: P(spam) = number of spam messages / total messages.
        return spam_counter / total, ham_counter / total

    def _estimate(self, text, cleaning_mode, spam_key, ham_key, use_log):
        """Shared implementation of the four estimators.

        For each class the prior is combined with one factor per word:
        the vocabulary probability when the word is in the class section,
        or ``1 / (section size + 1)`` when the word is only known in
        another section. Words absent from the whole vocabulary are
        ignored. Factors are multiplied, or their log10 summed when
        `use_log` is true.

        Returns:
            (spam estimate, ham estimate).
        """
        priors = self.probability_email_type(self.TRAIN_SET)
        vocab = self.load_vocab()
        # Materialise the words: they are walked once per class.
        words = list(self.clean_text(text, cleaning_mode))
        sections = [vocab[name] for name in self._VOCAB_SECTIONS]

        estimates = []
        for prior, key in zip(priors, (spam_key, ham_key)):
            known = vocab[key]
            smoothing = 1 / (len(known) + 1)

            factors = []
            for word in words:
                if word in known:
                    factors.append(known[word])
                elif any(word in section for section in sections):
                    factors.append(smoothing)

            if use_log:
                score = 0
                for factor in factors:
                    score += self._safe_log10(factor)
                estimates.append(self._safe_log10(prior) + score)
            else:
                score = 1
                for factor in factors:
                    score *= factor
                estimates.append(prior * score)

        return estimates[0], estimates[1]

    def body_spam_ham_prob(self, body, cleaning_mode):
        """Return the (spam, ham) probabilities computed from the body."""
        return self._estimate(body, cleaning_mode,
                              "spam_body", "ham_body", False)

    def body_spam_ham_prob_log(self, body, cleaning_mode):
        """Return the (spam, ham) log10 estimates computed from the body."""
        return self._estimate(body, cleaning_mode,
                              "spam_body", "ham_body", True)

    def subject_spam_ham_prob(self, subject, cleaning_mode):
        """Return the (spam, ham) probabilities computed from the subject."""
        return self._estimate(subject, cleaning_mode,
                              "spam_sub", "ham_sub", False)

    def subject_spam_ham_prob_log(self, subject, cleaning_mode):
        """Return the (spam, ham) log10 estimates computed from the subject."""
        return self._estimate(subject, cleaning_mode,
                              "spam_sub", "ham_sub", True)
Beispiel #8
0
class VocabularyCreator:
    """Build the spam/ham word-probability vocabulary from a training set."""

    def __init__(self):
        self.train_set = "800-mails.json"
        self.cleaning = TextCleaning()
        self.vocabulary = "vocabulary.json"

    def _load_dataset(self):
        """Return the list stored under "dataset" in the training-set file."""
        with open(self.train_set) as read_file:
            return json.load(read_file)["dataset"]

    def create_vocab(self):
        """Create the vocabulary and save it to ``self.vocabulary``.

        Every e-mail's subject and body are cleaned and their words are
        counted separately for spam and ham. Each word is stored with its
        relative frequency inside its own category ("spam_sub", "ham_sub",
        "spam_body", "ham_body").

        Returns:
            bool: True when the vocabulary file was written, False when
            writing it failed.

        Raises:
            Exception: when the training set cannot be read or parsed.
        """
        # Imported locally so this block does not depend on the file header.
        from collections import Counter

        try:
            emails = self._load_dataset()
        except Exception as err:
            print("ERROR IN READING FILE")
            raise Exception("ERROR IN READING FILE") from err

        # One counter per category; Counter keeps first-occurrence order, so
        # the saved file lists words in the order they were first seen.
        counters = {
            "spam_sub": Counter(),
            "ham_sub": Counter(),
            "spam_body": Counter(),
            "ham_body": Counter(),
        }

        for email in emails:
            mail = email["mail"]
            label = "spam" if mail["Spam"] == "true" else "ham"
            counters[label + "_sub"].update(
                self.cleaning.clean_text(mail["Subject"]))
            counters[label + "_body"].update(
                self.cleaning.clean_text(mail["Body"]))

        # Relative frequency of each word within its category. An empty
        # category has no words, so the division is never reached for it.
        vocabulary = {}
        for category, counts in counters.items():
            total = sum(counts.values())
            vocabulary[category] = {
                word: count / total for word, count in counts.items()
            }

        try:
            with open(self.vocabulary, "w") as write_file:
                json.dump(vocabulary, write_file, indent=1)
        except (OSError, TypeError, ValueError):
            return False
        return True

    def count_spam(self):
        """Return the number of e-mails flagged as spam in the training set."""
        return sum(
            1 for email in self._load_dataset()
            if email["mail"]["Spam"] == "true")

    def count_emails(self):
        """Return the total number of e-mails in the training set."""
        return len(self._load_dataset())

    def count_ham(self):
        """Return the number of ham e-mails (total minus spam)."""
        return self.count_emails() - self.count_spam()
Beispiel #9
0
 def __init__(self):
     """Store the training-set and vocabulary file names and create the text cleaner."""
     self.train_set = "800-mails.json"
     self.cleaning = TextCleaning()
     self.vocabulary = "vocabulary.json"
Beispiel #10
0
class VocabularyCreator:
    """Build the spam/ham word-probability vocabulary from a training set."""

    def __init__(self):
        self.train_set = "800-mails.json"
        self.cleaning = TextCleaning()
        self.vocabulary = "vocabulary.json"

    def create_vocab(self, minWordFreq, clean_text_mode):
        """Create the vocabulary and save it to the vocabulary file.

        Args:
            minWordFreq: minimum number of occurrences a word needs inside a
                category to be kept; values outside 1..4 are clamped.
            clean_text_mode: mode forwarded to the text cleaner.

        Returns:
            dict: the vocabulary, mapping each category ("spam_sub",
            "ham_sub", "spam_body", "ham_body") to ``{word: probability}``
            where the probability is the word's relative frequency among all
            words of that category. The dict is returned even when writing
            the file fails.
        """
        # Imported locally so this block does not depend on the file header.
        from collections import Counter

        emails = self.load_dict()

        # Counting once per category replaces the repeated list.count()
        # scans; Counter keeps first-occurrence order like dict.fromkeys.
        counters = {
            "spam_sub": Counter(),
            "ham_sub": Counter(),
            "spam_body": Counter(),
            "ham_body": Counter(),
        }

        for email in emails["dataset"]:
            mail = email["mail"]
            label = "spam" if mail["Spam"] == "true" else "ham"
            counters[label + "_sub"].update(
                self.clean_text(mail["Subject"], clean_text_mode))
            counters[label + "_body"].update(
                self.clean_text(mail["Body"], clean_text_mode))

        # Clamp the frequency threshold to the supported range 1..4.
        min_count = min(max(minWordFreq, 1), 4)

        vocabulary = {}
        for category, counts in counters.items():
            total = sum(counts.values())
            vocabulary[category] = {
                word: count / total
                for word, count in counts.items()
                if count >= min_count
            }

        self.write_data_to_vocab_file(vocabulary)
        return vocabulary

    def count_spam(self):
        """Return the number of e-mails flagged as spam in the training set."""
        emails = self.load_dict()
        return sum(
            1 for email in emails["dataset"]
            if email["mail"]["Spam"] == "true")

    def count_emails(self):
        """Return the total number of e-mails in the training set."""
        return len(self.load_dict()["dataset"])

    def count_ham(self):
        """Return the number of ham e-mails (total minus spam)."""
        return self.count_emails() - self.count_spam()

    def load_dict(self):  # pragma: no cover
        """Load and return the parsed JSON training set."""
        with open(self.train_set) as json_data:
            return json.load(json_data)

    def write_data_to_vocab_file(self, vocab):  # pragma: no cover
        """Write ``vocab`` as JSON to the vocabulary file.

        Returns:
            bool: True on success, False when the file could not be written
            or the data could not be serialized.
        """
        try:
            with open(self.vocabulary, 'w') as outfile:
                json.dump(vocab, outfile)
                print('vocabulary created......')
                return True
        except (OSError, TypeError, ValueError):
            return False

    def clean_text(self, text, cleaning_mode):  # pragma: no cover
        """Delegate text cleaning to the cleaner object."""
        return self.cleaning.clean_text(text, cleaning_mode)
Beispiel #11
0
class VocabularyCreator:
    """Build the spam/ham word-probability vocabulary from a training set."""

    def __init__(self):
        self.train_set = "1000-mails.json"
        self.cleaning = TextCleaning()
        self.vocabulary = "vocabulary.json"

    def load_dict(self, file):  # pragma: no cover
        """Load and return the parsed JSON content of ``file``."""
        with open(file) as json_data:
            return json.load(json_data)

    def write_data_to_vocab_file(self, vocab):  # pragma: no cover
        """Write ``vocab`` as JSON to the vocabulary file.

        Returns:
            bool: True on success, False when the file could not be written
            or the data could not be serialized.
        """
        try:
            with open(self.vocabulary, "w") as outfile:
                json.dump(vocab, outfile)
                print("Vocabulary created...")
                return True
        except (OSError, TypeError, ValueError):
            return False

    def clean_text(self, text, option):  # pragma: no cover
        """Delegate text cleaning to the cleaner object."""
        return self.cleaning.clean_text(text, option)

    def total_words_spam_ham_section(self, file, section, cleaning_option):
        """Collect the cleaned words of one e-mail section, split by type.

        Args:
            file: path of the JSON training set.
            section: key of the e-mail section to read (e.g. "Subject" or
                "Body").
            cleaning_option: option forwarded to the text cleaner.

        Returns:
            tuple: ``(spam word count, ham word count, spam words, ham
            words)`` where the last two are lists with duplicates kept.
        """
        spam_section_words = []
        ham_section_words = []

        for email in self.load_dict(file)["dataset"]:
            individual_email = email["mail"]
            # Go through the class wrapper so every cleaning call shares the
            # same entry point.
            clean_words = self.clean_text(individual_email[section],
                                          cleaning_option)
            if individual_email["Spam"] == "true":
                spam_section_words += clean_words
            else:
                ham_section_words += clean_words

        return (len(spam_section_words), len(ham_section_words),
                spam_section_words, ham_section_words)

    def probability_email_type_section_words(self, file, section, email_type,
                                             word_frequency, cleaning_option):
        """Compute word probabilities for one section of one e-mail type.

        Args:
            file: path of the JSON training set.
            section: e-mail section to analyse (e.g. "Subject" or "Body").
            email_type: "Spam" selects spam e-mails; any other value selects
                ham e-mails.
            word_frequency: minimum number of occurrences for a word to be
                kept.
            cleaning_option: option forwarded to the text cleaner.

        Returns:
            dict: ``{word: probability}`` where the probability is the
            word's relative frequency rounded to 4 decimals.
        """
        # Imported locally so this block does not depend on the file header.
        from collections import Counter

        spam_total, ham_total, spam_words, ham_words = (
            self.total_words_spam_ham_section(file, section, cleaning_option))

        if email_type == "Spam":
            total_words, words = spam_total, spam_words
        else:
            total_words, words = ham_total, ham_words

        # A single counting pass replaces rescanning the whole list for each
        # word; Counter keeps first-occurrence order, as the old loop did.
        return {
            word: round(count / total_words, 4)
            for word, count in Counter(words).items()
            if count >= word_frequency
        }

    def create_vocab(self, word_frequency, cleaning_option):
        """Create the vocabulary and save it to the vocabulary file.

        Args:
            word_frequency: minimum occurrences for a word to be kept; must
                be between 1 and 4 inclusive.
            cleaning_option: option forwarded to the text cleaner.

        Returns:
            dict | bool: the vocabulary dict ("spam_sub", "ham_sub",
            "spam_body", "ham_body") on success; False when
            ``word_frequency`` is out of range or the training set file is
            not found.
        """
        try:
            if word_frequency < 1 or word_frequency > 4:
                return False

            vocab_dict = {}
            for key, section, email_type in (
                    ("spam_sub", "Subject", "Spam"),
                    ("ham_sub", "Subject", "Ham"),
                    ("spam_body", "Body", "Spam"),
                    ("ham_body", "Body", "Ham")):
                vocab_dict[key] = self.probability_email_type_section_words(
                    self.train_set, section, email_type, word_frequency,
                    cleaning_option)

            self.write_data_to_vocab_file(vocab_dict)

            return vocab_dict

        except FileNotFoundError as e:
            print("Error!", e.__class__, "occurred.")
            print("File", e.filename, "was not found")
            return False
Beispiel #12
0
 def __init__(self, vocab_file):
     """Store the given vocabulary file reference, create the text cleaner and start with empty vocabulary data."""
     self.vocab    = vocab_file
     self.cleaning = TextCleaning()
     self.voc_data = {}
Beispiel #13
0
class EmailAnalyzer:
    """Classify e-mails as spam or not spam (ham)."""

    def __init__(self, vocab_file):
        self.vocab = vocab_file
        self.cleaning = TextCleaning()
        self.voc_data = {}

    def is_spam(self, subject_orig, body_orig):
        """Tell whether an e-mail is spam.

        The subject and the body are cleaned and scored separately; the spam
        and ham scores of both parts are averaged and compared.

        Returns:
            bool: True when the e-mail is classified as spam, False when it
            is classified as ham.
        """
        subject_words = self.clean_text(subject_orig)
        body_words = self.clean_text(body_orig)

        subject_spam, subject_ham = self.spam_ham_subject_prob(subject_words)
        body_spam, body_ham = self.spam_ham_body_prob(body_words)

        spam_score = 0.5 * (subject_spam + body_spam)
        ham_score = 0.5 * (subject_ham + body_ham)

        return spam_score > ham_score

    def spam_ham_body_prob(self, body):
        """Score the cleaned body words against the body vocabularies.

        Returns:
            tuple: (spam score, ham score) for the body.
        """
        vocabulary = self.load_dict()
        spam_score = 1.0
        ham_score = 1.0

        for token in body:
            # Words missing from a table get 1 / (table size + 1) instead.
            spam_table = vocabulary["p_body_spam"]
            if token not in spam_table:
                spam_score *= 1.0 / (len(spam_table) + 1.0)
            else:
                spam_score *= spam_table[token]

            ham_table = vocabulary["p_body_ham"]
            if token not in ham_table:
                ham_score *= 1.0 / (len(ham_table) + 1.0)
            else:
                ham_score *= ham_table[token]

        # Weight each score by its hard-coded prior.
        return (spam_score * 0.5925, ham_score * 0.4075)

    def spam_ham_subject_prob(self, subject):
        """Score the cleaned subject words against the subject vocabularies.

        Returns:
            tuple: (spam score, ham score) for the subject.
        """
        vocabulary = self.load_dict()
        spam_score = 1.0
        ham_score = 1.0

        for token in subject:
            # Words missing from a table get 1 / (table size + 1) instead.
            spam_table = vocabulary["p_sub_spam"]
            if token not in spam_table:
                spam_score *= 1.0 / (len(spam_table) + 1.0)
            else:
                spam_score *= spam_table[token]

            ham_table = vocabulary["p_sub_ham"]
            if token not in ham_table:
                ham_score *= 1.0 / (len(ham_table) + 1.0)
            else:
                ham_score *= ham_table[token]

        # Weight each score by its hard-coded prior.
        return (spam_score * 0.5925, ham_score * 0.4075)

    def clean_text(self, text):
        """Delegate text cleaning to the cleaner object."""
        return self.cleaning.clean_text(text)

    def load_dict(self):
        """Load and return the parsed JSON vocabulary."""
        with open(self.vocab) as handle:
            return json.load(handle)
Beispiel #14
0
class EmailAnalyzer:
    """Classify e-mails as spam or not spam (ham)."""

    # Training set used to estimate the priors P(spam) and P(ham). Kept as a
    # class attribute so it can be overridden per instance.
    train_set = "train_set.json"

    # Tables read from the vocabulary file.
    _VOCAB_KEYS = ("p_sub_spam", "p_sub_ham", "p_body_spam", "p_body_ham")

    # Fixed priors used by the legacy spam_ham_*_prob methods.
    _FIXED_PRIOR_SPAM = 0.5925
    _FIXED_PRIOR_HAM = 0.4075

    def __init__(self):
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.voc_data = {}

    def is_spam(self, subject_orig, body_orig, is_log_estimation,
                is_log_combination, clean_text_mode, k):
        """Tell whether an e-mail is spam.

        Args:
            subject_orig: raw subject text.
            body_orig: raw body text.
            is_log_estimation: when true, the section scores are computed in
                log10 space and converted back with ``10 ** score``.
            is_log_combination: when true, the subject and body scores are
                combined through their log10 instead of directly.
            clean_text_mode: mode forwarded to the text cleaner.
            k: weight of the subject score; the body gets ``1 - k``.

        Returns:
            bool: True when the e-mail is classified as spam, False when it
            is classified as ham.
        """
        if is_log_estimation:
            log_subject = self.log_p_spam_ham_subject(subject_orig,
                                                      clean_text_mode)
            log_body = self.log_p_spam_ham_body(body_orig, clean_text_mode)
            # The log_* methods return log10 values, so 10 ** value is the
            # inverse transformation.
            p_spam_subject = math.pow(10, log_subject[0])
            p_ham_subject = math.pow(10, log_subject[1])
            p_spam_body = math.pow(10, log_body[0])
            p_ham_body = math.pow(10, log_body[1])
        else:
            p_subject = self.subject_spam_ham_prob(subject_orig,
                                                   clean_text_mode)
            p_body = self.body_spam_ham_prob(body_orig, clean_text_mode)
            p_spam_subject, p_ham_subject = p_subject[0], p_subject[1]
            p_spam_body, p_ham_body = p_body[0], p_body[1]

        if is_log_combination:
            # Spam and ham use the same rule: log10 of each positive score,
            # the raw value for a non-positive one, weighted by k / 1 - k.
            p_spam = (k * self._safe_log10(p_spam_subject)
                      + (1 - k) * self._safe_log10(p_spam_body))
            p_ham = (k * self._safe_log10(p_ham_subject)
                     + (1 - k) * self._safe_log10(p_ham_body))
        else:
            p_spam = k * p_spam_subject + (1 - k) * p_spam_body
            p_ham = k * p_ham_subject + (1 - k) * p_ham_body

        return p_spam > p_ham

    def spam_ham_body_prob(self, body):
        """Score already-cleaned body words using the fixed priors.

        Every word missing from a table contributes
        ``1 / (table size + 1)``.

        Returns:
            tuple: (spam score, ham score) for the body.
        """
        return self._fixed_prior_scores(body, "p_body_spam", "p_body_ham")

    def spam_ham_subject_prob(self, subject):
        """Score already-cleaned subject words using the fixed priors.

        Every word missing from a table contributes
        ``1 / (table size + 1)``.

        Returns:
            tuple: (spam score, ham score) for the subject.
        """
        return self._fixed_prior_scores(subject, "p_sub_spam", "p_sub_ham")

    def body_spam_ham_prob(self, body, clean_text_mode):
        """Compute the spam and ham probabilities of a raw body text.

        Returns:
            tuple: (P(spam) * product of word factors,
            P(ham) * product of word factors).
        """
        return self._section_scores(body, clean_text_mode,
                                    "p_body_spam", "p_body_ham", False)

    def log_p_spam_ham_body(self, body, clean_text_mode):
        """Compute the log10 spam and ham scores of a raw body text.

        Returns:
            tuple: (log10 spam score, log10 ham score).
        """
        return self._section_scores(body, clean_text_mode,
                                    "p_body_spam", "p_body_ham", True)

    def subject_spam_ham_prob(self, subject, clean_text_mode):
        """Compute the spam and ham probabilities of a raw subject text.

        Returns:
            tuple: (P(spam) * product of word factors,
            P(ham) * product of word factors).
        """
        return self._section_scores(subject, clean_text_mode,
                                    "p_sub_spam", "p_sub_ham", False)

    def log_p_spam_ham_subject(self, subject, clean_text_mode):
        """Compute the log10 spam and ham scores of a raw subject text.

        Returns:
            tuple: (log10 spam score, log10 ham score).
        """
        return self._section_scores(subject, clean_text_mode,
                                    "p_sub_spam", "p_sub_ham", True)

    def calculate_pspam_pham(self, file):
        """Estimate P(spam) and P(ham) from a JSON training set.

        Args:
            file: path of the JSON file holding the "dataset" list.

        Returns:
            tuple: (P(spam), P(ham)) as floats; (0.0, 0.0) when the dataset
            is empty.
        """
        with open(file) as data:
            dataset = json.load(data)["dataset"]

        n_emails = len(dataset)
        if n_emails == 0:
            # Nothing to estimate from; avoid dividing by zero.
            return 0.0, 0.0

        n_spam = sum(1 for email in dataset
                     if email["mail"]["Spam"] == "true")
        n_ham = n_emails - n_spam

        return n_spam / n_emails, n_ham / n_emails

    def clean_text(self, text, mode):
        """Delegate text cleaning to the cleaner object."""
        return self.cleaning.clean_text(text, mode)

    def load_dict(self):
        """Load and return the parsed JSON vocabulary."""
        with open(self.vocab) as json_data:
            return json.load(json_data)

    @staticmethod
    def _safe_log10(value):
        """Return log10(value) for a positive value, else the value itself."""
        return math.log10(value) if value > 0 else value

    @staticmethod
    def _word_factors(words, table, known_words):
        """List the factor each word contributes for one vocabulary table.

        A word present in ``table`` contributes its stored probability. A
        word absent from ``table`` but present in ``known_words``
        contributes ``1 / (table size + 1)``. Any other word is skipped.
        """
        unseen = 1 / (len(table) + 1)
        factors = []
        for word in words:
            if word in table:
                factors.append(table[word])
            elif word in known_words:
                factors.append(unseen)
        return factors

    @classmethod
    def _combine(cls, prior, factors, use_log):
        """Merge a prior with word factors, as a product or a log10 sum."""
        if use_log:
            return cls._safe_log10(prior) + sum(
                cls._safe_log10(factor) for factor in factors)
        likelihood = 1
        for factor in factors:
            likelihood *= factor
        return prior * likelihood

    def _section_scores(self, text, clean_text_mode, spam_key, ham_key,
                        use_log):
        """Score one e-mail section against its spam and ham tables."""
        prior_spam, prior_ham = self.calculate_pspam_pham(self.train_set)
        vocabulary = self.load_dict()
        words = list(self.clean_text(text, clean_text_mode))

        # A word counts as known when it appears in any of the four tables.
        known_words = set()
        for key in self._VOCAB_KEYS:
            known_words.update(vocabulary[key])

        spam_factors = self._word_factors(words, vocabulary[spam_key],
                                          known_words)
        ham_factors = self._word_factors(words, vocabulary[ham_key],
                                         known_words)

        return (self._combine(prior_spam, spam_factors, use_log),
                self._combine(prior_ham, ham_factors, use_log))

    def _fixed_prior_scores(self, words, spam_key, ham_key):
        """Score cleaned words against two tables using the fixed priors."""
        voc_data = self.load_dict()
        spam_table = voc_data[spam_key]
        ham_table = voc_data[ham_key]

        p_spam = 1.0
        p_ham = 1.0
        for word in words:
            if word in spam_table:
                p_spam *= spam_table[word]
            else:
                p_spam *= 1.0 / (len(spam_table) + 1.0)

            if word in ham_table:
                p_ham *= ham_table[word]
            else:
                p_ham *= 1.0 / (len(ham_table) + 1.0)

        return (p_spam * self._FIXED_PRIOR_SPAM,
                p_ham * self._FIXED_PRIOR_HAM)