Example #1
 def test(self, directory):
     corp = Corpus(directory)
     result = {}
     if self.trained:
         for fname, body in corp.emails():
             spam_sum = 0
             ham_sum = 0
             examined = Email(fname, body)
             for spam in self.spams:
                 spam_sum += Email.compare_emails(examined, spam)
             for ham in self.hams:
                 ham_sum += Email.compare_emails(examined, ham)
             # SPAM when the mean similarity to known spam exceeds
             # the mean similarity to known ham.
             if spam_sum / len(self.spams) > ham_sum / len(self.hams):
                 result[fname] = 'SPAM'
             else:
                 result[fname] = 'OK'
     else:
         # Untrained fallback: count blacklisted words in the body.
         for fname, body in corp.emails():
             counter = 0
             for word in self.blacklist:
                 if word in body:
                     counter += 1
             result[fname] = 'SPAM' if counter > 3 else 'OK'
     write_classification_to_file(
         os.path.join(directory, '!prediction.txt'), result)
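This example hinges on an Email.compare_emails similarity that is defined elsewhere in the project. As a rough sketch only, a token-set Jaccard similarity is one plausible shape for such a measure; the class and method names follow the example, but the implementation below is an assumption, not the project's code.

    class Email:
        # Minimal hypothetical sketch; only what the similarity needs.
        def __init__(self, fname, body):
            self.fname = fname
            self.tokens = set(body.lower().split())

        @staticmethod
        def compare_emails(a, b):
            # Jaccard similarity of the two token sets, in [0, 1].
            if not a.tokens and not b.tokens:
                return 0.0
            return len(a.tokens & b.tokens) / len(a.tokens | b.tokens)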
Example #2
 def test(self, test_corpus_dir):
     """
     Tests the given emails, labelling each as SPAM or HAM.
     :param test_corpus_dir: directory with emails
     """
     test_corpus = Corpus(test_corpus_dir)  # corpus instance for walking the emails
     # Load stored statistics; if training ran first, this refines them.
     self.load_from_memory()
     for name, msg in test_corpus.emails():  # file name and parsed email
         values = []
         for a in msg.keys():  # every email header field
             a = a.lower()
             if self.classification.get(a):
                 # Score this header field; values closer to 1 mean SPAM.
                 values.append(self.get_stat(a, msg))
         values.append(self.get_stat_payload(msg))  # score of the body/payload
         final_stat = sum(values) / len(values)  # final value is the mean score
         if final_stat > 0.5:  # closer to SPAM
             self.predictions[name] = 'SPAM'
         else:  # closer to HAM
             self.predictions[name] = 'OK'
     # Save the classification, then persist anything newly learned.
     utils.write_classification_to_file(test_corpus_dir, self.predictions)
     self.save_to_memory()
Example #3
    def test(self, path):
        corp = Corpus(path)
        bs = Bayesian()
        sender_bl = load_pickle('sender_bl.pickle')
        # Scan each email and decide whether it is SPAM or HAM:
        # first check whether the sender occurs in the sender blacklist,
        # then score the words using the Bayesian approach.
        for fname, body in corp.emails():
            sender = find_sender(body)
            if sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            tokens = tokenize(body)
            # Compute the spamicity of each word and collect the values
            spamicity_list = [[el, bs.word_spamicity(el)] for el in tokens]
            # Remove duplicates, then put the most "interesting" tokens first:
            # spamicity farthest from the neutral 0.5 is the strongest evidence.
            spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]
            spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
            prediction = bs.bayes_pred(spamicity_list[:15])  # only the top 15 'words'
            if prediction > 0.9:  # blacklisted senders were already handled above
                self.tag_it(path, fname, 'SPAM')
            else:
                self.tag_it(path, fname, 'OK')
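The find_sender helper used above is also defined elsewhere in that project. A minimal sketch of what it might do, assuming the raw message text is scanned for a From: header; the regex and behaviour are illustrative, not the original:

    import re

    def find_sender(body):
        # Pull the first address-like token from a "From:" header line.
        m = re.search(r'^From:.*?([\w.+-]+@[\w.-]+)', body,
                      re.MULTILINE | re.IGNORECASE)
        return m.group(1) if m else None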
Example #4
    def test(self, path):
        '''This function determines which emails
        contain at least one "bad word"; those are
        marked as spam, the others as ham, and the
        classification is written to the !prediction.txt file.'''

        c = Corpus(path)
        for fname, body in c.emails():
            # Replace periods with spaces so "word." still matches "word".
            body = body.translate(str.maketrans('.', ' '))
            is_spam = any(word in body for word in self.bad_words)
            self.final_dict[fname] = 'SPAM' if is_spam else 'OK'

        with open(os.path.join(path, '!prediction.txt'), 'w',
                  encoding='utf-8') as fd:
            for name in os.listdir(path):
                if name[0] != '!':
                    fd.write(name + ' ' + self.final_dict[name] + '\n')
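The translate call above only maps periods to spaces. Since str.maketrans accepts two equal-length strings, the same idea extends to all punctuation; a small illustrative helper, not part of the example:

    import string

    def normalize(body):
        # Map every punctuation character to a space before the word scan.
        table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        return body.translate(table)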
Example #5
def test_atom_filter(initialized_filter, train_dir, test_dir):
    train_corp = TrainingCorpus(train_dir)
    test_corp = Corpus(test_dir)

    spam_filter = initialized_filter  # avoid shadowing the built-in filter()
    spam_filter.train(train_corp)
    prediction = dict()

    for name, mail in test_corp.emails():
        result = spam_filter.test(mail)
        if result == -1:  # the filter gave no verdict for this mail
            continue
        elif result > POSITIVITY_THRESHOLD:
            prediction[name] = POSITIVE
        else:
            prediction[name] = NEGATIVE

    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)

    matrix_dict = conf_matrix.as_dict()
    # For testing purposes
    print(matrix_dict)

    score = quality_score(matrix_dict['tp'],
                          matrix_dict['tn'],
                          matrix_dict['fp'],
                          matrix_dict['fn'])
    return score
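quality_score and BinaryConfusionMatrix come from the assignment's helper modules and are not shown here. A minimal stand-in that scores plain accuracy from the four counts could look like this; the real grader may weight false positives differently, so treat it as an assumption:

    def quality_score(tp, tn, fp, fn):
        # Plain accuracy; a real scorer might penalise false positives more.
        total = tp + tn + fp + fn
        return (tp + tn) / total if total else 0.0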
Example #6
    def train(
        self,
        file_path,
        batch_size=10,
        learning_rate=0.1,
        lr_decay=0.05,
        epochs=1000,
        momentum=0.0,
        tuning=False
    ):  # analogous to PLR_filter; with tuning=True, a graph of mean loss over epochs is plotted
        if tuning:
            og_lr = learning_rate  # original learning rate
            x_plt = []  # x-axis (epochs)
            y_plt = []  # y-axis (mean loss)
        corpus = Corpus(file_path)
        truth_dict = utils.read_classification_from_file(file_path +
                                                         "/!truth.txt")
        got_data = True
        mails_getter = corpus.emails()
        batches = []
        while got_data:
            batch = []
            for i in range(batch_size):
                try:
                    email = next(mails_getter)
                    batch.append(
                        (email[1],
                         1 if truth_dict[email[0]] == self.pos_tag else 0))
                except StopIteration:
                    got_data = False
                    break
            batches.append(batch)

        for e in range(epochs):
            if tuning:
                steps = 0
            print(learning_rate)  # log the current (decayed) learning rate
            self.init_momentums()
            loss = 0
            for batch in batches:
                batch_vectors = [m[0].get_feature_vector_lr() for m in batch]
                y = [m[1] for m in batch]
                loss += self.gradient_descent(y, batch_vectors, learning_rate,
                                              momentum)
                if tuning:
                    steps += 1
            print(f"trained on epoch #{e + 1}")
            learning_rate *= 1 / (1 + lr_decay * e)
            if tuning:
                y_plt.append(loss / steps)
                x_plt.append(e)
        if tuning:
            plt.plot(x_plt, y_plt)
            plt.title(
                f"lr:{og_lr} lrd:{lr_decay} bs:{batch_size} m: {momentum} e:{epochs}"
            )
            plt.xlabel("epochs")
            plt.ylabel("mean loss")
            plt.show()
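The epoch loop above updates the rate with learning_rate *= 1 / (1 + lr_decay * e), which compounds the decay from epoch to epoch (the same schedule appears in a later example), whereas the textbook time-based schedule divides the initial rate afresh each epoch. The sketch below only illustrates the difference between the two; it is not taken from the project:

    def compounded_lr(lr0, decay, epochs):
        # Schedule used above: multiply by 1/(1 + decay*e) after each epoch.
        lr = lr0
        for e in range(epochs):
            yield lr
            lr *= 1 / (1 + decay * e)

    def time_based_lr(lr0, decay, epochs):
        # Textbook time-based decay: lr_e = lr0 / (1 + decay*e).
        for e in range(epochs):
            yield lr0 / (1 + decay * e)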
Example #7
 def test(self, mails_path):
     try:
         os.remove(mails_path + "/!prediction.txt")
     except FileNotFoundError:  # no previous prediction file to remove
         pass
     corpus = Corpus(mails_path)
     with open(mails_path + "/!prediction.txt", 'a', encoding='utf-8') as f:
         for mail in corpus.emails():
             res = self.evaluate_mail(mail[1])
             f.write(f"{mail[0]} {self.pos_tag if res else self.neg_tag}\n")
Example #8
 def __init__(self, folder):
     self.folder = folder
     self.spams = []
     self.hams = []
     corp = Corpus(folder)
     for fname, content in corp.emails():
         if self.is_ham(fname):
             self.hams.append(Email(fname, content))
         else:
             self.spams.append(Email(fname, content))
Example #9
 def test_corpusContainsOnlyEmails(self):
     """Test reading the corpus with email messages only."""
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     for fname, contents in corpus.emails():
         observed[fname] = contents
     # Verify the results
     self.assertEqual(len(self.expected), len(observed),
                      'The emails() method did not generate all the corpus files.')
     self.assertEqual(self.expected, observed,
                      'The read file contents are not equal to the expected contents.')
Example #10
    def test(self, test_corpus_dir):
        test_corpus = Corpus(test_corpus_dir)
        with open(os.path.join(test_corpus_dir, "!prediction.txt"), "w+") as a_file:
            for filename, body in test_corpus.emails():
                if self.bayesian_combination(body) > 0.9 or self.get_email_adress(body) in self.black_list:
                    decision = "SPAM"
                else:
                    # A separate whitelist branch would also label the mail
                    # OK, so checking self.white_list here is redundant.
                    decision = "OK"
                a_file.write(filename + " " + decision + "\n")
Example #11
    def test(self, test_corpus_dir):
        '''
        Creates dict of classification and writes it to the file
        :param test_corpus_dir: path to test dir
        :return: None
        '''

        # Prepare "global" variables
        c = Corpus(test_corpus_dir)
        class_dict = {}

        # Iterate over emails with generator in Corpus
        for email in c.emails():
            # Declare probabilities - will be modified
            spam_probability = 0
            ham_probability = 0

            # Get word statistics of email - word frequency and word count
            word_stats = self.get_word_count_for_mail(email[1])
            word_freq = word_stats[0]
            word_count = word_stats[1]

            # Compute the spaminess of each known word
            spaminesses = []
            for word in word_freq:
                s = self.get_spaminnes_of_word(word)
                if s is not None:
                    spaminesses.append(s)

            # Calculate the two parts of the combining formula
            product = self.prod(spaminesses)
            one_without_spaminesses = self.one_without_spaminesses(spaminesses)

            denominator = product + one_without_spaminesses

            # We cannot divide by zero
            if denominator != 0:
                overall_spaminess = product / denominator
            else:
                overall_spaminess = 0

            # Final decision
            if overall_spaminess >= 0.5:
                class_dict[email[0]] = "SPAM"
            else:
                class_dict[email[0]] = "OK"

        # Creates !prediction.txt file
        utils.write_classification_to_file(
            test_corpus_dir + "/!prediction.txt", class_dict)
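The overall_spaminess above is the classic naive Bayes combining rule popularised by Paul Graham: P = (p1*...*pn) / (p1*...*pn + (1-p1)*...*(1-pn)). A self-contained equivalent of the prod / one_without_spaminesses helpers, written as an assumption about their behaviour:

    from math import prod

    def combined_spaminess(spaminesses):
        # P = (p1*...*pn) / (p1*...*pn + (1-p1)*...*(1-pn))
        p = prod(spaminesses)
        q = prod(1 - s for s in spaminesses)
        return p / (p + q) if (p + q) else 0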
Example #12
 def test_corpusContainsOnlyEmails(self):
     """Test reading the corpus with email messages only."""
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     with replaced_open():
         for fname, contents in corpus.emails():
             observed[fname] = contents
     # Verify the results
     self.assertEqual(
         len(self.expected), len(observed),
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
Example #13
 def test_corpusContainsAlsoSpecialFiles(self):
     """Test reading the corpus with special files."""
     # Add a special file into the corpus dir
     save_file_to_corpus_dir(
         fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     for fname, contents in corpus.emails():
         observed[fname] = contents
     # Verify the results
     self.assertEqual(len(self.expected), len(observed),
                      'The emails() method did not generate all the corpus files.')
     self.assertEqual(self.expected, observed,
                      'The read file contents are not equal to the expected contents.')
Example #14
 def train(self,
           file_path,
           batch_size=10,
           learning_rate=0.1,
           lr_decay=0.05,
           epochs=1000,
           momentum=0.0):
     corpus = Corpus(file_path)
     truth_dict = utils.read_classification_from_file(file_path +
                                                      "/!truth.txt")
     got_data = True
     mails_getter = corpus.emails()
     batches = []
     # loads all data from directory in batches of given size
     while got_data:
         batch = []
         # loads a batch of given size, a smaller one if out of data
         for i in range(batch_size):
             try:
                 email = next(mails_getter)
                 batch.append(
                     (email[1],
                      1 if truth_dict[email[0]] == self.pos_tag else 0))
             except StopIteration:
                 got_data = False
                 break
         batches.append(batch)
     for e in range(epochs):  # trains multiple times on all batches
         self.init_momentums()
         for batch in batches:  # performs gradient descent on each batch
             # gets the feature vectors of the batch
             feature_vectors = [m[0].get_feature_vector_plr() for m in batch]
             y = [m[1] for m in batch]  # gets the truth vector of the batch
             # weights for each subvector are trained separately
             for i in range(self.subvector_count):
                 # isolates one subvector from every feature vector
                 subvector_batch = [v[i] for v in feature_vectors]
                 self.gradient_descent(i, y, subvector_batch, learning_rate,
                                       momentum)
         print(f"trained on epoch #{e + 1}")
         learning_rate *= 1 / (1 + lr_decay * e)
Example #15
 def test_corpusContainsAlsoSpecialFiles(self):
     """Test reading the corpus with special files."""
     # Add a special file into the corpus dir
     save_file_to_corpus_dir(fname=SPECIAL_FILENAME,
                             contents='fake',
                             dirname=CORPUS_DIR)
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     with replaced_open():
         for fname, contents in corpus.emails():
             observed[fname] = contents
     # Verify the results
     self.assertEqual(
         len(self.expected), len(observed),
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
Example #16
    def train(self, file_path):
        self.content_spam_dict = {}
        self.content_ham_dict = {}
        class_dict = utils.read_classification_from_file(file_path +
                                                         '/!truth.txt')
        corpus = Corpus(file_path)
        email_generator = corpus.emails()
        content_counter_spam = Counter()
        content_counter_ham = Counter()
        content_wordcount_spam = 0
        content_wordcount_ham = 0

        spam_count = 0
        ham_count = 0
        every_word_content = set()

        for mail in email_generator:
            content_words = self.string_to_words(mail[1].content_no_html)
            content_counter = Counter(content_words)
            every_word_content.update(content_words)

            if class_dict[mail[0]] == self.pos_tag:
                spam_count += 1
                content_counter_spam += content_counter
                content_wordcount_spam += len(content_words)
            else:
                ham_count += 1
                content_counter_ham += content_counter
                content_wordcount_ham += len(content_words)

        # Add-one (Laplace) smoothing, then convert counts to relative frequencies
        for word in every_word_content:
            content_counter_ham[word] += 1
            content_counter_spam[word] += 1
            self.content_spam_dict[word] = (
                content_counter_spam[word] / content_wordcount_spam)
            self.content_ham_dict[word] = (
                content_counter_ham[word] / content_wordcount_ham)

        self.spam_probability = spam_count / (spam_count + ham_count)
        self.ham_probability = ham_count / (spam_count + ham_count)
        self.trained = True
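The snippet above only covers training; how the stored frequencies are applied is not shown. A hypothetical test-time counterpart, classifying in log space with the learned priors and per-word frequencies (the names mirror the snippet, the function itself is an assumption):

    from math import log

    def posterior_is_spam(words, spam_dict, ham_dict, p_spam, p_ham):
        # Log-space naive Bayes over the vocabulary seen during training.
        log_spam, log_ham = log(p_spam), log(p_ham)
        for w in words:
            if w in spam_dict:  # both dicts share the same smoothed vocabulary
                log_spam += log(spam_dict[w])
                log_ham += log(ham_dict[w])
        return log_spam > log_ham  # True -> classify as SPAM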
Example #17
 def test(self, path):
     """
     Tests given emails for being SPAM or HAM
     :param path: directory with emails
     """
     emails = Corpus(path)
     with open(path + "/!prediction.txt", 'w', encoding="utf-8") as f:
         self.alpha = self.calculate_alpha(emails)
         for filename, message in emails.emails():
             words = raw_email_to_list_of_words(message)
             spam_probability, spam_probability_overflow = self.calculate_email_probability(
                 words, True)   # score against the spam model
             ham_probability, ham_probability_overflow = self.calculate_email_probability(
                 words, False)  # score against the ham model
             if decision(spam_probability_overflow, spam_probability,
                         ham_probability_overflow, ham_probability):
                 f.write(filename + " SPAM\n")
                 self.pred_dict[filename] = "SPAM"
             else:
                 f.write(filename + " OK\n")
                 self.pred_dict[filename] = "OK"
Example #18
    def test(self, dir_path):
        no_tests_done = 0
        rather_positive = 0
        corpus = Corpus(dir_path)
        clasif = dict()

        for name, mail in corpus.emails():
            # Test strong filters
            result = self.test_strong_filters(name, mail)
            if result != -1:  # Strong filters were decisive
                clasif[name] = result
                continue  # Skip to the next iteration

            score = 0
            tests_done = 0

            # Test word filters (the call is made twice; this doubles both
            # score and tests_done, leaving their ratio unchanged)
            result = self.test_word_filters(name, mail)
            score += result[0]
            tests_done += result[1]

            result = self.test_word_filters(name, mail)
            score += result[0]
            tests_done += result[1]

            if tests_done == 0:
                no_tests_done += 1
                # print("No tests were done for " + name)
                clasif[name] = NEGATIVE
            elif score / tests_done > POSITIVITY_THRESHOLD:
                clasif[name] = POSITIVE
            else:
                if score / tests_done > 0.50:
                    rather_positive += 1
                clasif[name] = NEGATIVE

        utils.write_classification_to_file(clasif,
                                           dir_path + "/!prediction.txt")
Example #19
def set_truth(path):
    # Writes a !truth.txt that labels every email in the corpus as SPAM.
    # Note: the corpus directory is hard-coded rather than derived from `path`.
    with open(os.path.join(path, "!truth.txt"), 'wt') as f:
        a = Corpus('/Users/eygene/Desktop/spam-data-12-s75-h25/3')
        for name, body in a.emails():
            f.write(name + ' ' + 'SPAM' + '\n')