Python TrainingCorpusの例、trainingcorpus.TrainingCorpus Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_trainingcorpus.py プロジェクト: palomikula/CTU-projects

 def setUp(self):
     """Prepare fake corpus with !truth.txt file and the TrainingCorpus under test."""
     emails = self.email_dict = create_corpus_dictionary()
     labels = self.true_class = create_classification_for(emails.keys())
     create_corpus_dir_from_dictionary(emails)
     # The expected classification is stored in the truth file inside the corpus dir.
     save_classification_to_file(
         labels,
         fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME),
     )
     # The corpus object is created inside the replaced_open() context.
     with replaced_open():
         self.tc = TrainingCorpus(CORPUS_DIR)

コード例 #2

0

ファイルを表示

    def train(self, path):
        '''Collect words that most likely indicate spam into self.bad_words.

        A word taken from the MOST_COMMON_S most frequent spam words is
        appended to self.bad_words when either

        * it does not occur (as a substring) anywhere in the concatenated
          ham bodies, or
        * it is longer than CHECKED_WORD_LEN characters, is also one of the
          MOST_COMMON_H most frequent ham words, and its spam count is more
          than FACTOR times its ham count.

        :param path: argument handed to TrainingCorpus
        '''
        # these constants worked the best
        MOST_COMMON_S = 600
        MOST_COMMON_H = 5000
        CHECKED_WORD_LEN = 12
        FACTOR = 20

        # Built once; the original rebuilt this table for every email.
        dots_to_spaces = str.maketrans('.', ' ')

        def tokenize(body):
            # Dots act as separators; words are lower-cased and split on
            # single spaces (so empty strings can appear as "words").
            return body.translate(dots_to_spaces).lower().split(' ')

        tc = TrainingCorpus(path)

        # Hams: keep the raw bodies (for the substring test below) and count
        # the words. Counting incrementally avoids re-copying one ever
        # growing word list for each email.
        ham_bodies = []
        counter_ham = Counter()
        for _fname, body in tc.hams():
            ham_bodies.append(body)
            counter_ham.update(tokenize(body))
        ham_string = ''.join(ham_bodies)
        ham_words_dict = dict(counter_ham.most_common(MOST_COMMON_H))

        # Spams: only the word counts are needed.
        counter_spam = Counter()
        for _fname, body in tc.spams():
            counter_spam.update(tokenize(body))
        spam_words_dict = dict(counter_spam.most_common(MOST_COMMON_S))

        # Build the bad_words list.
        for word, spam_count in spam_words_dict.items():
            # NOTE(review): the words are lower-cased but ham_string is not,
            # and this is a substring test, not a whole-word test. Kept as in
            # the original - confirm that this is intended.
            if word not in ham_string:
                self.bad_words.append(word)
            elif (len(word) > CHECKED_WORD_LEN
                  and word in ham_words_dict
                  and spam_count > ham_words_dict[word] * FACTOR):
                self.bad_words.append(word)

コード例 #3

0

ファイルを表示

ファイル: quality.py プロジェクト: Scytheroid/spam-filter

def test_atom_filter(initialized_filter, train_dir, test_dir):
    """Train a filter on one corpus, classify another one and score the result.

    The filter is trained with a TrainingCorpus built from train_dir and then
    asked about every email of the Corpus built from test_dir. Emails for
    which the filter answers -1 are left out of the prediction; answers above
    POSITIVITY_THRESHOLD count as POSITIVE, all others as NEGATIVE. The
    prediction is compared with the classification read from the TRUTHFILE
    in test_dir.

    Returns whatever quality_score() yields for the tp/tn/fp/fn counts.
    """
    training_corpus = TrainingCorpus(train_dir)
    evaluated_corpus = Corpus(test_dir)

    spam_filter = initialized_filter
    spam_filter.train(training_corpus)

    prediction = {}
    for name, mail in evaluated_corpus.emails():
        verdict = spam_filter.test(mail)
        if verdict == -1:
            # the filter gave no answer for this email
            continue
        prediction[name] = POSITIVE if verdict > POSITIVITY_THRESHOLD else NEGATIVE

    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)

    counts = conf_matrix.as_dict()
    # For testing purposes
    print(counts)

    return quality_score(
        counts['tp'],
        counts['tn'],
        counts['fp'],
        counts['fn'],
    )

コード例 #4

0

ファイルを表示

ファイル: test_trainingcorpus.py プロジェクト: EugeneEugene/SpamFilter

 def setUp(self):
     """Prepare fake corpus with !truth.txt file and the TrainingCorpus under test."""
     emails = self.email_dict = create_corpus_dictionary()
     labels = self.true_class = create_classification_for(emails.keys())
     create_corpus_dir_from_dictionary(emails)
     # The expected classification is stored in the truth file inside the corpus dir.
     save_classification_to_file(
         labels,
         fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME),
     )
     self.tc = TrainingCorpus(CORPUS_DIR)

コード例 #5

0

ファイルを表示

ファイル: filter.py プロジェクト: pospisil98/CTUcodes

    def train(self, train_corpus_dir):
        '''
        Load word statistics and email counts from a training corpus.

        Stores on the filter the spam/ham word-count dicts, their averages
        and the spam/ham email counts reported by TrainingCorpus.

        :param train_corpus_dir: path to train dir
        :return: None
        '''
        corpus = TrainingCorpus(train_corpus_dir)
        # NOTE(review): the result of this first call is discarded and the same
        # method is called again below; kept in case it has side effects - confirm.
        corpus.return_spam_ham_count()

        # Word frequencies: index 0 is the word-count dict, index 1 the average.
        spam_stats = corpus.get_spam_word_count_dict_and_avg()
        ham_stats = corpus.get_ham_word_count_dict_and_avg()
        self.spam_word_count_dict = spam_stats[0]
        self.ham_word_count_dict = ham_stats[0]
        self.spam_word_count_avg = spam_stats[1]
        self.ham_word_count_avg = ham_stats[1]

        # (A disabled experiment removed the words common to both dicts here.)

        # Total counts: index 0 is the spam count, index 1 the ham count.
        totals = corpus.return_spam_ham_count()
        self.spam_count = totals[0]
        self.ham_count = totals[1]

コード例 #6

0

ファイルを表示

ファイル: filter.py プロジェクト: MikkCZ/OI-BC

    def train(self, corpus_dir):
        '''Learn the sender and subject of every email listed in !truth.txt.

        When corpus_dir holds no !truth.txt file the filter is marked as not
        trained and the method returns without raising an error.
        '''
        self.train_corpus_dir = corpus_dir
        if not os.path.exists(os.path.join(corpus_dir, '!truth.txt')):
            self.trained = False
            return

        # TrainingCorpus provides the truth dict and the email parsing
        training_corpus = TrainingCorpus(corpus_dir)

        # store the sender/subject pair with the hams or with the spams
        for fname in training_corpus.truth_dict:
            sender, subject = training_corpus.parse_email(fname)
            is_ham = training_corpus.truth_dict[fname] == self.neg_tag
            remember = self.save_ham_header if is_ham else self.save_spam_header
            remember(sender, subject)

コード例 #7

0

ファイルを表示

 def train(self, email_adress):
     """Fill the module-level training globals from the corpus in a directory.

     Sets all_words, spam_words, count_spams, count_emails and
     probability_spam (the share of spam emails in the corpus).

     One TrainingCorpus object is shared by all instance-level calls; the
     original built a new one for each call on the same directory.

     :param email_adress: argument handed to TrainingCorpus
     """
     global all_words, spam_words, probability_spam, count_spams, count_emails
     corpus = TrainingCorpus(email_adress)
     hemails_with_body = corpus.hams()
     semails_with_body = corpus.spams()
     hwords = corpus.get_words(hemails_with_body)
     swords = corpus.get_words(semails_with_body)
     all_words = corpus.all_words(hwords, swords)  # all words with their count
     spam_words = corpus.spam_words(swords)  # spam words with their count
     # These two are called on the class with the directory, as in the original.
     count_spams = TrainingCorpus.count_spams(email_adress)  # count of all spam's emails
     count_emails = TrainingCorpus.count_emails(email_adress)  # count of all emails
     # Probability that an email is spam. An empty corpus gives 0 instead of a
     # ZeroDivisionError.
     # NOTE(review): presumably a consumer of these globals treats 0 as
     # "not trained" - confirm against the caller.
     probability_spam = count_spams / count_emails if count_emails else 0

コード例 #8

0

ファイルを表示

    def train(self, path):
        """
        Gather the word statistics of a classified emails dataset.

        Stores the classification read from !truth.txt, the portion of spam
        emails, a Counter over the joined spam and ham words and the sizes
        of the three word collections.

        :param path: directory with emails
        """
        self.truth_dict = read_classification_from_file(path + "/!truth.txt")
        corpus = TrainingCorpus(path)

        # second argument: True selects spam, False selects ham
        spam_words, spam_total = self.list_spam_ham_words(corpus, True)
        ham_words, ham_total = self.list_spam_ham_words(corpus, False)

        email_total = spam_total + ham_total
        self.portion_of_spam_emails = spam_total / email_total

        joined_words = join_spam_and_ham_words(spam_words, ham_words)
        self.all_words = Counter(joined_words)

        self.num_of_spam_words = len(spam_words)
        self.num_of_ham_words = len(ham_words)
        self.num_of_all_words = len(self.all_words)

コード例 #9

0

ファイルを表示

 def test(self, email_adress):
     """Classify every email of a directory and write the verdicts to a file.

     For each email returned by ``Corpus(email_adress).emails()`` exactly one
     line, ``<name> SPAM`` or ``<name> OK``, is written to
     ``<email_adress>/!prediction.txt``.

     The mode depends on the module-level global ``probability_spam``:

     * ``probability_spam == 0`` (part without train): an email is SPAM when
       its body contains any of a fixed list of HTML tags.
     * otherwise (part with train): the spam probabilities of all words found
       in ``all_words`` are averaged; more than 70 percent means SPAM.

     The globals all_words, spam_words, count_spams and count_emails are
     only read here.

     :param email_adress: directory with the emails to classify
     """
     html_words = ('<html>', '<p>', '</a>', '<br>', '<head>', '<meta>', '<title>', '<body>')

     def spam_percentage(body):
         # Average, in percent, of the per-word spam probabilities.
         per_word = []
         for word in TrainingCorpus.get_words_from_email(body):
             # skip empty words and words the training knows nothing about
             if word == '' or word not in all_words:
                 continue
             if word in spam_words:
                 # Bayes' theorem. What is the probability that email is spam, if it has this word
                 per_word.append((spam_words[word] / count_spams * probability_spam)
                                 / (all_words[word] / count_emails))
             else:
                 per_word.append(0)
         # An email without any known word gives no evidence; the original
         # divided by zero here.
         if not per_word:
             return 0
         return sum(per_word) / len(per_word) * 100

     trained = probability_spam != 0
     emails = Corpus(email_adress).emails()
     # "with" closes the prediction file even when classification fails.
     with open(email_adress + '/!prediction.txt', 'w', encoding="utf-8") as f:
         for email in emails:
             name, body = email[0], email[1]
             if trained:
                 is_spam = spam_percentage(body) > 70
             else:
                 # One verdict per email: the original wrote an OK line for
                 # every tag that did not match before reaching a match.
                 is_spam = any(tag in body for tag in html_words)
             f.write(name + (' SPAM\n' if is_spam else ' OK\n'))

コード例 #10

0

ファイルを表示

ファイル: filter.py プロジェクト: Scytheroid/spam-filter

 def train(self, dir_path):
     """Train all sub-filters on one TrainingCorpus built from dir_path.

     The filters are trained in this order: strong, normal, word.
     """
     shared_corpus = TrainingCorpus(dir_path)
     every_filter = self.strong_filters + self.normal_filters + self.word_filters
     for sub_filter in every_filter:
         sub_filter.train(shared_corpus)

コード例 #11

0

ファイルを表示

ファイル: test_trainingcorpus.py プロジェクト: palomikula/CTU-projects

class TrainingCorpusTest(unittest.TestCase):
    """Tests of TrainingCorpus class lookups and of its spams()/hams() methods."""

    def setUp(self):
        """Prepare fake corpus with !truth.txt file."""
        emails = self.email_dict = create_corpus_dictionary()
        labels = self.true_class = create_classification_for(emails.keys())
        create_corpus_dir_from_dictionary(emails)
        save_classification_to_file(
            labels, fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME))
        with replaced_open():
            self.tc = TrainingCorpus(CORPUS_DIR)

    def tearDown(self):
        """Delete the corpus directory."""
        delete_corpus_directory()

    def test_getClass(self):
        """Test the get_class method."""
        template = 'The expected class of email {} is {}, but {} was observed'
        for email_name, expected in self.true_class.items():
            with replaced_open():
                observed = self.tc.get_class(email_name)
            self.assertEqual(
                expected, observed,
                template.format(email_name, expected, observed))

    def test_isSpam(self):
        """Test the is_spam method."""
        template = 'The email {} spamminess: expected {}, observed {}.'
        for email_name, label in self.true_class.items():
            expected = (label == SPAM_TAG)
            with replaced_open():
                observed = self.tc.is_spam(email_name)
            self.assertEqual(
                expected, observed,
                template.format(email_name, str(expected), str(observed)))

    def test_isHam(self):
        """Test the is_ham method."""
        template = 'The email {} hamminess: expected {}, observed {}.'
        for email_name, label in self.true_class.items():
            expected = (label == HAM_TAG)
            with replaced_open():
                observed = self.tc.is_ham(email_name)
            self.assertEqual(
                expected, observed,
                template.format(email_name, str(expected), str(observed)))

    def _check_listing(self, method_name, tag, wrong_class_msg,
                       wrong_count_msg):
        """Check the (name, contents) pairs produced by one corpus method.

        Every pair must carry the class `tag` and the stored contents, and
        the number of pairs must equal the number of emails with that class.
        """
        observed_count = 0
        with replaced_open():
            for fname, contents in getattr(self.tc, method_name)():
                observed_count += 1
                # Validate results
                self.assertEqual(self.true_class[fname], tag, wrong_class_msg)
                self.assertEqual(
                    self.email_dict[fname], contents,
                    'The read file contents are not equal to the expected contents.'
                )
        expected_count = Counter(self.true_class.values())[tag]
        self.assertEqual(expected_count, observed_count, wrong_count_msg)

    def test_spams(self):
        """Test spams() method."""
        self._check_listing(
            'spams', SPAM_TAG,
            'Non-spam email returned by spams() method.',
            'The spams() method did not return the right number of spams.')

    def test_hams(self):
        """Test hams() method."""
        self._check_listing(
            'hams', HAM_TAG,
            'Spam email returned by hams() method.',
            'The hams() method did not return the right number of hams.')

コード例 #12

0

ファイルを表示

ファイル: test_trainingcorpus.py プロジェクト: EugeneEugene/SpamFilter

class TrainingCorpusTest(unittest.TestCase):
    """Tests of TrainingCorpus class lookups and of its spams()/hams() methods."""

    def setUp(self):
        """Prepare fake corpus with !truth.txt file."""
        emails = self.email_dict = create_corpus_dictionary()
        labels = self.true_class = create_classification_for(emails.keys())
        create_corpus_dir_from_dictionary(emails)
        save_classification_to_file(labels, fname=os.path.join(CORPUS_DIR, TRUTH_FILENAME))
        self.tc = TrainingCorpus(CORPUS_DIR)

    def tearDown(self):
        """Delete the corpus directory."""
        delete_corpus_directory()

    def test_getClass(self):
        """Test the get_class method."""
        template = "The expected class of email {} is {}, but {} was observed"
        for email_name, expected in self.true_class.items():
            observed = self.tc.get_class(email_name)
            self.assertEqual(expected, observed, template.format(email_name, expected, observed))

    def test_isSpam(self):
        """Test the is_spam method."""
        template = "The email {} spamminess: expected {}, observed {}."
        for email_name, label in self.true_class.items():
            expected = label == SPAM_TAG
            observed = self.tc.is_spam(email_name)
            self.assertEqual(expected, observed, template.format(email_name, str(expected), str(observed)))

    def test_isHam(self):
        """Test the is_ham method."""
        template = "The email {} hamminess: expected {}, observed {}."
        for email_name, label in self.true_class.items():
            expected = label == HAM_TAG
            observed = self.tc.is_ham(email_name)
            self.assertEqual(expected, observed, template.format(email_name, str(expected), str(observed)))

    def test_spams(self):
        """Test spams() method."""
        observed_count = 0
        # enumerate() keeps the running count; it stays 0 when nothing is returned
        for observed_count, (fname, contents) in enumerate(self.tc.spams(), start=1):
            # Validate results
            self.assertEqual(self.true_class[fname], SPAM_TAG, "Non-spam email returned by spams() method.")
            self.assertEqual(
                self.email_dict[fname], contents, "The read file contents are not equal to the expected contents."
            )
        expected_count = list(self.true_class.values()).count(SPAM_TAG)
        self.assertEqual(
            expected_count, observed_count, "The spams() method did not return the right number of spams."
        )

    def test_hams(self):
        """Test hams() method."""
        observed_count = 0
        # enumerate() keeps the running count; it stays 0 when nothing is returned
        for observed_count, (fname, contents) in enumerate(self.tc.hams(), start=1):
            # Validate results
            self.assertEqual(self.true_class[fname], HAM_TAG, "Spam email returned by hams() method.")
            self.assertEqual(
                self.email_dict[fname], contents, "The read file contents are not equal to the expected contents."
            )
        expected_count = list(self.true_class.values()).count(HAM_TAG)
        self.assertEqual(
            expected_count, observed_count, "The hams() method did not return the right number of hams."
        )

コード例 #13

0

ファイルを表示

ファイル: filter.py プロジェクト: skalahonza/SpamFilter

 def train(self, directory):
     """Keep the corpus' spams and hams attributes and mark the filter trained.

     The attributes are stored as they are; they are not called here.
     """
     corpus = TrainingCorpus(directory)
     self.spams, self.hams = corpus.spams, corpus.hams
     self.trained = True

コード例 #14

0

ファイルを表示

from basefilter import WordFilter
from trainingcorpus import TrainingCorpus
import inspect
import wordfilters

# Train one instance of every class defined in the wordfilters module on the
# corpus './1' and print the class name with the resulting bayes_val.
c = TrainingCorpus('./1')
for name, obj in inspect.getmembers(wordfilters, inspect.isclass):
    # skip classes that wordfilters only imported from other modules
    if obj.__module__ != "wordfilters":
        continue
    a = obj()
    a.train(c)
    print(name, a.bayes_val)