def test(self, directory):
    c = Corpus(directory)
    with open(path.join(directory, "!prediction.txt"), 'w', encoding="utf-8") as output:
        for (filename, wordlist) in c.wordlists():
            prob_spam = self.classifier.prob_list(wordlist)
            is_spam = prob_spam >= self.analyser.p_treshold
            tag = "SPAM" if is_spam else "OK"
            output.write(filename + " " + tag + "\n")
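# A minimal, self-contained sketch of the "!prediction.txt" format produced above,
# assuming Corpus.wordlists() yields (filename, word_list) pairs; write_predictions
# is an illustrative name, not part of the project's API.
from os import path

def write_predictions(directory, predictions):
    """predictions: iterable of (filename, is_spam) pairs."""
    with open(path.join(directory, "!prediction.txt"), 'w', encoding="utf-8") as output:
        for filename, is_spam in predictions:
            output.write(f"{filename} {'SPAM' if is_spam else 'OK'}\n")

# Example: write_predictions("test_mails", [("mail1", True), ("mail2", False)])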
def label_chunk_file(file_to_label: Path, output_file: Path):
    r""" Perform NER on file \p file_to_label and write to \p output_file """
    label_corpus = Corpus(file_to_label)
    label_corpus.fit_features()
    data_path = file_to_label.with_suffix(".dat_chunk")
    label_corpus.export(data_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    label_with_maxent(data_path=data_path, model_path=MODEL_PATH, output_file=output_file)
def test(self, mails_path):
    prediction_path = os.path.join(mails_path, "!prediction.txt")
    try:
        os.remove(prediction_path)
    except OSError:
        pass  # no previous prediction file to remove
    corpus = Corpus(mails_path)
    with open(prediction_path, 'a', encoding='utf-8') as f:
        for mail in corpus.emails():
            res = self.evaluate_mail(mail[1])
            f.write(f"{mail[0]} {self.pos_tag if res else self.neg_tag}\n")
def resume_segmentation(iterations=10):
    logger.debug('Resume segmentation')
    corpus = Corpus(Q=opt.gmm, subaction=opt.subaction)
    for iteration in range(iterations):
        logger.debug('Iteration %d' % iteration)
        corpus.iter = iteration
        corpus.resume_segmentation()
        corpus.accuracy_corpus()
    corpus.accuracy_corpus()
def build_model(train_path: Path):
    r""" Construct the learner model """
    train_corpus = Corpus(train_path)
    train_corpus.fit_features()
    data_path = train_path.with_suffix(".dat_name")
    train_corpus.export(data_path)
    if MODEL_PATH.exists():
        MODEL_PATH.unlink()  # Delete the existing model
    MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)
    train_maxent_model(data_path=data_path, model_path=MODEL_PATH)
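# A hedged end-to-end sketch tying build_model and label_chunk_file together:
# train the MaxEnt model, then label a new file with it. The paths below are
# illustrative, and this assumes the two helpers belong to the same pipeline
# (the differing suffixes .dat_name vs .dat_chunk suggest they may come from
# separate name/chunk task variants sharing MODEL_PATH).
from pathlib import Path

if __name__ == "__main__":
    build_model(Path("data/train.txt"))
    label_chunk_file(Path("data/unlabeled.txt"), Path("out/labeled.txt"))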
def __init__(self, folder):
    self.folder = folder
    self.spams = []
    self.hams = []
    corp = Corpus(folder)
    for fname, content in corp.emails():
        if self.is_ham(fname):
            self.hams.append(Email(fname, content))
        else:
            self.spams.append(Email(fname, content))
def load_quora():
    QUORA_PATH = '/home/mgimenez/Dev/corpora/Quora/quora_duplicate_questions.tsv'
    dataset = Corpus('quora', QUORA_PATH)
    (train_non_sim, train_sim, dev_non_sim, dev_sim,
     test_non_sim, test_sim,
     vocab_processor, seq_len) = dataset.make_partitions_quora()
    return (train_non_sim, train_sim, dev_non_sim, dev_sim,
            test_non_sim, test_sim, vocab_processor, seq_len)
def test_tok1_interro():
    """ Check that indexes can be shown """
    corpus = Corpus(tok_path)
    res = corpus.interrogate(show=['s', 'i', 'l'])
    sortd = res.results[sorted(res.results.columns)]
    three = ['0/0/corpus', '0/0/this', '0/1/linguistics']
    assert_equals(list(sortd.columns)[:3], three)
    assert_equals(sortd.sum().sum(), 77)
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    self.most_links = (None, 0)  # keeps track of the page with the most valid out-links
    self.traps = set()
    # the next 3 attributes are used to compare consecutive links for trap detection
    self.old_link = None
    self.old_path = None
    self.old_query = None
def main():
    QAfile = sys.argv[1]
    ReviewFile = sys.argv[2]
    minReview = int(sys.argv[3])
    k = int(sys.argv[4])
    numiter = int(sys.argv[5])
    Lambda = float(sys.argv[6])
    predictionsOut = sys.argv[7]
    rankingOut = sys.argv[8]

    corpus = Corpus(QAfile, ReviewFile, minReview)
    corpus.construct_QAnswersAndQPerItem()
    corpus.construct_SentencesAndSPerItem()
    corpus.Calculate_PairWiseFeature()
    print("Vocabulary Size: " + str(corpus.Map.V))
    print("Number of Questions: " + str(len(corpus.QAnswers)))
    print("Number of Reviews: " + str(len(corpus.Sentences)))
    print("Number of Items: " + str(len(corpus.Map.ItemIDMap)))
    print("Avg review length: " + str(sum(corpus.Avgdl.values()) / len(corpus.Avgdl)))

    model = Model(k, numiter, Lambda, corpus)
    sess = model.train_model()
    print("\nModel is trained and optimal model loaded!\n")

    valid_accuracy, test_accuracy, topRanked = model.valid_test_perf(sess)
    if predictionsOut:
        model.save_predictions(topRanked, predictionsOut)
    if rankingOut:
        topRanked = model.top_ranked(sess, 10)
        model.save_top_ranked(topRanked, rankingOut)
    print("Predictions are saved\n")

    valid_AUC, test_AUC = model.AUC(sess)
    print("-----------------------------------------------")
    print("----------------------------------------------\n")
    print("Accuracy: ")
    print("\tValidation: " + str(valid_accuracy))
    print("\tTest: " + str(test_accuracy))
    print("\n")
    print("AUC: ")
    print("\tValidation: " + str(valid_AUC))
    print("\tTest: " + str(test_AUC))
    print("\n")
    print("-----------------------------------------------")
    print("----------------------------------------------\n")
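# A hedged argparse equivalent of the positional sys.argv parsing in main() above;
# the argument names and help strings are illustrative, not from the source.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Train and evaluate the QA/review ranking model.")
    parser.add_argument("qa_file", help="path to the QA file")
    parser.add_argument("review_file", help="path to the review file")
    parser.add_argument("min_review", type=int, help="minimum reviews per item")
    parser.add_argument("k", type=int, help="number of latent factors")
    parser.add_argument("num_iter", type=int, help="number of training iterations")
    parser.add_argument("lam", type=float, help="regularization weight (Lambda)")
    parser.add_argument("predictions_out", help="where to save predictions")
    parser.add_argument("ranking_out", help="where to save rankings")
    return parser.parse_args()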
def parse(self, corenlppath=False, operations=False, copula_head=True,
          speaker_segmentation=False, memory_mb=False, *args, **kwargs):
    """
    Parse an unparsed corpus, saving to disk

    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str

    :param operations: which kinds of annotations to do
    :type operations: str

    :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
    :type speaker_segmentation: bool

    :param memory_mb: amount of memory in MB for the parser
    :type memory_mb: int

    :param copula_head: make copula head in dependency parse
    :type copula_head: bool

    :Example:

    >>> parsed = corpus.parse(speaker_segmentation=True)
    >>> parsed
    <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>

    :returns: The newly created :class:`corpkit.corpus.Corpus`
    """
    from make import make_corpus
    from corpus import Corpus
    #from process import determine_datatype
    #dtype, singlefile = determine_datatype(self.path)
    if self.datatype != 'plaintext':
        raise ValueError('parse method can only be used on plaintext corpora.')
    kwargs.pop('parse', None)
    kwargs.pop('tokenise', None)
    return Corpus(
        make_corpus(self.path,
                    parse=True,
                    tokenise=False,
                    corenlppath=corenlppath,
                    operations=operations,
                    copula_head=copula_head,
                    speaker_segmentation=speaker_segmentation,
                    memory_mb=memory_mb,
                    *args, **kwargs))
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    # variables needed for analytics calculation
    self.traps = set()
    self.downloads = set()
    self.valid_links_count = 0
    self.url_with_most_valid_links = ''
    self.current_highest_valid_link_count = -1
    self.subdomains_and_frequencies = defaultdict(int)
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    self.mostValidOut = 0
    self.mostValidOutURL = ""
    self.listTraps = []
    self.subdomainCount = {}
    self.prev_query = []
    self.param = ""
    self.count = 0
    self.urldeque = deque(maxlen=10)
def load_ibm():
    """ Load the train, dev, and test datasets """
    IBM_PATH = '/home/mgimenez/Dev/corpora/Quora/IBM'
    TRAIN_PATH = join(IBM_PATH, 'train.tsv')
    train = Corpus('ibm', TRAIN_PATH)
    DEV_PATH = join(IBM_PATH, 'dev.tsv')
    dev = Corpus('ibm', DEV_PATH)
    TEST_PATH = join(IBM_PATH, 'test.tsv')
    test = Corpus('ibm', TEST_PATH)
    vocab_processor, seq_len = build_vocabulary(train.sim_data, train.non_sim_data)
    train.to_index(vocab_processor)
    dev.to_index(vocab_processor)
    test.to_index(vocab_processor)
    # NB: the test data is returned in (sim, non_sim) order, unlike train and dev
    return (train.non_sim_data, train.sim_data,
            dev.non_sim_data, dev.sim_data,
            test.sim_data, test.non_sim_data,
            vocab_processor, seq_len)
def generateData():
    rep = Representor(None, 'citybeat', 'next_week_candidate_event_25by25_merged')
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'next_week_candidate_event_25by25_merged')
    true_event_list, false_event_list = loadNextWeekData()
    EventFeatureTwitter(None).GenerateArffFileHeader()
    for event in true_event_list + false_event_list:
        EventFeatureTwitter(event, corpus, rep).printFeatures()
def __init__(self, path):
    parsemeCorpus = Corpus(path)
    self.train_sents = None
    self.dev_sents = None
    self.test_sents = None
    if parsemeCorpus.sentences:
        self.train_sents = parsemeCorpus.sentences
    if parsemeCorpus.devSents:
        self.dev_sents = parsemeCorpus.devSents
    if parsemeCorpus.testSents:
        self.test_sents = parsemeCorpus.testSents
def next(self):
    import re  # moved out of the inner loop below
    urls = []
    searchURL = GoogleNewsURLCrawler._getNextSearchURL(
        self.queryExpression, self.pageCount, self.pageSizeToRetreive)
    self.pageCount += self.pageSizeToRetreive
    GoogleNewsURLCrawler.chromeDriver.get(searchURL)
    searchResults = GoogleNewsURLCrawler.chromeDriver.find_elements_by_class_name('_hJs')
    for searchResult in searchResults:
        try:
            link = searchResult.find_element_by_tag_name('h3') \
                               .find_element_by_tag_name('a').get_attribute('href')
            date = searchResult.find_element_by_class_name('slp') \
                               .find_element_by_class_name('_QHs').get_attribute('innerText')
            if date.find('ago') != -1:
                date = GoogleNewsURLCrawler._GetToday()
            urls.append((link, date))
        except Exception as errorMessage:
            print(errorMessage)
            continue
    corpora = []
    for url in urls:
        try:
            GoogleNewsURLCrawler.chromeDriver.get(url[0])
            terms = ''
            try:
                body = GoogleNewsURLCrawler.chromeDriver.find_element_by_tag_name('body')
                for element in body.find_elements_by_xpath(".//*"):
                    if (element.tag_name != 'style') and (element.tag_name != 'script'):
                        innerText = ''
                        try:
                            innerText = element.get_attribute('innerText')
                            if innerText is not None:
                                tokens = [t for t in re.split(r'\s+|\t+|\n+|,|:|;|\.', innerText) if t]
                                for token in tokens:
                                    terms += (token + ' ')
                        except Exception as ex:
                            print('exception: ' + str(ex) + ' exception text: ' + innerText)
                            print('\n')
                            print('\n')
                            continue
            except Exception:
                continue
            corpora.append(Corpus(self.keyword, url[1], url[0], terms))
        except Exception:
            continue
    return corpora
def __init__(self):
    """ Initialize an instance of LDA. """
    # corpus of the LDA model
    self.corpus = Corpus()
    # the distribution of topics over terms
    self.topic_distribution_over_term = None
    # the distribution of documents over topics
    self.document_distribution_over_topic = None
    # Gibbs sampler
    self.gibbs_sampler = None
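# A minimal sketch of one collapsed-Gibbs-sampling pass for LDA, illustrating what
# self.gibbs_sampler might do; this is the textbook update, not the project's
# implementation. `docs` is a list of lists of term ids, `z` the current topic
# assignments, and the n_* arrays the usual count matrices.
import numpy as np

def gibbs_pass(docs, z, n_dk, n_kw, n_k, alpha, beta, rng):
    V = n_kw.shape[1]  # vocabulary size
    K = n_kw.shape[0]  # number of topics
    for d, doc in enumerate(docs):
        for i, w in enumerate(doc):
            k = z[d][i]
            # remove the current assignment from the counts
            n_dk[d, k] -= 1; n_kw[k, w] -= 1; n_k[k] -= 1
            # full conditional: p(k | rest) proportional to (n_dk + alpha)(n_kw + beta)/(n_k + V*beta)
            p = (n_dk[d] + alpha) * (n_kw[:, w] + beta) / (n_k + V * beta)
            k = rng.choice(K, p=p / p.sum())
            # record the new assignment and restore the counts
            z[d][i] = k
            n_dk[d, k] += 1; n_kw[k, w] += 1; n_k[k] += 1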
def main():
    args = get_args()
    corpus = Corpus("text8", args.gram_min, args.gram_max, args.part == "part")
    subword_embeddings, _ = train_fasttext(corpus,
                                           ns_num=args.ns,
                                           window_size=5,
                                           dimension=100,
                                           learning_rate=0.01,
                                           epoch=1,
                                           subsampling=True)
    find_similar_words(corpus, subword_embeddings, args.gram_min, args.gram_max)
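# A hedged sketch of fastText-style lookups, assuming subword_embeddings maps each
# character n-gram to a vector and a word's vector is the sum of its n-gram vectors;
# the real find_similar_words above may work differently.
import numpy as np

def char_ngrams(word, gram_min, gram_max):
    padded = '<' + word + '>'  # fastText pads words with boundary markers
    return [padded[i:i + n]
            for n in range(gram_min, gram_max + 1)
            for i in range(len(padded) - n + 1)]

def word_vector(word, subword_embeddings, gram_min, gram_max):
    vecs = [subword_embeddings[g]
            for g in char_ngrams(word, gram_min, gram_max)
            if g in subword_embeddings]
    return np.sum(vecs, axis=0) if vecs else None

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))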
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    self.subdomains = {"ics.uci.edu": 0}
    self.downloaded_urls = []
    self.out_links = {}
    self.traps = []
    self.checked = {}
    self.threshold = 1000
def train(self, dot_path):
    print("Begin train...")
    corpus = Corpus()
    corpus_data = corpus.load_corpus(dot_path)
    for data in corpus_data:
        for conversation in data:  # each `conversation` is one dialogue session
            statement_history = []
            for text in conversation:  # take sentences from the training dialogue in order
                if statement_history:
                    self.storage.add(statement_history[-1], text.encode('utf-8'))
                statement_history.append(text.encode('utf-8'))  # append the current statement to the history
    print("End of train!")
def main():
    # Command-line argument parsing
    default_dir = 'sample-data'
    directory = argv[1] if len(argv) == 2 else default_dir
    if not all([len(argv) <= 2, os.path.isdir(directory)]):
        print("""Usage : python {} <directory>

    <directory> : path to directory of annotation XMLs""".format(argv[0]))
        exit()
    corpus = Corpus(directory)

    # Set up accumulators
    annotator_ids = set()
    document_ids = set()
    extents = set()
    tag_types = {'NONE'}

    # First pass over the corpus to accumulate IDs/labels
    for document in corpus:
        document_id, annotator_id, phase = parse_name(document)
        document_ids.add(document_id)
        annotator_ids.add(annotator_id)
        tag_types.update(document.extent_types)
        for tag in document.consuming_tags():
            extents.add(get_extent(document_id, tag))

    # Index all the things
    annotators, labels, subjects = map(index, (annotator_ids, tag_types, extents))

    # Set up a numpy array to store the data
    shape = (len(subjects), len(labels))
    data = numpy.zeros(shape, dtype=int)
    extents = dict.fromkeys(extents)

    # Second pass over the corpus to populate the extents dictionary
    for document in corpus:
        document_id, annotator_id, phase = parse_name(document)
        for tag in document.consuming_tags():
            extent = get_extent(document_id, tag)
            if not extents[extent]:
                extents[extent] = ['NONE'] * len(annotators)
            annotator = annotators[annotator_id]
            extents[extent][annotator] = tag.tag

    # Final pass over the extents dictionary to populate the data matrix
    for subject, annotations in extents.items():
        for label in annotations:
            row, column = subjects[subject], labels[label]
            data[row, column] += 1

    print("Fleiss's kappa : {}".format(fleiss.kappa(data)))
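# A minimal sketch of Fleiss's kappa over the (subjects x labels) count matrix
# built above, in case the `fleiss` helper is unavailable; this is the standard
# formula and assumes every row sums to the same number of ratings n.
import numpy as np

def fleiss_kappa(counts):
    counts = np.asarray(counts, dtype=float)
    N = counts.shape[0]                  # number of subjects
    n = counts.sum(axis=1)[0]            # ratings per subject
    p_j = counts.sum(axis=0) / (N * n)   # marginal label proportions
    P_i = (np.square(counts).sum(axis=1) - n) / (n * (n - 1))  # per-subject agreement
    P_bar = P_i.mean()                   # mean observed agreement
    P_e = np.square(p_j).sum()           # chance agreement
    return (P_bar - P_e) / (1 - P_e)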
def train(self, train_corpus_dir):
    """
    Train on the given email dataset
    :param train_corpus_dir:
    """
    self.truth = utils.read_classification_from_file(train_corpus_dir)  # load truth
    train_corpus = Corpus(train_corpus_dir)
    self.get_SPAM_percentage(train_corpus)  # not in use now
    for val in email_keys:  # get values for parts of the email header
        self.classify_part(train_corpus, val.lower())
    self.classify_payload(train_corpus)  # get values for the email payload/body
def evaluate(self, corpus_path):
    test_corpus = Corpus(corpus_path)
    test_set = [{
        'x': list(self.__char_processor.transform(item['text']))[0],
        'y': list(self.__tag_processor.transform(item['tag']))[0],
        'length': item['length'],
    } for item in test_corpus.items()]
    result = self.__estimator.evaluate(
        input_fn=lambda: self.__input_fn(test_set),
    )
    print('Test: %d' % len(test_corpus))
    print(result)
def addChannel(self, paths):
    c = Corpus()
    for path in paths:
        print(path)
        with open(path) as f:
            text = f.read()
        name = path.split('/')[-1]
        d = Document(name, text)
        c.add_document(d)
    self.sourceboard.addChannel(c)
    self.refresh()
    self.frame.fSizer.Layout()
    self.frame.Fit()
def document_check():
    """ Check that the document lazy attribute works """
    corpus = Corpus(speak_path)
    df = corpus[0][0].document
    fir = ['This', 'this', 'DT', 'O', 3, 'det', '0', '1']
    assert_equals(list(df.ix[1, 1]), fir)
    kys = list(df._metadata[5].keys())
    lst = ['year', 'test', 'parse', 'speaker', 'num']
    assert_equals(kys, lst)
    assert_equals(corpus.files, None)
    assert_equals(corpus.datatype, 'conll')
def __init__(self, flavor, user_configs):
    ##########################################################################
    # define directories
    self.runs_dir = load_rnnlabrc('runs_dir')
    self.log_path = os.path.abspath(
        os.path.join(os.path.expanduser('~'), 'rnnlab_log.csv'))
    ##########################################################################
    # make configs dict
    self.configs_dict = self.make_configs_dict(user_configs, flavor)
    ##########################################################################
    # calc num_epochs
    self.num_reps = int(self.configs_dict['num_reps'])
    self.num_iterations = int(self.configs_dict['num_iterations'])
    self.num_epochs = int(self.num_reps / self.num_iterations)
    if not self.num_reps % self.num_iterations == 0:
        sys.exit('rnnlab: "num_reps" must be divisible by "num_iterations"')
    print('Num reps: {} / Num iterations: {} --> Num epochs : {}'.format(
        self.num_reps, self.num_iterations, self.num_epochs))
    ##########################################################################
    # make corpus
    corpus_kwargs = {
        key: self.configs_dict[key]
        for key in ['corpus_name', 'vocab_file_name', 'freq_cutoff',
                    'probes_name', 'mb_size', 'bptt_steps',
                    'num_mbs_in_doc', 'block_order']
    }
    corpus_kwargs['num_epochs'] = self.num_epochs
    self.corpus = Corpus(**corpus_kwargs)
    ##########################################################################
    # create rnn_graph
    num_input_units = len(self.corpus.token_list)
    self.rnn_graph = create_rnn_graph(num_input_units, self.configs_dict)
    ##########################################################################
    # assign instance variables
    self.model_name = self.configs_dict['model_name']
    self.block_order = str(self.configs_dict['block_order'])
    self.n_data = int(self.configs_dict['n_data'])
    self.mb_size = int(self.configs_dict['mb_size'])
    self.num_hidden_units = int(self.configs_dict['num_hidden_units'])
    self.bptt_steps = int(self.configs_dict['bptt_steps'])
    self.num_ba_samples = int(self.configs_dict['num_ba_samples'])
    self.probes_ba_list = []
    ##########################################################################
    # calc instance variables
    self.stop_mb = (self.corpus.num_train_doc_ids * self.num_reps
                    * self.corpus.num_mbs_in_doc)
    print('Stop minibatch: {:,}'.format(self.stop_mb))
    mb_n = int(self.stop_mb / self.n_data)
    self.data_mbs = np.arange(mb_n, (mb_n * self.n_data) + mb_n, mb_n)
    print('self.data_mbs :', self.data_mbs)
def applySingleRules(IDsFilename):
    """
    Originally meant to apply just one rule; now used to apply one feature at a
    time to the given corpus, showing how often each feature occurs in ironic
    and regular reviews.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, file=IDsFilename))
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")
    # dataSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")
    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, dataSet.reviews)
    showFeatureOccurrence(features, featureVectors)
    gold = dataSet.goldStandard
    # decisiveFeatureNames = ["Scare quotes",
    #                         "Positive star polarity discrepancy",
    #                         "Negative star polarity discrepancy",
    #                         "Positive Ppunctuation",
    #                         "Negative Ppunctuation",
    #                         "Streak of Positive Words",
    #                         "Ellipsis and Punctuation",
    #                         "Emoticon Happy", "Emoticon Laughing",
    #                         "Emoticon Winking", "Emotion Tongue",
    #                         "LoLAcroym", "GrinAcronym", "Onomatopoeia",
    #                         "Interrobang"]
    decisiveFeatureNames = [f.name for f in features]
    for d in decisiveFeatureNames:
        classification = classify(features, featureVectors, [d])
        targets = []
        cls = []
        for ID, g in gold.items():
            targets.append(g)
            cls.append(classification[ID])
        print("\nClassifying by rule: ", d)
        showPerformance(targets, cls)
def test(self, test_corpus_dir):
    '''
    Creates a dict of classifications and writes it to file
    :param test_corpus_dir: path to test dir
    :return: None
    '''
    c = Corpus(test_corpus_dir)
    class_dict = {}
    # Iterate over emails with the generator in Corpus
    for email in c.emails():
        # Get word statistics of the email: word frequency and word count
        word_freq, word_count = self.get_word_count_for_mail(email[1])
        # Compute the spaminess of each known word
        spaminesses = []
        for word in word_freq:
            s = self.get_spaminnes_of_word(word)
            if s is not None:
                spaminesses.append(s)
        # Combine spaminesses: S = P / (P + prod(1 - p)), guarding against
        # division by zero
        product = self.prod(spaminesses)
        complement = self.one_without_spaminesses(spaminesses)
        lower = product + complement
        if lower != 0:
            overall_spaminess = product / lower
        else:
            overall_spaminess = 0
        # Final decision
        if overall_spaminess >= 0.5:
            class_dict[email[0]] = "SPAM"
        else:
            class_dict[email[0]] = "OK"
    # Create the !prediction.txt file
    utils.write_classification_to_file(
        test_corpus_dir + "/!prediction.txt", class_dict)
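# Hedged sketch of the two helpers used above but not shown in the source (they are
# methods on the class; free functions here for illustration): `prod` multiplies
# the word spaminesses and `one_without_spaminesses` multiplies their complements,
# giving the Graham-style combination S = P / (P + prod(1 - p)).
from functools import reduce
import operator

def prod(values):
    return reduce(operator.mul, values, 1.0)

def one_without_spaminesses(values):
    return reduce(operator.mul, (1.0 - v for v in values), 1.0)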
def test(self, email_adress):
    global all_words, spam_words, probability_spam, count_spams, count_emails
    fnames_with_body = Corpus(email_adress).emails()
    # Untrained case: fall back to a crude HTML heuristic
    if probability_spam == 0:
        html_words = ['<html>', '<p>', '</a>', '<br>', '<head>', '<meta>', '<title>', '<body>']
        with open(email_adress + '/!prediction.txt', 'w', encoding="utf-8") as f:
            for fname in fnames_with_body:
                for word in html_words:
                    if word in fname[1]:  # an HTML tag in the email's body -> it's SPAM!
                        f.write(fname[0] + ' SPAM\n')
                        break
                else:
                    f.write(fname[0] + ' OK\n')  # else it's probably ham
    # Trained case
    else:
        with open(email_adress + '/!prediction.txt', 'w', encoding="utf-8") as f:
            for fname in fnames_with_body:
                email_words = TrainingCorpus.get_words_from_email(fname[1])
                probability_spam_words = []
                for word in email_words:
                    # skip empty words and words we know nothing about
                    if (word not in all_words) or (word == ''):
                        continue
                    if word not in spam_words:
                        probability_spam_word = 0
                    else:
                        # Bayes' theorem: the probability that the email is spam,
                        # given that it contains this word
                        probability_spam_word = (
                            (spam_words[word] / count_spams * probability_spam)
                            / (all_words[word] / count_emails))
                    probability_spam_words.append(probability_spam_word)
                # Final probability that the email is spam (guard against emails
                # with no known words)
                if probability_spam_words:
                    probability_spam_email = sum(probability_spam_words) / len(probability_spam_words) * 100
                else:
                    probability_spam_email = 0
                if probability_spam_email > 70:
                    f.write(fname[0] + ' SPAM\n')
                else:
                    f.write(fname[0] + ' OK\n')