Code example #1
    def test_training_fromfile(self):
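        """Train FastText from a corpus file written with save_as_line_sentence and check vector shapes, similarity queries, and in-vocab/OOV lookups."""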
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)

            model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
            model.build_vocab(corpus_file=corpus_file)
            self.model_sanity(model)

            model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.iter)
            sims = model.most_similar('graph', topn=10)

            self.assertEqual(model.wv.syn0.shape, (12, 10))
            self.assertEqual(len(model.wv.vocab), 12)
            self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
            self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
            self.model_sanity(model)

            # test querying for "most similar" by vector
            graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
            sims2 = model.most_similar(positive=[graph_vector], topn=11)
            sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
            self.assertEqual(sims, sims2)

            # verify oov-word vector retrieval
            invocab_vec = model['minors']  # invocab word
            self.assertEqual(len(invocab_vec), 10)

            oov_vec = model['minor']  # oov word
            self.assertEqual(len(oov_vec), 10)
Code example #2
def load_file(title, out_path):
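    """For one newspaper title, read each article TSV, clean it, and write the OCR text as a line-sentence corpus file named by title and year."""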
    path = '../../../datasets/newspapers_clean/{}'.format(title)
    print(path)
    allFiles = glob.glob(path + "/articles/*.tsv")
    print(allFiles)

    for f in allFiles:
        df = pd.read_csv(f, delimiter='\t', parse_dates=True)
        df = df.dropna(subset=['ocr'])  # remove lines with empty ocr field

        # remove duplicate header rows
        df = df[~df['date'].str.contains('date')]
        # remove files that contain error msg
        excludes = ['objecttype', 'file directory not found']
        df = df[~df['ocr'].astype(str).str.contains('|'.join(excludes))]

        df['date'] = pd.to_datetime(df['date'])
        year = df['date'].dt.year.iloc[0]  # positional indexing: label 0 may have been dropped by the filters above

        print('making sentences: {}'.format(year))
        df['ocr'] = df['ocr'].apply(lambda x: unidecode.unidecode(x))
        docs = df['ocr'].values
        CORPUS_FILE = (out_path + '/{}_{}.txt'.format(title, year))

        save_as_line_sentence(process_corpus(docs), CORPUS_FILE)
Code example #3
    def test_sg_neg_training_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            model_gensim = FT_gensim(
                size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
                min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
                sorted_vocab=1, workers=1, min_alpha=0.0)

            lee_data = LineSentence(datapath('lee_background.cor'))
            utils.save_as_line_sentence(lee_data, corpus_file)

            model_gensim.build_vocab(corpus_file=corpus_file)
            orig0 = np.copy(model_gensim.wv.vectors[0])
            model_gensim.train(corpus_file=corpus_file,
                               total_words=model_gensim.corpus_total_words,
                               epochs=model_gensim.epochs)
            self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

            sims_gensim = model_gensim.wv.most_similar('night', topn=10)
            sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
            expected_sims_words = [
                u'night.',
                u'night,',
                u'eight',
                u'overnight',
                u'overnight.',
                u'month',
                u'land',
                u'firm',
                u'singles',
                u'death']
            overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
            self.assertGreaterEqual(overlap_count, 2)
Code example #4
    def process_and_save(self):
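        """Read self.input, run the text-cleaning pipeline on it, and save the result as a line-sentence file."""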
        logging.info('Start processing of file.')
        try:
            text = codecs.open(os.path.join(tpath, self.input),
                               'r',
                               encoding='utf-8',
                               errors='ignore').readlines()
            text = remove_punctuation(text)
            text = remove_double_spaces(text)
            text = remove_noisy_digits(text)
            text = remove_dash_and_minus_signs(text)
            text = replace_digits(text)
            text = remove_double_spaces(text)
            text = reduce_numerical_sequences(text)
            text = filter_doc(text)
            text = [removeGermanChainWords(line) for line in text]
            logging.info('Chainword splitting finished')
            text = [remove_hyphens(line) for line in text]
            text = [lemmatizer.lemmatize(line) for line in text]
            logging.info('Lemmatizing finished')
            text = [lowercase(line) for line in text]
            text = [removeUmlauts(line) for line in text]
            text = [harmonizeSpelling(line) for line in text]
            if self.input.endswith('.txt'):
                save_as_line_sentence(text, f'{self.input[:-4]}_processed.txt')
            else:
                save_as_line_sentence(text, f'{self.input}_processed.txt')
            logging.info('Processing finished')

        except FileNotFoundError:
            print(f'File {self.input} was not found.')
Code example #5
    def test_save_as_line_sentence_ru(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')]
        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.smart_open(corpus_file, encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Code example #6
    def test_save_as_line_sentence_ru(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')]
        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.open(corpus_file, 'rb', encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Code example #7
    def test_save_as_line_sentence_en(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('hello world\nhow are you').split('\n')]

        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.smart_open(corpus_file, encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Code example #8
    def test_save_as_line_sentence_en(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('hello world\nhow are you').split('\n')]

        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.open(corpus_file, 'rb', encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Code example #9
    def test_online_learning_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
                temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)
            utils.save_as_line_sentence(new_sentences, new_corpus_file)

            model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
            self.assertEqual(len(model_hs.wv.vocab), 12)
            self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
            model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
            self.assertEqual(len(model_hs.wv.vocab), 14)
            self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
            self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
Code example #10
    def test_persistence_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)

            tmpf = get_tmpfile('gensim_fasttext.tst')
            model = FT_gensim(corpus_file=corpus_file, min_count=1)
            model.save(tmpf)
            self.models_equal(model, FT_gensim.load(tmpf))
            #  test persistence of the KeyedVectors of a model
            wv = model.wv
            wv.save(tmpf)
            loaded_wv = FastTextKeyedVectors.load(tmpf)
            self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
            self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
Code example #11
    def test_online_learning_after_save_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
                temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)
            utils.save_as_line_sentence(new_sentences, new_corpus_file)

            tmpf = get_tmpfile('gensim_fasttext.tst')
            model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
            model_neg.save(tmpf)
            model_neg = FT_gensim.load(tmpf)
            self.assertEqual(len(model_neg.wv.vocab), 12)
            model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
            model_neg.train(corpus_file=new_corpus_file, total_words=model_neg.corpus_total_words,
                            epochs=model_neg.iter)
            self.assertEqual(len(model_neg.wv.vocab), 14)
Code example #12
    def process_and_save(self):
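        """Clean every numbered `_sents.txt` file in self.dirname (skipping files already processed) and save the results to the `_processed` folder."""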
        logging.info(f'Start processing of files from folder {self.dirname}')
        i = 0
        files_total = len(os.listdir(self.dirname))
        logging.info(f'{files_total} files were found.')
        border = max(1, round(files_total / 10))  # avoid modulo by zero when there are fewer than 10 files
        for num in range(1, files_total + 1):
            if not os.path.isfile(
                    os.path.join(f'{self.dirname}_processed',
                                 f'{num}_sents.txt')):
                try:
                    text = codecs.open(os.path.join(self.dirname,
                                                    f'{num}_sents.txt'),
                                       'r',
                                       encoding='utf-8').readlines()
                    # Steps that are applied only when kind == 'BRD' have already been
                    # applied to the Reichstag protocols when extracting them from the
                    # original documents.
                    if self.kind == 'BRD':
                        regex_patterns = bundestag_patterns()
                        text = remove_punctuation(text)
                        text = remove_double_spaces(text)
                        text = extract_protocol_bundestag(
                            text, *regex_patterns)
                    text = remove_linebreaks(text)
                    if self.kind == 'BRD':
                        text = remove_noisy_digits(text)
                        text = remove_dash_and_minus_signs(text)
                    text = replace_digits(text)
                    text = remove_double_spaces(text)
                    text = reduce_numerical_sequences(text)
                    text = filter_doc(text)
                    text = [remove_german_chainwords(line) for line in text]
                    text = [remove_hyphens(line) for line in text]
                    text = [lemmatizer.lemmatize(line) for line in text]
                    text = [lowercase(line) for line in text]
                    text = [remove_umlauts(line) for line in text]
                    text = [
                        harmonizeSpelling(line, spelling_dict) for line in text
                    ]
                    save_as_line_sentence(
                        text, f'{self.dirname}_processed/{num}_sents.txt')
                    i += 1
                    if i % border == 0:
                        logging.info(
                            'Processing {:03.1f} percent finished'.format(
                                int((i / files_total) * 100)))

                except FileNotFoundError:
                    print(f'File {num} was not found.')
Code example #13
def save_lee_corpus_as_line_sentence(corpus_file):
    utils.save_as_line_sentence((doc.words for doc in DocsLeeCorpus()),
                                corpus_file)
Code example #14
def save_lee_corpus_as_line_sentence(corpus_file):
    utils.save_as_line_sentence((doc.words for doc in DocsLeeCorpus()), corpus_file)
Code example #15
            comment_count = 0
            word_count = 0
            for line in text_stream:
                # If under a set number of words, then include next comment
                if word_count < 35000000:
                    line = json.loads(line)
                    post = line['body']
                    sub = line['subreddit']
                    # Uncomment if only reading from specific subreddits
                    #if sub in subs:
                    if post not in remove:
                        # Process post as required for a word2vec corpus
                        processed_post = utils.simple_preprocess(post)
                        comment_size = len(processed_post)
                        # Ensure each comment reaches length threshold
                        if comment_size >= 10:
                            # Increase word and comment counts
                            comment_count += 1
                            word_count += comment_size
                            # Yield the processed comment
                            yield processed_post
                else:
                    break
            # Print comment and word counts upon completion
            print('Number of comments in corpus: {}'.format(comment_count))
            print('Number of total words in corpus: {}'.format(word_count))

corpus = MyCorpus()
# Save corpus as a line_sentence for a word2vec model to be made from
utils.save_as_line_sentence(corpus, r"C:\Users\Eric\Documents\COG 403\Project\Reddit\Data\Comments\RC_2019-09-news.txt")

Code example #16
        directory = r'C:\Users\Eric\Documents\COG 403\Project\Reddit\Data\CNN\cnn\stories'
        story_count = 0
        word_count = 0

        for filename in os.listdir(directory):
            if word_count >= 35000000:
                break
            name = directory + '\\' + filename
            with open(name, encoding='utf-8') as f:
                f.readline()  # discard the first line of the story file
                line = f.readline()
                while line == '\n':  # skip blank lines
                    line = f.readline()
                # Read story text until the first line starting with '@' (or end of file)
                while line and line[0] != '@':
                    processed_line = utils.simple_preprocess(line)
                    word_count += len(processed_line)
                    line = f.readline()
                    while line == '\n':
                        line = f.readline()
                    yield processed_line
            story_count += 1
        print('Number of stories in corpus: {}'.format(story_count))
        print('Number of total words in corpus: {}'.format(word_count))


corpus = MyCorpus()
# Save corpus as a line_sentence for a word2vec model to be made from
utils.save_as_line_sentence(
    corpus,
    r"C:\Users\Eric\Documents\COG 403\Project\Reddit\Data\CNN\cnn_corpus.txt")