Exemple #1
0
    def test_training(self):
        """Train on ``sentences`` and check shapes, similarity queries and word lookups.

        Also verifies that constructing the model directly from the corpus
        yields a model equal to the explicit build_vocab + train sequence.
        """
        dim = 10
        params = dict(size=dim, min_count=1, hs=1, negative=0, seed=42, workers=1)

        # Explicit two-step path: vocabulary first, then training.
        model = FT_gensim(**params)
        model.build_vocab(sentences)
        self.model_sanity(model)

        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)

        self.assertEqual(model.wv.syn0.shape, (12, dim))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.syn0_vocab.shape[1], dim)
        self.assertEqual(model.wv.syn0_ngrams.shape[1], dim)
        self.model_sanity(model)

        # Querying by the normalised 'graph' vector must match querying by word,
        # once 'graph' itself is dropped from the results.
        graph_index = model.wv.vocab['graph'].index
        graph_vector = model.wv.syn0norm[graph_index]
        by_vector = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(word, score) for word, score in by_vector if word != 'graph']
        self.assertEqual(sims, sims2)

        # One-step construction (vocab + training in the constructor) must match.
        model2 = FT_gensim(sentences, **params)
        self.models_equal(model, model2)

        # Both an in-vocabulary word and an out-of-vocabulary word yield a vector.
        invocab_vec = model['minors']
        self.assertEqual(len(invocab_vec), dim)

        oov_vec = model['minor']
        self.assertEqual(len(oov_vec), dim)
Exemple #2
0
    def test_training_fromfile(self):
        """Train from a line-sentence corpus file and check shapes, similarity queries and lookups."""
        dim = 10
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            # Dump the in-memory corpus to disk so the corpus_file code path is used.
            utils.save_as_line_sentence(sentences, corpus_file)

            model = FT_gensim(size=dim, min_count=1, hs=1, negative=0, seed=42, workers=1)
            model.build_vocab(corpus_file=corpus_file)
            self.model_sanity(model)

            model.train(
                corpus_file=corpus_file,
                total_words=model.corpus_total_words,
                epochs=model.iter,
            )
            sims = model.most_similar('graph', topn=10)

            self.assertEqual(model.wv.syn0.shape, (12, dim))
            self.assertEqual(len(model.wv.vocab), 12)
            self.assertEqual(model.wv.syn0_vocab.shape[1], dim)
            self.assertEqual(model.wv.syn0_ngrams.shape[1], dim)
            self.model_sanity(model)

            # Querying by the normalised 'graph' vector must match querying by word,
            # once 'graph' itself is dropped from the results.
            graph_index = model.wv.vocab['graph'].index
            graph_vector = model.wv.syn0norm[graph_index]
            by_vector = model.most_similar(positive=[graph_vector], topn=11)
            sims2 = [(word, score) for word, score in by_vector if word != 'graph']
            self.assertEqual(sims, sims2)

            # Both an in-vocabulary word and an out-of-vocabulary word yield a vector.
            invocab_vec = model['minors']
            self.assertEqual(len(invocab_vec), dim)

            oov_vec = model['minor']
            self.assertEqual(len(oov_vec), dim)
Exemple #3
0
 def test_get_vocab_word_vecs(self):
     """Check that ``get_vocab_word_vecs`` leaves ``syn0_vocab`` unchanged after ``build_vocab``."""
     model = FT_gensim(size=10, min_count=1, seed=42)
     model.build_vocab(sentences)

     # Snapshot the matrix before the call so any in-place change is detected.
     snapshot = np.copy(model.wv.syn0_vocab)
     model.trainables.get_vocab_word_vecs(model.wv)

     elementwise_same = np.equal(model.wv.syn0_vocab, snapshot)
     self.assertTrue(np.all(elementwise_same))
Exemple #4
0
    def test_sg_neg_training_fromfile(self):
        """Train with sg=1 and negative sampling from a corpus file.

        Verifies that the first word vector changes during training and that
        the nearest neighbours of 'night' share at least two words with a
        reference list.
        """
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            hyperparams = dict(
                size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
                min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
                min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0,
            )
            model_gensim = FT_gensim(**hyperparams)

            # Materialise the Lee corpus in line-sentence format at corpus_file.
            lee_data = LineSentence(datapath('lee_background.cor'))
            utils.save_as_line_sentence(lee_data, corpus_file)

            model_gensim.build_vocab(corpus_file=corpus_file)
            vector_before = np.copy(model_gensim.wv.vectors[0])
            model_gensim.train(
                corpus_file=corpus_file,
                total_words=model_gensim.corpus_total_words,
                epochs=model_gensim.epochs,
            )
            # Training must have moved the vector away from its initial value.
            still_identical = (vector_before == model_gensim.wv.vectors[0]).all()
            self.assertFalse(still_identical)

            neighbours = model_gensim.wv.most_similar('night', topn=10)
            neighbour_words = {word for word, _ in neighbours}
            expected_sims_words = [
                u'night.',
                u'night,',
                u'eight',
                u'overnight',
                u'overnight.',
                u'month',
                u'land',
                u'firm',
                u'singles',
                u'death',
            ]
            shared = neighbour_words & set(expected_sims_words)
            self.assertGreaterEqual(len(shared), 2)
Exemple #5
0
    def test_cbow_neg_training(self):
        """Train with sg=0 and negative sampling on the Lee corpus.

        Verifies that the first word vector changes during training and that
        the nearest neighbours of 'night' share at least two words with a
        reference list.
        """
        hyperparams = dict(
            size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
            min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0,
        )
        model_gensim = FT_gensim(**hyperparams)

        lee_data = LineSentence(datapath('lee_background.cor'))
        model_gensim.build_vocab(lee_data)
        vector_before = np.copy(model_gensim.wv.vectors[0])
        model_gensim.train(
            lee_data,
            total_examples=model_gensim.corpus_count,
            epochs=model_gensim.epochs,
        )
        # Training must have moved the vector away from its initial value.
        still_identical = (vector_before == model_gensim.wv.vectors[0]).all()
        self.assertFalse(still_identical)

        neighbours = model_gensim.wv.most_similar('night', topn=10)
        neighbour_words = {word for word, _ in neighbours}
        expected_sims_words = [
            u'night.',
            u'night,',
            u'eight',
            u'fight',
            u'month',
            u'hearings',
            u'Washington',
            u'remains',
            u'overnight',
            u'running',
        ]
        shared = neighbour_words & set(expected_sims_words)
        self.assertGreaterEqual(len(shared), 2)
Exemple #6
0
def trainVectors(corpus, implementation, dim=300, min_n=3, max_n=6, min_count=1, model='skipgram', epochs=5, threads=12, window=5, lr=0.05, t=1e-4, neg=5):
    """Train word vectors on a line-sentence corpus with FastText or Word2Vec.

    Args:
        corpus: Path handed to ``LineSentence`` (one sentence per line).
        implementation: ``'fasttext'`` or ``'w2v'``.
        dim: Vector dimensionality.
        min_n, max_n: Character n-gram length bounds (FastText only).
        min_count: Minimum word frequency kept in the vocabulary.
        model: ``'skipgram'`` selects ``sg=1``; any other value selects
            ``sg=0`` (FastText only).
        epochs: Number of training epochs.
        threads: Number of worker threads.
        window: Context window size.
        lr: Initial learning rate (FastText only).
        t: Sub-sampling threshold (FastText only).
        neg: Number of negative samples (FastText only).

    Returns:
        The trained gensim model.

    Raises:
        ValueError: If ``implementation`` is not one of the supported values.
    """
    # Validate first: previously an unknown value fell through both branches
    # and surfaced as an UnboundLocalError at the return statement.
    if implementation not in ('fasttext', 'w2v'):
        raise ValueError(
            "implementation must be 'fasttext' or 'w2v', got %r" % (implementation,))

    train_data = LineSentence(corpus)

    if implementation == 'fasttext':
        # These hyper-parameters belong to the constructor. They used to be
        # passed to train(), which is not where gensim configures them, so
        # they did not take effect.
        model_gensim = FT_gensim(
            size=dim,
            min_n=min_n,
            max_n=max_n,
            min_count=min_count,
            iter=epochs,
            window=window,
            sg=1 if model == 'skipgram' else 0,
            workers=threads,
            alpha=lr,
            sample=t,
            negative=neg,
        )
    else:
        model_gensim = Word2Vec(
            size=dim, min_count=min_count, iter=epochs, window=window, workers=threads)

    model_gensim.build_vocab(train_data)
    model_gensim.train(
        train_data,
        total_examples=model_gensim.corpus_count,
        epochs=model_gensim.iter,
    )
    return model_gensim
Exemple #7
0
 def test_persistence_word2vec_format(self):
     """Round-trip the word vectors through the binary word2vec format."""
     path = get_tmpfile('gensim_fasttext_w2v_format.tst')
     model = FT_gensim(sentences, min_count=1, size=10)
     model.wv.save_word2vec_format(path, binary=True)

     reloaded = Word2VecKeyedVectors.load_word2vec_format(path, binary=True)

     # Same vocabulary size, and the vector for 'human' matches after the round-trip.
     self.assertEqual(len(model.wv.vocab), len(reloaded.vocab))
     vectors_match = np.allclose(model['human'], reloaded['human'])
     self.assertTrue(vectors_match)
Exemple #8
0
 def test_online_learning(self):
     """Check vocabulary size and word counts before and after an online vocab update.

     The comparisons use ``assertEqual``: ``assertTrue(value, expected)``
     treats ``expected`` as the failure message and passes for any truthy
     ``value``, so it never compared the two.
     """
     model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
     self.assertEqual(len(model_hs.wv.vocab), 12)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

     model_hs.build_vocab(new_sentences, update=True)  # update vocab
     self.assertEqual(len(model_hs.wv.vocab), 14)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
     self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
Exemple #9
0
 def test_online_learning_after_save(self):
     """Check that a saved and reloaded model can extend its vocabulary and keep training.

     The initial vocabulary check uses ``assertEqual``: ``assertTrue(value, 12)``
     treats ``12`` as the failure message and passes for any non-zero length.
     """
     tmpf = get_tmpfile('gensim_fasttext.tst')
     model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
     model_neg.save(tmpf)
     model_neg = FT_gensim.load(tmpf)
     self.assertEqual(len(model_neg.wv.vocab), 12)

     model_neg.build_vocab(new_sentences, update=True)  # update vocab
     model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
     self.assertEqual(len(model_neg.wv.vocab), 14)
Exemple #10
0
def trainFastTextModel(vectorSize, trainingModel):
    """Build and train a FastText model on the ``Data/starGEO.txt`` corpus file.

    Args:
        vectorSize: Passed to the model constructor as ``size``.
        trainingModel: Forwarded to ``train()`` as the ``model`` keyword.
            NOTE(review): whether ``train()`` acts on this keyword is not
            visible from this function; confirm it has the intended effect.

    Returns:
        The trained model.
    """
    corpus_path = 'Data/starGEO.txt'

    model = FT_gensim(size=vectorSize)
    model.build_vocab(corpus_file=corpus_path)

    # Corpus statistics gathered by build_vocab drive the training schedule.
    train_options = dict(
        epochs=model.epochs,
        model=trainingModel,
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
    )
    model.train(corpus_file=corpus_path, **train_options)
    return model
Exemple #11
0
def fasttext_embedding(source, method, emb_dim):
    """Train a FastText model (sg=1) over ``Sentences(source)`` for 15 epochs.

    Args:
        source: Passed to ``Sentences`` to produce the training corpus.
        method: Not used by this function.
        emb_dim: Passed to the model constructor as ``size``.

    Returns:
        The trained model.
    """
    model = FT_gensim(
        size=emb_dim,
        window=10,
        sg=1,
        min_count=5,
        workers=multiprocessing.cpu_count(),
        negative=10,
    )

    # A fresh Sentences iterable is created for each pass over the corpus.
    model.build_vocab(sentences=Sentences(source))
    model.train(
        sentences=Sentences(source),
        epochs=15,
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
    )
    return model
Exemple #12
0
 def test_sg_neg_online(self):
     """Run the shared online-training sanity check on an sg=1, negative=5 model."""
     config = dict(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1)
     self.online_sanity(FT_gensim(**config))
Exemple #13
0
def train_model(data_path, size_embeddings, epochs=64):
    """Train a FastText model from the corpus file resolved by ``datapath(data_path)``.

    Args:
        data_path: Passed to ``datapath`` to obtain the corpus file.
        size_embeddings: Passed to the model constructor as ``size``.
        epochs: Number of training epochs.

    Returns:
        The trained model.
    """
    corpus_file = datapath(data_path)
    model_gensim = FT_gensim(size=size_embeddings, workers=4)

    # The vocabulary pass also collects the corpus statistics used by train().
    model_gensim.build_vocab(corpus_file=corpus_file)

    training_args = dict(
        epochs=epochs,
        total_examples=model_gensim.corpus_count,
        total_words=model_gensim.corpus_total_words,
    )
    model_gensim.train(corpus_file=corpus_file, **training_args)
    return model_gensim
Exemple #14
0
 def test_estimate_memory(self):
     """Compare each entry of ``estimate_memory()`` against its expected value."""
     model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
     model.build_vocab(sentences)
     report = model.estimate_memory()

     # Checked in this order so the first mismatch reported stays the same.
     expected_entries = [
         ('vocab', 2800),
         ('syn0_vocab', 160),
         ('syn1', 160),
         ('syn1neg', 160),
         ('syn0_ngrams', 2240),
         ('buckets_word', 640),
         ('total', 6160),
     ]
     for key, expected in expected_entries:
         self.assertEqual(report[key], expected)
Exemple #15
0
 def test_persistence(self):
     """Save and reload the full model, then its KeyedVectors, and compare with the originals."""
     path = get_tmpfile('gensim_fasttext.tst')
     model = FT_gensim(sentences, min_count=1)

     # Full-model round-trip.
     model.save(path)
     reloaded_model = FT_gensim.load(path)
     self.models_equal(model, reloaded_model)

     # KeyedVectors round-trip (reuses the same temporary path).
     vectors = model.wv
     vectors.save(path)
     reloaded_vectors = FastTextKeyedVectors.load(path)
     ngrams_match = np.allclose(vectors.syn0_ngrams, reloaded_vectors.syn0_ngrams)
     self.assertTrue(ngrams_match)
     self.assertEqual(len(vectors.vocab), len(reloaded_vectors.vocab))
Exemple #16
0
 def test_cbow_hs_online(self):
     """Run the shared online-training sanity check on an sg=0, hs=1 model."""
     config = dict(
         sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0,
         min_count=3, iter=1, seed=42, workers=1,
     )
     self.online_sanity(FT_gensim(**config))
Exemple #17
0
 def FasttxModel(self):
     """Populate ``self.model``: load it from disk if present, otherwise train and save it.

     ``self.simple_model()`` is always called first. The model file is
     ``saved_model_gensim`` in the current working directory.
     """
     self.simple_model()

     saved_path = 'saved_model_gensim'
     # Convert the model: reuse the saved one when it is already on disk.
     if os.path.exists(saved_path):
         self.model = FT_gensim.load(saved_path)
         return

     self.model = FT_gensim(size=100)
     self.model.build_vocab(self.sentences)
     self.model.train(
         self.sentences,
         total_examples=self.model.corpus_count,
         epochs=self.model.iter,
     )
     self.model.save(saved_path)
Exemple #18
0
    def test_sg_hs_against_wrapper(self):
        """Train the same sg + hs configuration via the wrapper and via gensim, then compare.

        Returns early (after logging) when ``self.ft_path`` is None.
        """
        if self.ft_path is None:
            logger.info("FT_HOME env variable not set, skipping test")
            return

        tmpf = get_tmpfile('gensim_fasttext.tst')

        # Reference model trained through the wrapper.
        wrapper_params = dict(
            ft_path=self.ft_path,
            corpus_file=datapath('lee_background.cor'),
            output_file=tmpf,
            model='skipgram',
            size=50,
            alpha=0.025,
            window=5,
            min_count=5,
            word_ngrams=1,
            loss='hs',
            sample=1e-3,
            negative=0,
            iter=5,
            min_n=3,
            max_n=6,
            sorted_vocab=1,
            threads=12,
        )
        model_wrapper = FT_wrapper.train(**wrapper_params)

        # Native gensim model with the corresponding settings.
        gensim_params = dict(
            size=50,
            sg=1,
            cbow_mean=1,
            alpha=0.025,
            window=5,
            hs=1,
            negative=0,
            min_count=5,
            iter=5,
            batch_words=1000,
            word_ngrams=1,
            sample=1e-3,
            min_n=3,
            max_n=6,
            sorted_vocab=1,
            workers=1,
            min_alpha=0.0,
        )
        model_gensim = FT_gensim(**gensim_params)

        lee_data = LineSentence(datapath('lee_background.cor'))
        model_gensim.build_vocab(lee_data)
        vector_before = np.copy(model_gensim.wv.syn0[0])
        model_gensim.train(
            lee_data,
            total_examples=model_gensim.corpus_count,
            epochs=model_gensim.iter,
        )
        # Training must have moved the vector away from its initial value.
        still_identical = (vector_before == model_gensim.wv.syn0[0]).all()
        self.assertFalse(still_identical)
        self.compare_with_wrapper(model_gensim, model_wrapper)
 def test_cbow_neg_online(self):
     """Run the shared online-training sanity check on an sg=0, negative=5 model."""
     config = dict(
         sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5,
         min_count=5, iter=1, seed=42, workers=12, sample=0,
     )
     self.online_sanity(FT_gensim(**config))
    def test_norm_vectors_not_saved(self):
        """Check that normalised vectors computed by ``init_sims`` are None after save + load."""
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()

        # Round-trip the full model.
        model.save(testfile())
        reloaded_model = FT_gensim.load(testfile())
        self.assertIsNone(reloaded_model.wv.syn0norm)
        self.assertIsNone(reloaded_model.wv.syn0_ngrams_norm)

        # Round-trip only the KeyedVectors.
        vectors = model.wv
        vectors.save(testfile())
        reloaded_vectors = FastTextKeyedVectors.load(testfile())
        self.assertIsNone(reloaded_vectors.syn0norm)
        self.assertIsNone(reloaded_vectors.syn0_ngrams_norm)
Exemple #21
0
 def test_persistence_word2vec_format(self):
     """Round-trip vectors through binary word2vec format and check the loaders that must raise."""
     path = get_tmpfile('gensim_fasttext_w2v_format.tst')
     model = FT_gensim(sentences, min_count=1, size=10)
     model.wv.save_word2vec_format(path, binary=True)

     reloaded = keyedvectors.KeyedVectors.load_word2vec_format(path, binary=True)
     self.assertEqual(len(model.wv.vocab), len(reloaded.vocab))
     vectors_match = np.allclose(model['human'], reloaded['human'])
     self.assertTrue(vectors_match)

     # Loading this format through the FastText classes is expected to raise.
     with self.assertRaises(DeprecationWarning):
         FT_gensim.load_word2vec_format(path)
     with self.assertRaises(NotImplementedError):
         FastTextKeyedVectors.load_word2vec_format(path)
Exemple #22
0
    def test_online_learning_fromfile(self):
        """Check vocabulary size and word counts before and after an online update from corpus files.

        The comparisons use ``assertEqual``: ``assertTrue(value, expected)``
        treats ``expected`` as the failure message and passes for any truthy
        ``value``, so it never compared the two.
        """
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
                temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)
            utils.save_as_line_sentence(new_sentences, new_corpus_file)

            model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
            self.assertEqual(len(model_hs.wv.vocab), 12)
            self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

            model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
            self.assertEqual(len(model_hs.wv.vocab), 14)
            self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
            self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
Exemple #23
0
def main():
    """Train a FastText model on the sentences yielded by ``MyIter`` and save it.

    The output path is read from ``sys.argv[2]``.

    Raises:
        IndexError: If fewer than two command-line arguments were supplied.
    """
    # Read the output path up front so a missing argument fails immediately
    # rather than after the whole (potentially long) training run.
    save_file = sys.argv[2]

    print('Instantiating the model')
    model = FT_gensim(size=100, window=5, min_count=5)  # instantiate the model

    print('Building the vocabulary')
    model.build_vocab(sentences=MyIter())
    total_examples = model.corpus_count

    print('Training the model')
    model.train(sentences=MyIter(), total_examples=total_examples,
                epochs=5)  # train the model

    # Save the model (can be loaded using gensim)
    print('Saving the model to specified filepath')
    model.save(save_file)
Exemple #24
0
    def test_persistence_fromfile(self):
        """Save and reload a corpus-file-trained model and its KeyedVectors, comparing with the originals."""
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)

            path = get_tmpfile('gensim_fasttext.tst')
            model = FT_gensim(corpus_file=corpus_file, min_count=1)

            # Full-model round-trip.
            model.save(path)
            reloaded_model = FT_gensim.load(path)
            self.models_equal(model, reloaded_model)

            # KeyedVectors round-trip (reuses the same temporary path).
            vectors = model.wv
            vectors.save(path)
            reloaded_vectors = FastTextKeyedVectors.load(path)
            ngrams_match = np.allclose(vectors.syn0_ngrams, reloaded_vectors.syn0_ngrams)
            self.assertTrue(ngrams_match)
            self.assertEqual(len(vectors.vocab), len(reloaded_vectors.vocab))
Exemple #25
0
    def test_norm_vectors_not_saved(self):
        """Check that normalised vectors computed by ``init_sims`` are None after save + load."""
        path = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()

        # Round-trip the full model.
        model.save(path)
        reloaded_model = FT_gensim.load(path)
        self.assertIsNone(reloaded_model.wv.vectors_norm)
        self.assertIsNone(reloaded_model.wv.vectors_ngrams_norm)

        # Round-trip only the KeyedVectors.
        vectors = model.wv
        vectors.save(path)
        reloaded_vectors = FastTextKeyedVectors.load(path)
        self.assertIsNone(reloaded_vectors.vectors_norm)
        self.assertIsNone(reloaded_vectors.vectors_ngrams_norm)
Exemple #26
0
def generate_model(lang):
    """Train a 300-dimensional FastText model for ``lang`` and save it.

    Reads the corpus from ``embedding/corpus_<lang>`` and writes the model to
    ``embedding/fasttext_<lang>.vec``.

    Args:
        lang: Suffix appended to the corpus and model file names.
    """
    model_gensim = FT_gensim(size=300)
    corpus_path = 'embedding/corpus_' + lang

    # The vocabulary pass also collects the corpus statistics used by train().
    model_gensim.build_vocab(corpus_file=corpus_path)

    training_args = dict(
        epochs=model_gensim.epochs,
        total_examples=model_gensim.corpus_count,
        total_words=model_gensim.corpus_total_words,
    )
    model_gensim.train(corpus_file=corpus_path, **training_args)

    model_gensim.save('embedding/fasttext_' + lang + '.vec')
Exemple #27
0
    def test_online_learning_after_save_fromfile(self):
        """Check that a saved and reloaded model can extend its vocabulary and train from corpus files.

        The initial vocabulary check uses ``assertEqual``: ``assertTrue(value, 12)``
        treats ``12`` as the failure message and passes for any non-zero length.
        """
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
                temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)
            utils.save_as_line_sentence(new_sentences, new_corpus_file)

            tmpf = get_tmpfile('gensim_fasttext.tst')
            model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
            model_neg.save(tmpf)
            model_neg = FT_gensim.load(tmpf)
            self.assertEqual(len(model_neg.wv.vocab), 12)

            model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
            model_neg.train(corpus_file=new_corpus_file, total_words=model_neg.corpus_total_words,
                            epochs=model_neg.iter)
            self.assertEqual(len(model_neg.wv.vocab), 14)
def get_ft_model(documents, settings):
    """Build and train a FastText model on ``documents`` using values from ``settings``.

    Args:
        documents: Corpus passed to ``build_vocab`` and ``train``.
        settings: Mapping providing ``min_count``, ``size``, ``window``,
            ``sg``, ``negative`` and ``iter``.

    Returns:
        The trained model.
    """
    # Looked up in the same order as before so a missing key fails identically.
    setting_names = ('min_count', 'size', 'window', 'sg', 'negative', 'iter')
    hyperparams = {name: settings[name] for name in setting_names}

    model = FT_gensim(workers=40, **hyperparams)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.iter)
    return model
def createFastTextModel(data, isSG, vectorSize, maxNgram, modelFilePath):
    """Train a FastText model on ``data`` and save it to ``modelFilePath``.

    Args:
        data: Corpus passed to ``build_vocab`` and ``train``.
        isSG: Passed to the constructor as ``sg``.
        vectorSize: Passed to the constructor as ``size``.
        maxNgram: Passed to the constructor as ``max_n`` (``min_n`` is fixed at 1).
        modelFilePath: Destination passed to ``save``.
    """
    hyperparams = dict(sg=isSG, size=vectorSize, min_count=1, min_n=1, max_n=maxNgram)
    model_gensim = FT_gensim(**hyperparams)

    # Vocabulary pass first; it provides corpus_count for the training call.
    model_gensim.build_vocab(data)
    model_gensim.train(
        data,
        total_examples=model_gensim.corpus_count,
        epochs=model_gensim.iter,
    )

    model_gensim.save(modelFilePath)
 def test_online_learning_after_save(self):
     """Check that a saved and reloaded model can extend its vocabulary and keep training.

     The initial vocabulary check uses ``assertEqual``: ``assertTrue(value, 12)``
     treats ``12`` as the failure message and passes for any non-zero length.
     """
     model_neg = FT_gensim(sentences,
                           size=10,
                           min_count=0,
                           seed=42,
                           hs=0,
                           negative=5)
     model_neg.save(testfile())
     model_neg = FT_gensim.load(testfile())
     self.assertEqual(len(model_neg.wv.vocab), 12)
     # NOTE(review): the two-argument assertTrue calls on the n-gram count only
     # check that the count is non-zero (the number is taken as the failure
     # message). The expected counts could not be confirmed from this file, so
     # they are left as-is; switch to assertEqual once the values are verified.
     self.assertTrue(len(model_neg.wv.ngrams), 202)
     model_neg.build_vocab(new_sentences, update=True)  # update vocab
     model_neg.train(new_sentences,
                     total_examples=model_neg.corpus_count,
                     epochs=model_neg.iter)
     self.assertEqual(len(model_neg.wv.vocab), 14)
     self.assertTrue(len(model_neg.wv.ngrams), 271)