def test_training(self):
    """Train a small hierarchical-softmax model and check shapes, neighbours and OOV lookup."""
    model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    model.build_vocab(sentences)
    self.model_sanity(model)

    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    by_word = model.most_similar('graph', topn=10)

    # 12 distinct words in the toy corpus, embedded in 10 dimensions.
    self.assertEqual(model.wv.syn0.shape, (12, 10))
    self.assertEqual(len(model.wv.vocab), 12)
    self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
    self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
    self.model_sanity(model)

    # Querying "most similar" by the raw vector must agree with querying by word.
    query_vec = model.wv.syn0norm[model.wv.vocab['graph'].index]
    by_vector = model.most_similar(positive=[query_vec], topn=11)
    by_vector = [(w, sim) for w, sim in by_vector if w != 'graph']  # drop the query word itself
    self.assertEqual(by_word, by_vector)

    # Building vocab and training in a single constructor call must be equivalent.
    one_shot = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    self.models_equal(model, one_shot)

    # Both in-vocabulary and OOV words must yield vectors of the embedding size.
    known_vec = model['minors']  # invocab word
    self.assertEqual(len(known_vec), 10)
    unknown_vec = model['minor']  # oov word
    self.assertEqual(len(unknown_vec), 10)
def test_training_fromfile(self):
    """Same training sanity checks as test_training, but via the corpus_file path."""
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        model.build_vocab(corpus_file=corpus_file)
        self.model_sanity(model)

        model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.iter)
        by_word = model.most_similar('graph', topn=10)

        # 12 distinct words in the toy corpus, embedded in 10 dimensions.
        self.assertEqual(model.wv.syn0.shape, (12, 10))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
        self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
        self.model_sanity(model)

        # Querying "most similar" by the raw vector must agree with querying by word.
        query_vec = model.wv.syn0norm[model.wv.vocab['graph'].index]
        by_vector = model.most_similar(positive=[query_vec], topn=11)
        by_vector = [(w, sim) for w, sim in by_vector if w != 'graph']  # drop the query word itself
        self.assertEqual(by_word, by_vector)

        # Both in-vocabulary and OOV words must yield vectors of the embedding size.
        known_vec = model['minors']  # invocab word
        self.assertEqual(len(known_vec), 10)
        unknown_vec = model['minor']  # oov word
        self.assertEqual(len(unknown_vec), 10)
def test_get_vocab_word_vecs(self):
    """Recomputing vocab-word vectors on an untrained model must leave syn0_vocab unchanged."""
    model = FT_gensim(size=10, min_count=1, seed=42)
    model.build_vocab(sentences)

    snapshot = np.copy(model.wv.syn0_vocab)
    model.trainables.get_vocab_word_vecs(model.wv)
    self.assertTrue(np.all(np.equal(model.wv.syn0_vocab, snapshot)))
def test_sg_neg_training_fromfile(self):
    """Skip-gram + negative-sampling training from a corpus file should move vectors and find plausible neighbours."""
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        model_sg = FT_gensim(
            size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
            min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        utils.save_as_line_sentence(lee_data, corpus_file)

        model_sg.build_vocab(corpus_file=corpus_file)
        before = np.copy(model_sg.wv.vectors[0])
        model_sg.train(
            corpus_file=corpus_file,
            total_words=model_sg.corpus_total_words,
            epochs=model_sg.epochs)
        # vector should vary after training
        self.assertFalse((before == model_sg.wv.vectors[0]).all())

        neighbours = [word for (word, distance) in model_sg.wv.most_similar('night', topn=10)]
        expected_sims_words = [
            u'night.', u'night,', u'eight', u'overnight', u'overnight.',
            u'month', u'land', u'firm', u'singles', u'death']
        # Loose check: at least two of the expected neighbours must show up.
        overlap_count = len(set(neighbours).intersection(expected_sims_words))
        self.assertGreaterEqual(overlap_count, 2)
def test_cbow_neg_training(self):
    """CBOW + negative-sampling training should move vectors and find plausible neighbours."""
    model_cbow = FT_gensim(
        size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_cbow.build_vocab(lee_data)
    before = np.copy(model_cbow.wv.vectors[0])
    model_cbow.train(lee_data, total_examples=model_cbow.corpus_count, epochs=model_cbow.epochs)
    # vector should vary after training
    self.assertFalse((before == model_cbow.wv.vectors[0]).all())

    neighbours = [word for (word, distance) in model_cbow.wv.most_similar('night', topn=10)]
    expected_sims_words = [
        u'night.', u'night,', u'eight', u'fight', u'month',
        u'hearings', u'Washington', u'remains', u'overnight', u'running']
    # Loose check: at least two of the expected neighbours must show up.
    overlap_count = len(set(neighbours).intersection(expected_sims_words))
    self.assertGreaterEqual(overlap_count, 2)
def trainVectors(corpus, implementation, dim=300, min_n=3, max_n=6, min_count=1,
                 model='skipgram', epochs=5, threads=12, window=5, lr=0.05,
                 t=1e-4, neg=5):
    """Train word vectors with either gensim FastText or Word2Vec.

    Parameters
    ----------
    corpus : str
        Path to a line-sentence corpus file.
    implementation : str
        'fasttext' or 'w2v'.
    dim, min_n, max_n, min_count, epochs, threads, window, lr, t, neg : hyperparameters.
    model : str
        FastText architecture: 'skipgram' selects skip-gram, anything else CBOW.

    Returns
    -------
    The trained gensim model.

    Raises
    ------
    ValueError
        If `implementation` is not one of 'fasttext' or 'w2v'.
    """
    # PARSE TRAIN DATA (identical for both implementations)
    train_data = LineSentence(corpus)

    if implementation == 'fasttext':
        # BUG FIX: `model`, `threads`, `lr`, `t` and `neg` are NOT accepted by
        # FT_gensim.train() (they would raise TypeError); they are constructor
        # hyperparameters (sg, workers, alpha, sample, negative) and are set here.
        model_gensim = FT_gensim(
            size=dim, min_n=min_n, max_n=max_n, min_count=min_count,
            iter=epochs, window=window,
            sg=1 if model == 'skipgram' else 0,
            workers=threads, alpha=lr, sample=t, negative=neg)
        # BUILD VOCABULARY
        model_gensim.build_vocab(train_data)
        # TRAIN THE MODEL
        model_gensim.train(train_data, total_examples=model_gensim.corpus_count,
                           epochs=model_gensim.iter)
    elif implementation == 'w2v':
        model_gensim = Word2Vec(size=dim, min_count=min_count, iter=epochs,
                                window=window, workers=threads)
        # BUILD VOCABULARY
        model_gensim.build_vocab(train_data)
        # TRAIN THE MODEL
        model_gensim.train(train_data, total_examples=model_gensim.corpus_count,
                           epochs=model_gensim.iter)
    else:
        # Previously fell through to a NameError; fail with a clear message instead.
        raise ValueError("unknown implementation: %r" % implementation)

    return model_gensim
def test_persistence_word2vec_format(self):
    """Round-trip the keyed vectors through binary word2vec format."""
    tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
    model = FT_gensim(sentences, min_count=1, size=10)
    model.wv.save_word2vec_format(tmpf, binary=True)

    restored = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True)
    self.assertEqual(len(model.wv.vocab), len(restored.vocab))
    self.assertTrue(np.allclose(model['human'], restored['human']))
def test_online_learning(self):
    """Vocabulary must grow correctly when updated with new sentences.

    BUG FIX: the original used ``assertTrue(x, y)`` where equality was
    intended; the second argument of assertTrue is a failure *message*, so
    those assertions were vacuous. Converted to assertEqual (the expected
    values match the fixtures used by the other tests in this file).
    """
    model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
    self.assertEqual(len(model_hs.wv.vocab), 12)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

    model_hs.build_vocab(new_sentences, update=True)  # update vocab
    self.assertEqual(len(model_hs.wv.vocab), 14)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
    self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
def test_online_learning_after_save(self):
    """A saved-and-reloaded model must still accept vocab updates and training.

    BUG FIX: the original ``assertTrue(len(...), 12)`` treated 12 as a failure
    message and never compared; converted to assertEqual.
    """
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(tmpf)
    model_neg = FT_gensim.load(tmpf)
    self.assertEqual(len(model_neg.wv.vocab), 12)

    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count,
                    epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
def trainFastTextModel(vectorSize, trainingModel):
    """Train a gensim FastText model on Data/starGEO.txt.

    Parameters
    ----------
    vectorSize : int
        Dimensionality of the embeddings.
    trainingModel : str
        Architecture: 'skipgram' selects skip-gram, anything else CBOW.

    Returns
    -------
    FT_gensim
        The trained model.
    """
    # BUG FIX: FT_gensim.train() has no ``model=`` keyword (passing it raises
    # TypeError); the architecture is chosen at construction time via ``sg``.
    model = FT_gensim(size=vectorSize, sg=1 if trainingModel == 'skipgram' else 0)
    model.build_vocab(corpus_file='Data/starGEO.txt')
    model.train(corpus_file='Data/starGEO.txt', epochs=model.epochs,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)
    return model
def fasttext_embedding(source, method, emb_dim):
    """Train a skip-gram FastText embedding over the Sentences(source) stream.

    NOTE(review): ``method`` is accepted but never used — confirm with callers
    before removing it.
    """
    model = FT_gensim(
        size=emb_dim, window=10, sg=1, min_count=5,
        workers=multiprocessing.cpu_count(), negative=10)

    # Build the vocabulary, then train on a fresh pass over the same stream.
    model.build_vocab(sentences=Sentences(source))
    model.train(
        sentences=Sentences(source),
        epochs=15,
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words)
    return model
def test_sg_neg_online(self):
    """Online-learning sanity check for skip-gram with negative sampling."""
    sg_neg_model = FT_gensim(
        sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1)
    self.online_sanity(sg_neg_model)
def train_model(data_path, size_embeddings, epochs=64):
    """Build a vocabulary from a corpus file and train a FastText model on it."""
    corpus_file = datapath(data_path)
    ft_model = FT_gensim(size=size_embeddings, workers=4)

    # Build the vocabulary from the file, then train over the same file.
    ft_model.build_vocab(corpus_file=corpus_file)
    ft_model.train(
        corpus_file=corpus_file,
        epochs=epochs,
        total_examples=ft_model.corpus_count,
        total_words=ft_model.corpus_total_words)
    return ft_model
def test_estimate_memory(self):
    """Memory estimate must report the exact per-structure byte counts for this config."""
    model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
    model.build_vocab(sentences)
    report = model.estimate_memory()

    expected = {
        'vocab': 2800,
        'syn0_vocab': 160,
        'syn1': 160,
        'syn1neg': 160,
        'syn0_ngrams': 2240,
        'buckets_word': 640,
        'total': 6160,
    }
    for key, value in expected.items():
        self.assertEqual(report[key], value)
def test_persistence(self):
    """A model and its KeyedVectors must survive a save/load round-trip."""
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.save(tmpf)
    self.models_equal(model, FT_gensim.load(tmpf))

    # The KeyedVectors alone must round-trip as well.
    wv = model.wv
    wv.save(tmpf)
    restored_wv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(np.allclose(wv.syn0_ngrams, restored_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(restored_wv.vocab))
def test_cbow_hs_online(self):
    """Online-learning sanity check for CBOW with hierarchical softmax."""
    cbow_hs_model = FT_gensim(
        sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0,
        min_count=3, iter=1, seed=42, workers=1)
    self.online_sanity(cbow_hs_model)
def FasttxModel(self):
    """Load a cached FastText model from disk, or train one and cache it."""
    self.simple_model()  # prepare/convert the model (translated from original comment)

    if os.path.exists('saved_model_gensim'):
        # Reuse the previously saved model instead of retraining.
        self.model = FT_gensim.load('saved_model_gensim')
    else:
        self.model = FT_gensim(size=100)
        self.model.build_vocab(self.sentences)
        self.model.train(
            self.sentences,
            total_examples=self.model.corpus_count,
            epochs=self.model.iter)
        self.model.save('saved_model_gensim')
def test_sg_hs_against_wrapper(self):
    """Compare gensim's skip-gram/hs FastText against the native fasttext wrapper."""
    if self.ft_path is None:
        logger.info("FT_HOME env variable not set, skipping test")
        return

    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_wrapper = FT_wrapper.train(
        ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
        output_file=tmpf, model='skipgram', size=50, alpha=0.025, window=5,
        min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0,
        iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

    model_gensim = FT_gensim(
        size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    before = np.copy(model_gensim.wv.syn0[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count,
                       epochs=model_gensim.iter)
    # vector should vary after training
    self.assertFalse((before == model_gensim.wv.syn0[0]).all())

    self.compare_with_wrapper(model_gensim, model_wrapper)
def test_cbow_neg_online(self):
    """Online-learning sanity check for CBOW with negative sampling."""
    cbow_neg_model = FT_gensim(
        sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5,
        min_count=5, iter=1, seed=42, workers=12, sample=0)
    self.online_sanity(cbow_neg_model)
def test_norm_vectors_not_saved(self):
    """Normalized vectors (derived data) must not be persisted with the model."""
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(testfile())

    restored_model = FT_gensim.load(testfile())
    self.assertTrue(restored_model.wv.syn0norm is None)
    self.assertTrue(restored_model.wv.syn0_ngrams_norm is None)

    # Same guarantee when saving the KeyedVectors directly.
    wv = model.wv
    wv.save(testfile())
    restored_kv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(restored_kv.syn0norm is None)
    self.assertTrue(restored_kv.syn0_ngrams_norm is None)
def test_persistence_word2vec_format(self):
    """Test storing/loading the model in word2vec format."""
    tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
    model = FT_gensim(sentences, min_count=1, size=10)
    model.wv.save_word2vec_format(tmpf, binary=True)

    restored_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
    self.assertEqual(len(model.wv.vocab), len(restored_kv.vocab))
    self.assertTrue(np.allclose(model['human'], restored_kv['human']))

    # The deprecated / unsupported loaders must refuse the format.
    self.assertRaises(DeprecationWarning, FT_gensim.load_word2vec_format, tmpf)
    self.assertRaises(NotImplementedError, FastTextKeyedVectors.load_word2vec_format, tmpf)
def test_online_learning_fromfile(self):
    """Vocabulary must grow correctly when updated from a second corpus file.

    BUG FIX: the original used ``assertTrue(x, y)`` where equality was
    intended; the second argument of assertTrue is a failure *message*, so
    those assertions were vacuous. Converted to assertEqual (the expected
    values match the fixtures used by the other tests in this file).
    """
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1,
                             seed=42, hs=1, negative=0)
        self.assertEqual(len(model_hs.wv.vocab), 12)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

        model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        self.assertEqual(len(model_hs.wv.vocab), 14)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
def main():
    """Train a FastText model over MyIter() and save it to the path in argv[2]."""
    print('Instantiating the model')
    model = FT_gensim(size=100, window=5, min_count=5)  # instantiate the model

    print('Building the vocabulary')
    model.build_vocab(sentences=MyIter())
    total_examples = model.corpus_count

    print('Training the model')
    model.train(sentences=MyIter(), total_examples=total_examples, epochs=5)  # train the model

    # Save the model (can be loaded using gensim)
    print('Saving the model to specified filepath')
    save_file = sys.argv[2]
    model.save(save_file)
def test_persistence_fromfile(self):
    """A model trained from a corpus file must survive a save/load round-trip."""
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(corpus_file=corpus_file, min_count=1)
        model.save(tmpf)
        self.models_equal(model, FT_gensim.load(tmpf))

        # The KeyedVectors alone must round-trip as well.
        wv = model.wv
        wv.save(tmpf)
        restored_wv = FastTextKeyedVectors.load(tmpf)
        self.assertTrue(np.allclose(wv.syn0_ngrams, restored_wv.syn0_ngrams))
        self.assertEqual(len(wv.vocab), len(restored_wv.vocab))
def test_norm_vectors_not_saved(self):
    """Normalized vectors (derived data) must not be persisted with the model."""
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(tmpf)

    restored_model = FT_gensim.load(tmpf)
    self.assertTrue(restored_model.wv.vectors_norm is None)
    self.assertTrue(restored_model.wv.vectors_ngrams_norm is None)

    # Same guarantee when saving the KeyedVectors directly.
    wv = model.wv
    wv.save(tmpf)
    restored_kv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(restored_kv.vectors_norm is None)
    self.assertTrue(restored_kv.vectors_ngrams_norm is None)
def generate_model(lang):
    """Train a 300-dim FastText model on embedding/corpus_<lang> and save it."""
    ft_model = FT_gensim(size=300)

    # Build the vocabulary from the language-specific corpus file.
    ft_model.build_vocab(corpus_file='embedding/corpus_' + lang)

    # Train over the same file, then persist the model.
    ft_model.train(
        corpus_file='embedding/corpus_' + lang,
        epochs=ft_model.epochs,
        total_examples=ft_model.corpus_count,
        total_words=ft_model.corpus_total_words)
    ft_model.save('embedding/fasttext_' + lang + '.vec')
def test_online_learning_after_save_fromfile(self):
    """A reloaded model must accept vocab updates and training from a new corpus file.

    BUG FIX: the original ``assertTrue(len(...), 12)`` treated 12 as a failure
    message and never compared; converted to assertEqual.
    """
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0,
                              seed=42, hs=0, negative=5)
        model_neg.save(tmpf)
        model_neg = FT_gensim.load(tmpf)
        self.assertEqual(len(model_neg.wv.vocab), 12)

        model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        model_neg.train(corpus_file=new_corpus_file,
                        total_words=model_neg.corpus_total_words,
                        epochs=model_neg.iter)
        self.assertEqual(len(model_neg.wv.vocab), 14)
def get_ft_model(documents, settings):
    """Train a FastText model on `documents` using hyperparameters from `settings`.

    ``settings`` is expected to provide the keys: min_count, size, window,
    sg, negative, iter.
    """
    ft_model = FT_gensim(
        min_count=settings['min_count'],
        size=settings['size'],
        window=settings['window'],
        workers=40,  # hardcoded worker count, as in the original
        sg=settings['sg'],
        negative=settings['negative'],
        iter=settings['iter'])

    ft_model.build_vocab(documents)
    ft_model.train(documents, total_examples=ft_model.corpus_count, epochs=ft_model.iter)
    return ft_model
def createFastTextModel(data, isSG, vectorSize, maxNgram, modelFilePath):
    """Train a FastText model on `data` and save it to `modelFilePath`."""
    # Character n-grams range from 1 up to maxNgram; every word is kept (min_count=1).
    ft_model = FT_gensim(sg=isSG, size=vectorSize, min_count=1, min_n=1, max_n=maxNgram)

    # Build the vocabulary, train, then persist.
    ft_model.build_vocab(data)
    ft_model.train(data, total_examples=ft_model.corpus_count, epochs=ft_model.iter)
    ft_model.save(modelFilePath)
def test_online_learning_after_save(self):
    """A saved-and-reloaded model must still accept vocab updates and training.

    BUG FIX: ``assertTrue(len(...), N)`` treats N as a failure message and never
    compares. The vocab-size check is converted to assertEqual (12 is corroborated
    by the other tests on the same fixture). The ngram-count checks are left as
    written because the expected values (202 / 271) cannot be confirmed here —
    NOTE(review): they are currently vacuous; confirm the counts and convert
    to assertEqual.
    """
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(testfile())
    model_neg = FT_gensim.load(testfile())
    self.assertEqual(len(model_neg.wv.vocab), 12)
    self.assertTrue(len(model_neg.wv.ngrams), 202)  # vacuous — see docstring

    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count,
                    epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
    self.assertTrue(len(model_neg.wv.ngrams), 271)  # vacuous — see docstring