def testLoadOldModel(self):
    """Test loading fasttext models saved by a previous gensim version,
    both as a single file ('fasttext_old') and split across multiple
    files ('fasttext_old_sep')."""
    for model_file in ('fasttext_old', 'fasttext_old_sep'):
        model = FT_gensim.load(datapath(model_file))
        self.assertTrue(model.wv.vectors.shape == (12, 100))
        self.assertTrue(len(model.wv.vocab) == 12)
        self.assertTrue(len(model.wv.index2word) == 12)
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))
        self.assertTrue(model.trainables.vectors_lockf.shape == (12, ))
        self.assertTrue(model.vocabulary.cum_table.shape == (12, ))
        self.assertEqual(len(model.wv.hash2index), 202)
        self.assertTrue(model.wv.vectors_vocab.shape == (12, 100))
        self.assertTrue(model.wv.vectors_ngrams.shape == (202, 100))
def test_sg_neg_training(self):
    model_gensim = FT_gensim(
        size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    orig0 = np.copy(model_gensim.wv.vectors[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)
    self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

    sims_gensim = model_gensim.wv.most_similar('night', topn=10)
    sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
    expected_sims_words = [
        u'night.', u'night,', u'eight', u'overnight', u'overnight.',
        u'month', u'land', u'firm', u'singles', u'death']
    overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
    self.assertGreaterEqual(overlap_count, 2)
def test_online_learning(self):
    model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
    # note: the original used assertTrue(x, y), which only checks truthiness of x;
    # assertEqual expresses the intended comparison
    self.assertEqual(len(model_hs.wv.vocab), 12)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

    model_hs.build_vocab(new_sentences, update=True)  # update vocab
    self.assertEqual(len(model_hs.wv.vocab), 14)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
    self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
def test_estimate_memory(self):
    model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
    model.build_vocab(sentences)
    report = model.estimate_memory()
    self.assertEqual(report['vocab'], 2800)
    self.assertEqual(report['syn0_vocab'], 160)
    self.assertEqual(report['syn1'], 160)
    self.assertEqual(report['syn1neg'], 160)
    self.assertEqual(report['syn0_ngrams'], 2240)
    self.assertEqual(report['buckets_word'], 640)
    self.assertEqual(report['total'], 6160)
def test_persistence(self):
    model = FT_gensim(sentences, min_count=1)
    model.save(testfile())
    self.models_equal(model, FT_gensim.load(testfile()))

    # test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(testfile())
    loaded_wv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
    self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams))
def test_online_learning(self):
    model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
    # assertTrue(x, y) in the original only checked truthiness of x;
    # assertEqual expresses the intended comparison
    self.assertEqual(len(model_hs.wv.vocab), 12)
    self.assertEqual(len(model_hs.wv.ngrams), 202)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
    self.assertFalse('tif' in model_hs.wv.ngrams)

    model_hs.build_vocab(new_sentences, update=True)  # update vocab
    self.assertEqual(len(model_hs.wv.vocab), 14)
    self.assertEqual(len(model_hs.wv.ngrams), 271)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
    self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
    self.assertTrue('tif' in model_hs.wv.ngrams)  # ngram added because of the word `artificial`
def test_load_model_with_non_ascii_vocab(self):
    model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
    self.assertTrue(u'který' in model)
    try:
        model[u'který']
    except UnicodeDecodeError:
        self.fail('Unable to access vector for utf8 encoded non-ascii word')
def test_load_model_non_utf8_encoding(self):
    model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
    self.assertTrue(u'který' in model)
    try:
        model[u'který']
    except KeyError:
        self.fail('Unable to access vector for cp-852 word')
def test_online_learning_after_save(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(tmpf)
    model_neg = FT_gensim.load(tmpf)
    self.assertEqual(len(model_neg.wv.vocab), 12)  # assertTrue(x, y) in the original never compared against 12
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
def test_online_learning_after_save(self):
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(testfile())
    model_neg = FT_gensim.load(testfile())
    # assertTrue(x, y) in the original only checked truthiness of x
    self.assertEqual(len(model_neg.wv.vocab), 12)
    self.assertEqual(len(model_neg.wv.ngrams), 202)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
    self.assertEqual(len(model_neg.wv.ngrams), 271)
def test_training(self):
    model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    model.build_vocab(sentences)
    self.model_sanity(model)

    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    sims = model.most_similar('graph', topn=10)

    self.assertEqual(model.wv.syn0.shape, (12, 10))
    self.assertEqual(len(model.wv.vocab), 12)
    self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
    self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
    self.model_sanity(model)

    # test querying for "most similar" by vector
    graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
    sims2 = model.most_similar(positive=[graph_vector], topn=11)
    sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
    self.assertEqual(sims, sims2)

    # build vocab and train in one step; must be the same as above
    model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    self.models_equal(model, model2)

    # verify in-vocab and oov-word vector retrieval
    invocab_vec = model['minors']  # in-vocab word
    self.assertEqual(len(invocab_vec), 10)

    oov_vec = model['minor']  # oov word
    self.assertEqual(len(oov_vec), 10)
def test_sg_hs_against_wrapper(self):
    if self.ft_path is None:
        logger.info("FT_HOME env variable not set, skipping test")
        return

    model_wrapper = FT_wrapper.train(
        ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
        output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=5,
        min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0, iter=5,
        min_n=3, max_n=6, sorted_vocab=1, threads=12)

    model_gensim = FT_gensim(
        size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    orig0 = np.copy(model_gensim.wv.syn0[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
    self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all())  # vector should vary after training
    self.compare_with_wrapper(model_gensim, model_wrapper)
def test_load_fasttext_new_format(self):
    try:
        new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
    except Exception as exc:
        self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))

    vocab_size, model_size = 1763, 10
    self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size))
    self.assertEqual(len(new_model.wv.vocab), vocab_size)
    self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size))

    expected_vec = [
        -0.025627, -0.11448, 0.18116, -0.96779, 0.2532,
        -0.93224, 0.3929, 0.12679, -0.19685, -0.13179,
    ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
    self.assertTrue(np.allclose(new_model["hundred"], expected_vec, atol=1e-4))

    # vectors for oov words are slightly different from original FastText due to discarding unused ngrams
    # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
    expected_vec_oov = [
        -0.53378, -0.19, 0.013482, -0.86767, -0.21684,
        -0.89928, 0.45124, 0.18025, -0.14128, 0.22508,
    ]
    self.assertTrue(np.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4))

    self.assertEqual(new_model.min_count, 5)
    self.assertEqual(new_model.window, 5)
    self.assertEqual(new_model.iter, 5)
    self.assertEqual(new_model.negative, 5)
    self.assertEqual(new_model.sample, 0.0001)
    self.assertEqual(new_model.bucket, 1000)
    self.assertEqual(new_model.wv.max_n, 6)
    self.assertEqual(new_model.wv.min_n, 3)
    self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size))
    self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, new_model.vector_size))
def test_load_fasttext_format(self):
    try:
        model = FT_gensim.load_fasttext_format(self.test_model_file)
    except Exception as exc:
        self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))

    vocab_size, model_size = 1762, 10
    self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size))
    self.assertEqual(len(model.wv.vocab), vocab_size)
    self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size))

    expected_vec = [
        -0.57144, -0.0085561, 0.15748, -0.67855, -0.25459,
        -0.58077, -0.09913, 1.1447, 0.23418, 0.060007,
    ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
    self.assertTrue(np.allclose(model["hundred"], expected_vec, atol=1e-4))

    # vectors for oov words are slightly different from original FastText due to discarding unused ngrams
    # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
    expected_vec_oov = [
        -0.23825, -0.58482, -0.22276, -0.41215, 0.91015,
        -1.6786, -0.26724, 0.58818, 0.57828, 0.75801,
    ]
    self.assertTrue(np.allclose(model["rejection"], expected_vec_oov, atol=1e-4))

    self.assertEqual(model.min_count, 5)
    self.assertEqual(model.window, 5)
    self.assertEqual(model.iter, 5)
    self.assertEqual(model.negative, 5)
    self.assertEqual(model.sample, 0.0001)
    self.assertEqual(model.bucket, 1000)
    self.assertEqual(model.wv.max_n, 6)
    self.assertEqual(model.wv.min_n, 3)
    self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size))
    self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size))
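# A minimal standalone sketch of what the two tests above exercise (assumption:
# a Facebook fastText binary exists on disk under the hypothetical prefix
# 'lee_fasttext'; load_fasttext_format takes the path prefix, and older gensim
# versions also required the matching .vec file next to the .bin):
#
#     from gensim.models.fasttext import FastText as FT_gensim
#     model = FT_gensim.load_fasttext_format('lee_fasttext')
#     print(model['hundred'])    # in-vocabulary word
#     print(model['rejection'])  # OOV word, composed from char n-gram vectors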
def test_norm_vectors_not_saved(self):
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(testfile())
    loaded_model = FT_gensim.load(testfile())
    self.assertTrue(loaded_model.wv.syn0norm is None)
    self.assertTrue(loaded_model.wv.syn0_ngrams_norm is None)

    wv = model.wv
    wv.save(testfile())
    loaded_kv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(loaded_kv.syn0norm is None)
    self.assertTrue(loaded_kv.syn0_ngrams_norm is None)
import os

from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText as FT_gensim
from fileObject import FileObj

file_obj = FileObj(r"testSet/data")
sentences = file_obj.read_lines_1_words()

"""
model_gensim = FT_gensim(size=100)
model_gensim.build_vocab(sentences)
model_gensim.train(sentences, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
model_gensim.save('saved_model_gensim')
"""

loaded_model = FT_gensim.load('saved_model_gensim')
print(loaded_model)
print(loaded_model.most_similar('老人'))
print(loaded_model.doesnt_match("老人 小孩 孕妇 胃疼".split(" ")))

# sentence_obama = ["老人", "高血压", "怎么办"]
# sentence_president = ["青年", "高血压", "怎么办"]
# distance = loaded_model.wmdistance(sentence_obama, sentence_president)
# print(distance)

sentence_query = ["晚上", "经常", "失眠", "怎么办"]
sim_max = 0
sim_index = 0
sim_list = []
# The original loop body was truncated; a minimal completion (assumption):
# score every sentence against the query and keep the best match.
for i in range(len(sentences)):
    sim = loaded_model.n_similarity(sentence_query, sentences[i])
    sim_list.append(sim)
    if sim > sim_max:
        sim_max = sim
        sim_index = i
print(sim_index, sim_max)
# print(corpus)
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(doc) for doc in corpus]
print(tokenized_corpus[1])

# Using fastText
feature_size = 50    # word vector dimensionality
window_context = 10  # context window size
min_word_count = 5   # minimum word count
sample = 1e-3        # downsampling rate for frequent words

fasttext_model = FastText(
    tokenized_corpus, size=feature_size, window=window_context,
    min_count=min_word_count, sample=sample, sg=1, iter=20)  # sg=1: skip-gram, sg=0: CBOW
print(fasttext_model.wv['god'])

# Finding similar words
similar_words = {
    search_term: [
        item[0] for item in fasttext_model.wv.most_similar([search_term], topn=5)
    ]
    for search_term in ['god', 'jesus', 'egypt', 'john']
}
print(similar_words)
def load_old_fasttext(*args, **kwargs):
    old_model = FastText.load(*args, **kwargs)
    params = {
        'size': old_model.vector_size,
        'alpha': old_model.alpha,
        'window': old_model.window,
        'min_count': old_model.min_count,
        'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
        'sample': old_model.sample,
        'seed': old_model.seed,
        'workers': old_model.workers,
        'min_alpha': old_model.min_alpha,
        'sg': old_model.sg,
        'hs': old_model.hs,
        'negative': old_model.negative,
        'cbow_mean': old_model.cbow_mean,
        'hashfxn': old_model.hashfxn,
        'iter': old_model.iter,
        'null_word': old_model.null_word,
        'sorted_vocab': old_model.sorted_vocab,
        'batch_words': old_model.batch_words,
        'min_n': old_model.min_n,
        'max_n': old_model.max_n,
        'word_ngrams': old_model.word_ngrams,
        'bucket': old_model.bucket,
    }
    new_model = NewFastText(**params)

    # set trainables attributes
    new_model.wv.vectors = old_model.wv.syn0
    new_model.wv.vectors_vocab = old_model.wv.syn0_vocab
    new_model.wv.vectors_ngrams = old_model.wv.syn0_ngrams
    if hasattr(old_model.wv, 'syn0norm'):
        new_model.wv.vectors_norm = old_model.wv.syn0norm
    if hasattr(old_model, 'syn1'):
        new_model.trainables.syn1 = old_model.syn1
    if hasattr(old_model, 'syn1neg'):
        new_model.trainables.syn1neg = old_model.syn1neg
    if hasattr(old_model, 'syn0_lockf'):
        new_model.trainables.vectors_lockf = old_model.syn0_lockf
    if hasattr(old_model, 'syn0_vocab_lockf'):
        new_model.trainables.vectors_vocab_lockf = old_model.syn0_vocab_lockf
    if hasattr(old_model, 'syn0_ngrams_lockf'):
        new_model.trainables.vectors_ngrams_lockf = old_model.syn0_ngrams_lockf
    if hasattr(old_model.wv, 'syn0_vocab_norm'):
        new_model.trainables.vectors_vocab_norm = old_model.wv.syn0_vocab_norm
    if hasattr(old_model.wv, 'syn0_ngrams_norm'):
        new_model.trainables.vectors_ngrams_norm = old_model.wv.syn0_ngrams_norm

    # set vocabulary attributes
    new_model.wv.vocab = old_model.wv.vocab
    new_model.wv.index2word = old_model.wv.index2word
    new_model.vocabulary.cum_table = old_model.cum_table
    new_model.wv.hash2index = old_model.wv.hash2index

    new_model.train_count = old_model.train_count
    new_model.corpus_count = old_model.corpus_count
    new_model.corpus_total_words = old_model.corpus_total_words
    new_model.running_training_loss = old_model.running_training_loss
    new_model.total_train_time = old_model.total_train_time
    new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
    new_model.model_trimmed_post_training = old_model.model_trimmed_post_training
    new_model.trainables.num_ngram_vectors = old_model.num_ngram_vectors

    return new_model
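# A minimal usage sketch for load_old_fasttext above (assumption:
# 'old_fasttext.model' was saved by a pre-1.0-style gensim FastText;
# both file names here are hypothetical):
#
#     new_model = load_old_fasttext('old_fasttext.model')
#     new_model.save('converted_fasttext.model')  # persist in the current format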
def test_bucket_ngrams(self):
    model = FT_gensim(size=10, min_count=1, bucket=20)
    model.build_vocab(sentences)
    self.assertEqual(model.wv.syn0_ngrams.shape, (20, 10))
    model.build_vocab(new_sentences, update=True)
    self.assertEqual(model.wv.syn0_ngrams.shape, (20, 10))
def test_get_vocab_word_vecs(self):
    model = FT_gensim(size=10, min_count=1, seed=42)
    model.build_vocab(sentences)
    original_syn0_vocab = np.copy(model.wv.syn0_vocab)
    model.trainables.get_vocab_word_vecs(model.wv)
    self.assertTrue(np.all(np.equal(model.wv.syn0_vocab, original_syn0_vocab)))
# The snippet begins mid-script: root_t_1 and root_t_3 are used below, so the
# import and the parse calls for files 1 and 3 are reconstructed by analogy
# (assumption -- the actual file names may differ from the original).
from lxml import etree

tree_t_1 = etree.parse("a_corpus_tokens_1.xml")
root_t_1 = tree_t_1.getroot()
tree_t_2 = etree.parse("a_corpus_tokens_2.xml")
root_t_2 = tree_t_2.getroot()
tree_t_3 = etree.parse("a_corpus_tokens_3.xml")
root_t_3 = tree_t_3.getroot()

all_1 = root_t_1.findall(".//s")
all_2 = root_t_2.findall(".//s")
all_3 = root_t_3.findall(".//s")
all_sent = all_1 + all_2 + all_3


class MyCorpus(object):
    def __iter__(self):
        for sentence in all_sent:
            yield [token.text.lower() for token in sentence]


from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath

model = FastText(size=200, min_n=3, window=7)
model.build_vocab(sentences=MyCorpus())
model.train(
    sentences=MyCorpus(),
    epochs=model.epochs,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words)

from gensim.test.utils import get_tmpfile

fname = get_tmpfile("lower_better_bigger.model")
model.save(fname)
model = FastText.load(fname)

import gensim.models

sentences = MyCorpus()
model_vec2 = gensim.models.Word2Vec(sentences=sentences, size=200)

fname = get_tmpfile("lower_better_bigger_2vec.model")
model_vec2.save(fname)
def load_word2vec(
        self,
        model_dir: str = config["paths"]["pretrain_dir"][plat][user],
        model_type: str = 'googlenews',
        encoding: str = 'latin-1',
        model_file_name: str = "GoogleNews-vectors-negative300.bin"
) -> gensim.models.keyedvectors.Word2VecKeyedVectors:
    """Loads a pretrained embedding model and returns initial weights for the embedding layer.

    inputs:
        model_type       # googlenews / fasttext_wiki / glove / bert_multi
        model_dir        # directory containing the pretrained model file
        model_file_name  # name of the pretrained model file
    """
    if self.pretrain_model is not None:
        return self.pretrain_model

    assert exists(join(model_dir, model_file_name)), \
        "Model file not found at: [{}].".format(join(model_dir, model_file_name))
    logger.debug("Using [{0}] model from [{1}]".format(model_type, join(model_dir, model_file_name)))

    if model_type == 'googlenews' or model_type == "fasttext_wiki":
        if exists(join(model_dir, model_file_name + '.bin')):
            try:
                # For the original fasttext *.bin format.
                pretrain_model = FastText.load_fasttext_format(join(model_dir, model_file_name + '.bin'))
            except Exception as e:
                pretrain_model = KeyedVectors.load_word2vec_format(
                    join(model_dir, model_file_name + '.bin'), binary=True, encoding=encoding)
        else:
            try:
                pretrain_model = KeyedVectors.load_word2vec_format(
                    join(model_dir, model_file_name), binary=self.binary)
            except Exception as e:
                # On exception, try a different format.
                logger.info('Loading original word2vec format failed. Trying Gensim format.')
                pretrain_model = KeyedVectors.load(join(model_dir, model_file_name))
            # Save model in binary format for faster loading in future.
            pretrain_model.save_word2vec_format(join(model_dir, model_file_name + ".bin"), binary=True)
            logger.info("Saved binary model at: [{0}]".format(join(model_dir, model_file_name + ".bin")))
        logger.info(type(pretrain_model))
    elif model_type == 'glove':
        logger.info('Loading existing Glove model: [{0}]'.format(join(model_dir, model_file_name)))
        from gensim.scripts.glove2word2vec import glove2word2vec
        from gensim.test.utils import datapath, get_tmpfile

        glove_file = datapath(join(model_dir, model_file_name))
        tmp_file = get_tmpfile(join(model_dir, model_file_name + "_word2vec"))
        _ = glove2word2vec(glove_file, tmp_file)
        pretrain_model = KeyedVectors.load_word2vec_format(tmp_file)
    elif model_type == "bert_multi":
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        bert_model.eval()
        # pretrain_model = FastText.load_fasttext_format(join(model_dir, model_file_name))
        # pretrain_model = FastText.load_binary_data(join(model_dir, model_file_name))
        pretrain_model = KeyedVectors.load_word2vec_format(
            join(model_dir, model_file_name), binary=False)
        # Alternative manual .vec parsing, kept from the original (unused):
        # import io
        # fin = io.open(join(model_dir, model_file_name), encoding=encoding, newline=newline, errors=errors)
        # n, d = map(int, fin.readline().split())
        # pretrain_model = OrderedDict()
        # for line in fin:
        #     tokens = line.rstrip().split(' ')
        #     pretrain_model[tokens[0]] = map(float, tokens[1:])
        # Round-trip to binary for faster loading, kept from the original (unused):
        # embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName, binary=False)
        # embedding_dict.save_word2vec_format(dictFileName + ".bin", binary=True)
        # embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName + ".bin", binary=True)
    else:
        raise ValueError('Unknown pretrain model type: %s!' % model_type)

    self.pretrain_model = pretrain_model
    return self.pretrain_model
macro_cols = [x + '_' + str(i) for x in macro_columns for i in range(N_parts)]
macro_stress_df = create_overall(pd.DataFrame(macro_stress, columns=macro_cols), 'max', N_parts)
macro_df = create_overall(pd.DataFrame(macro, columns=macro_cols), 'max', N_parts)

data = pd.concat([data.loc[:, :'BENEFICIAR_ORG'], macro_df], axis=1)
data_stress = pd.concat([data_stress.loc[:, :'BENEFICIAR_ORG'], macro_stress_df], axis=1)

pd.Series('\t'.join(data['PRINCIPAL_NAME'] + ' ' + data['BENEFICIAR_NAME'])).to_csv(
    'text.txt', sep='\t', index=None)
string = LineSentence('text.txt')

fasttext = FastText(size=50, sg=0, word_ngrams=2, iter=10, min_n=2, max_n=10)
fasttext.build_vocab(string)
fasttext.train(string, total_examples=fasttext.corpus_count, epochs=fasttext.iter)


def to_sent_emb(string):
    # Sum the fastText vectors of all tokens longer than one character.
    return np.sum([fasttext[x] for x in string.split(' ') if len(x) > 1], axis=0)


emb_df = pd.DataFrame(
    np.vstack((data['PRINCIPAL_NAME'] + ' ' + data['BENEFICIAR_NAME']).apply(to_sent_emb)))
emb_df.columns = ['emb_' + str(w) for w in emb_df.columns]
def setUp(self):
    ft_home = os.environ.get('FT_HOME', None)
    self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
    self.test_model_file = datapath('lee_fasttext')
    self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
    self.test_new_model_file = datapath('lee_fasttext_new')
def test_load_model_supervised(self):
    with self.assertRaises(NotImplementedError):
        FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext'))
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath
import numpy as np
# from gensim.models.fasttext import FastText, load_facebook_vectors
# (if we look at this part closely, it might also support updating)

cap_path = datapath('/Users/hanjaewon/폴더모음/학교생활/졸업과제/fasttextTest/model_report')
model = FT_gensim.load(cap_path)
corpus_file = datapath('/Users/hanjaewon/폴더모음/학교생활/졸업과제/fasttextTest/new.txt')

# model.build_vocab(sentences=common_texts)
# model.build_vocab(sentences=text)
# model.train(sentences=text, total_examples=len(text), epochs=model.epochs)

print("Is '채스' in the vocab?")
print('채스' in model.wv.vocab)
print(model.corpus_count)
# print("What is the vector of '채스'?")
# print(model.wv.__getitem__('채스'))

model.build_vocab(corpus_file=corpus_file, update=True)
model.train(
    corpus_file=corpus_file,
    epochs=model.epochs,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words)

print('---------- after the update ----------')
print("Is '채스' in the vocab?")
print('채스' in model.wv.vocab)
print(model.corpus_count)
# print("What is the vector of '채스'?")
# print(model.wv.__getitem__('채스'))
sentences = list(CreateCorpus(args.protocols))

if args.model_architecture == 'word2vec':
    model = Word2Vec(
        sentences=sentences, size=args.size, window=args.window,
        min_count=args.min_count, workers=args.threads,
        sg=args.sg, hs=args.hs, negative=args.ns)
elif args.model_architecture == 'fasttext':
    model = FT_gensim(
        size=args.size, window=args.window, min_count=args.min_count,
        workers=args.threads, sg=args.sg, hs=args.hs, negative=args.ns)
    # build the vocabulary
    model.build_vocab(sentences)
    # train the model
    model.train(
        sentences, epochs=model.epochs,
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words)

elapsed = time.time()
logging.info(f'Training finished. Took {elapsed - start} s')
import os
import pickle
import sys

import numpy as np
from gensim.models.fasttext import FastText
from scipy.spatial.distance import pdist, cdist, squareform
from scipy.spatial import distance

# from smart_bug import Statement_Norm, Statement_Vec
# from smart_bug import Statement_Vec

BUG_FASTTEXT_MODEL = FastText.load("../statement_level/Model/FastText/fasttext_model")
print("Statement FastText Model loaded")

sys.path.append('../statement_level/Normalize')
from statement_normalization import Statement_Norm

sys.path.append('../statement_level/Vectorize/')
from statement_vectorize import Statement_Vec


def save_to_file(messagecontent):
    if not os.path.exists('./USERINPUT'):
        os.makedirs('./USERINPUT')
    with open('./USERINPUT/current.sol', 'w') as handle:
        handle.write(messagecontent)


def parser():
    # cmd = "java -classpath ./State_Parse/antlr4.jar:./State_Parse/target/ Tokenize ./Bug/current.sol ./Bug/"
    cmd = ("java -classpath ../statement_level/Parse/antlr4.jar:../statement_level/Parse/target/ "
           "Tokenize ./USERINPUT/current.sol ./STATEMENT_RESULT/")
    os.system(cmd)
def genex_reviews():
    for i in df['Review'].values:
        yield simple_preprocess(i)


reviews = df['Review'].values
# reviews = genex_reviews()

# default window size is 5 (two words before and two words after the input word,
# in addition to the input word itself)

# train the CBOW (Continuous Bag of Words) model
model_cbow = Word2Vec(reviews, window=10, min_count=2, workers=10)
model_cbow.train(reviews, total_examples=len(reviews), epochs=50)

# train the char n-gram model (subword information) with fastText
model_subword = FastText(reviews, window=10, min_count=2, workers=10, min_n=3, max_n=6)
model_subword.train(reviews, total_examples=len(reviews), epochs=50)

# train the skip-gram model
model_skipgram = Word2Vec(reviews, window=10, min_count=2, workers=10, sg=1)
model_skipgram.train(reviews, total_examples=len(reviews), epochs=50)

# save the models
model_cbow.save("cbow.model")
model_subword.save("fasttext.model")
model_skipgram.save("skipgram.model")

# save the word vectors
model_cbow.wv.save("cbow_vector.bin")
model_subword.wv.save("subword_vector.bin")
model_skipgram.wv.save("skipgram_vector.bin")
def __init__(self, paraphrases):
    # mmap='r' memory-maps the vector arrays read-only, so multiple processes
    # can share a single copy of the embeddings.
    self.embeddings = FastText.load(EMBEDDINGS_FOLDER + EMBEDDINGS_FILE, mmap='r')
    self.paraphrases = paraphrases
    # self.stemmer = SnowballStemmer('russian')
    self.stemmer = Mystem()
def fit_model(self, corpus: List):
    self.model = FastText(sentences=corpus, **self.additional_parameters)
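# A minimal usage sketch for fit_model above (assumption: the enclosing class
# stores gensim FastText keyword arguments in `additional_parameters`; the
# names and values below are hypothetical):
#
#     wrapper.additional_parameters = {'size': 100, 'window': 5, 'min_count': 1}
#     wrapper.fit_model([['hello', 'world'], ['fasttext', 'handles', 'subwords']])
#     vec = wrapper.model.wv['hello']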
""" Script for ranking a list of pubmed results by date, ft, ... """ from gensim.models.fasttext import FastText from utils import preprocess import numpy as np import nltk from nltk.corpus import stopwords ft_model = FastText.load('models/ft/med_model_dim300_win5_min100.bin') stops = set(stopwords.words('english')) def _remove_stopwords(sentence): """ :param sentence: list of words :return: list of words """ if isinstance(sentence, list): return [word for word in sentence if not word in stops] else: sentence = sentence.split() sentence = [word for word in sentence if not word in stops] sentence = ' '.join(sentence) return sentence def similarity(w1, w2):
def test_bucket_ngrams(self):
    model = FT_gensim(size=10, min_count=1, bucket=20)
    model.build_vocab(sentences)
    self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10))
    model.build_vocab(new_sentences, update=True)
    self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10))
def __init__(self, model_name=None, storage=None, *args, **kwargs):
    BaseEmbeddingModel.__init__(self, model_name=model_name, storage=storage)
    FT_gensim.__init__(self, *args, **kwargs)
    self.model_type = FT_gensim.__name__.lower()
def fit(self, X: np.array, y: csr_matrix):
    if self.verbose:
        # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        # TODO revert this
        pass

    X_splitted = np.array([s.split() for s in X])
    # docs = [TaggedDocument(words=tokens, tags=[index]) for index, tokens in enumerate(X_splitted)]

    if self.model.lower() == 'fasttext':
        self.wv_model_ = FastText(
            sentences=X_splitted.tolist(), size=self.embedding_dim, iter=self.epochs,
            min_count=self.min_count, window=self.window_size, workers=self.n_jobs)
    elif self.model.lower() == 'doc2vec':
        self.wv_model_ = Word2Vec(
            sentences=X_splitted.tolist(), size=self.embedding_dim, iter=self.epochs,
            min_count=self.min_count, window=self.window_size, workers=self.n_jobs,
        )
    else:
        raise NotImplementedError

    tag_doc_mapping = self._create_tag_docs(y)

    if self.tfidf_weighting:
        self.tfidf_ = TfidfVectorizer()
        self.texts_tfidf_ = self.tfidf_.fit_transform(X)

    self.tag_embeddings_ = np.empty((y.shape[1], self.embedding_dim), dtype='float64')
    if self.verbose:
        tag_doc_iterator = tqdm(enumerate(tag_doc_mapping), desc='Computing tag embeddings')
    else:
        tag_doc_iterator = enumerate(tag_doc_mapping)

    for tag_id, texts_idx in tag_doc_iterator:
        # will be of shape (n_texts, embedding_dim)
        tag_word_embeddings = []
        for text_ind in texts_idx:
            for token in set(X_splitted[text_ind]):
                try:
                    word_embedding = self.wv_model_.wv[token]
                except KeyError:
                    # skip words that were ignored due to min_count
                    continue
                if self.tfidf_weighting:
                    token_ind = self.tfidf_.vocabulary_.get(token, -1)
                    if token_ind > -1:
                        tfidf_value = self.texts_tfidf_[text_ind, token_ind]
                        word_embedding = word_embedding * tfidf_value
                tag_word_embeddings.append(word_embedding)
        self.tag_embeddings_[tag_id] = self.pooling_func(tag_word_embeddings)

    return self
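# A minimal usage sketch for fit() above (assumption: the enclosing class is an
# estimator-style tag-embedding model; its name and constructor arguments below
# are hypothetical, inferred from the attributes fit() uses):
#
#     est = TagEmbedding(model='fasttext', embedding_dim=100, epochs=5,
#                        min_count=2, window_size=5, n_jobs=4,
#                        tfidf_weighting=True, verbose=True,
#                        pooling_func=lambda vecs: np.mean(vecs, axis=0))
#     est.fit(X, y)  # X: np.array of raw strings, y: csr_matrix of tag indicators
#     print(est.tag_embeddings_.shape)  # (n_tags, embedding_dim)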
import torch
import jieba
from torch import nn
from pytorch_pretrained_bert import BertModel, BertAdam
from gensim.models.fasttext import FastText
from tqdm import tqdm
from pyltp import Postagger

MODEL_PATH = 'bert-model'
jieba.load_userdict('bert-model/dict-traditional.txt')
seq_len = 512

# Load vocabularies
print('Building word vectors...')
word2vec = FastText.load_fasttext_format('bert-model/wordvec-large.dim1024')
vocab = {}
id2vocab = {}
vec = []
with open('bert-model/TF.csv') as TF:
    for idx, line in enumerate(tqdm(TF)):
        term = line.split(',')[0]
        vocab[term] = idx
        id2vocab[idx] = term
        vec.append(word2vec[term])
del word2vec

model = torch.load('checkpoint-generator-pretrain/bert-LanGen-last.pt')['full_model']
POS = Postagger()
POS.load('bert-model/ltp_data_v3.4.0/pos.model')