def train_save_word_embeddings(corpus_file_path, algo, vector_dim, word2vec_format=False,
                               save_model=False, architecture='cbow',
                               train_algorithm='negative', workers=4):
    """architecture: 'skip-gram' or 'cbow'. train_algorithm: 'softmax' or 'negative'"""
    sentences = MySentences(corpus_file_path)
    arch = 1 if architecture == 'skip-gram' else 0
    train = 1 if train_algorithm == 'softmax' else 0
    print('Training %s with size %d' % (algo, vector_dim))
    if algo == 'word2vec':
        model = Word2Vec(sentences=sentences, size=vector_dim, workers=workers, sg=arch, hs=train)
    else:  # fasttext
        model = FastText(sentences=sentences, size=vector_dim, workers=workers, sg=arch, hs=train)
    print('Done!')
    # corpus_file_path is a list of paths; build the output name from each file's basename
    s = algo + '_'
    for name in corpus_file_path:
        s += name.split('/')[-1].split('.')[0] + '_'
    filename = "{0}{1}_{2}_{3}".format(s, vector_dim, architecture, train_algorithm)
    if save_model:
        print('Saving model in {0}.model'.format(filename))
        model.save(os.path.join(word2vec_path, "{}.model".format(filename)))
    if word2vec_format:
        print('Saving word embeddings in original C word2vec (.txt) format in {}.txt'.format(filename))
        model.wv.save_word2vec_format(os.path.join(word2vec_path, "{}.txt".format(filename)))
    print('Saving word embeddings in {0}.kv'.format(filename))
    model.wv.save(os.path.join(save_path(algo), "{}.kv".format(filename)))
    print('Saved!')
    return model
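# A minimal usage sketch for the function above, assuming gensim 3.x and that
# MySentences, word2vec_path and save_path() are defined elsewhere in the repo;
# the corpus path is hypothetical.
model = train_save_word_embeddings(
    corpus_file_path=['corpora/wiki.txt'],  # hypothetical corpus file
    algo='fasttext',
    vector_dim=100,
    word2vec_format=True,   # also dump a plain-text .txt in C word2vec format
    save_model=True,        # keep the full model so training can be resumed
    architecture='skip-gram',
    train_algorithm='negative',
)
print(model.wv.most_similar('example', topn=5))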
def fastText(root, sentences, embedding_size=200, model_type=0, min_count=1, workers=4):
    """
    Trains and saves a fastText model on a given list of sentences

    Params:
        sentences: list of tokenized sentences
        embedding_size: size of the embeddings
        model_type: int, 0: CBOW, 1: skip-gram
        min_count: ignore all words with total frequency lower than this
        workers: number of workers for parallelization

    The saved file stores the model in a format compatible with the original
    word2vec implementation.
    https://groups.google.com/forum/#!topic/gensim/RhiwLU0vP1A
    """
    logger.info("Train the fasttext model")
    model = FastText(sentences, min_count=min_count, workers=workers, sg=model_type,
                     size=embedding_size)

    logger.info("Save the trained model")
    model.save(root + 'fasttext_model')

    logger.info("Save the model in word2vec format")
    fname = 'fasttext_word2vecFormat'
    model.wv.save_word2vec_format(root + fname)
class ModelFastText:
    def __init__(self, path, existModel=False):
        if existModel:
            self.loadModel(path)
        else:
            self.createModel(path)

    def createModel(self, pathTrain, size=300, min_count=50, sg=1, workers=8, progress_per=50000):
        # use the keyword arguments instead of re-hardcoding their default values
        self.model = FT_gensim(size=size, min_count=min_count, sg=sg, workers=workers)
        sentences = datapath(pathTrain)
        self.model.build_vocab(corpus_file=sentences, progress_per=progress_per)

    def loadModel(self, path):
        self.model = FT_gensim.load(path)

    def trainModel(self, pathTrain, epochs=5, compute_loss=False):
        sentences = datapath(pathTrain)
        # training from a file requires the corpus_file keyword plus the
        # corpus statistics collected by build_vocab()
        self.model.train(corpus_file=sentences, epochs=epochs,
                         total_examples=self.model.corpus_count,
                         total_words=self.model.corpus_total_words,
                         compute_loss=compute_loss)

    def saveModel(self, nameFile):
        self.model.save(nameFile + ".model")

    def getSimilar(self, word):
        return self.model.wv.most_similar(word, topn=50)
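# A minimal usage sketch for the class above, assuming gensim 3.x (FT_gensim is
# gensim.models.fasttext.FastText) and a line-per-sentence corpus. Note that
# datapath() resolves relative names against gensim's test-data directory, so an
# absolute path is used here; all paths are hypothetical.
ft = ModelFastText('/data/corpus.txt')        # builds the vocabulary
ft.trainModel('/data/corpus.txt', epochs=5)   # trains on the same corpus file
ft.saveModel('fasttext_demo')                 # writes fasttext_demo.model
print(ft.getSimilar('computer')[:5])

# Reload the saved model later instead of retraining:
ft2 = ModelFastText('fasttext_demo.model', existModel=True)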
def build_fast_text_model(fasttext_entity_path):
    # build fastText
    fasttext_params = {
        "hs": 1,
        "window": 10,
        "min_count": 1,
        "workers": 7,
        "min_n": 1,
        "max_n": 10,
    }
    print("building corpus")
    entity_corpus = [entity for entity in entity_generator(entity_collection)]
    fasttext_entity = FastText(**fasttext_params)
    print("count corpus")
    fasttext_entity.build_vocab(sentences=entity_corpus)
    total_examples = fasttext_entity.corpus_count
    print("train fasttext")
    fasttext_entity.train(sentences=entity_corpus, total_examples=total_examples, epochs=5)
    print("saving fasttext")
    fasttext_entity.save(fasttext_entity_path)
    return fasttext_entity
def generate_model():
    comments = []
    for comment in unlabeledData["comment"]:
        comments += to_separate_sentences(comment)
    print("# of comments taken for building the model: " + str(len(comments)))

    num_features = 1000   # Word vector dimensionality (used below, so must not be commented out)
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words
    min_word_count = 1    # Minimum word count - if not occurred this much, remove
    num_workers = 4       # Number of threads to run in parallel

    model = FastText(comments, workers=num_workers, size=num_features, min_count=min_word_count,
                     window=context, sample=downsampling, sg=1, iter=50)
    # model.init_sims(replace=True)  # If you don't plan to train the model any further
    model.save(fasttext_model)
    check_model_qulity(model, 'නැහැ')
    return
def main():
    skills_train = []
    for each_skill in data["clean_skills"]:
        if each_skill != ['']:
            skills_train.append(each_skill)

    embedding_size = 60
    window_size = 40
    min_word = 5
    down_sampling = 1e-2

    model = FastText(skills_train, size=embedding_size, window=window_size,
                     min_count=min_word, sample=down_sampling, sg=1, iter=100)
    model.init_sims(replace=True)  # normalize vectors; only safe if no further training
    print(model)

    fname = get_tmpfile("fasttext.model")
    model.save(fname)
class FastTextTrainer(object):
    """ Perform training and save gensim FastText """

    def __init__(self, min_count=2, size=200, workers=4, window=3, iter=10):
        self.min_count = min_count
        self.size = size
        self.workers = workers
        self.window = window
        self.iter = iter
        self.model = None

    def train(self, corpus, sg=0, callbacks=None):
        # gensim expects an iterable of callbacks, so fall back to an empty tuple
        self.model = FastText(corpus, callbacks=callbacks or (), min_count=self.min_count,
                              size=self.size, workers=self.workers, window=self.window,
                              iter=self.iter, sg=sg)

    def save(self, filename):
        self.model.save(filename)

    def get_model(self):
        return self.model

    def load_model(self, filename):
        return FastText.load(filename)

    def load_google_model(self, filename):
        # loads a model trained with Facebook's original fastText binary
        return FastText.load_fasttext_format(filename)

    def retrain(self, model, corpus, sg=0, iter=10, callbacks=None):
        model.train(corpus, total_examples=model.corpus_count, epochs=iter,
                    callbacks=callbacks or ())
        self.model = model
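# A minimal sketch of the trainer above, assuming gensim 3.x and a small
# in-memory corpus; the token lists and file name are hypothetical.
corpus = [['hello', 'world'], ['machine', 'learning', 'is', 'fun'],
          ['hello', 'machine']]
trainer = FastTextTrainer(min_count=1, size=50, window=3, iter=10)
trainer.train(corpus, sg=1)
trainer.save('fasttext_trainer.model')

# Continue training the saved model on new data:
model = trainer.load_model('fasttext_trainer.model')
trainer.retrain(model, [['more', 'sentences']], iter=5)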
def test_online_learning_after_save(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(tmpf)
    model_neg = FT_gensim.load(tmpf)
    self.assertEqual(len(model_neg.wv.vocab), 12)  # assertTrue(x, msg) would always pass
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count,
                    epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
def build_model(data_path, sg=0, hs=0, negative=5, size=100, min_count=5, workers=8,
                window=5, model_type='word2vec'):
    if model_type == 'word2vec':
        model = Word2Vec(LineSentence(data_path), sg=sg, workers=workers, min_count=min_count,
                         size=size, hs=hs, negative=negative, window=window)
    elif model_type == 'fasttext':
        model = FastText(LineSentence(data_path), sg=sg, workers=workers, min_count=min_count,
                         size=size, hs=hs, negative=negative, window=window)
    else:
        raise Exception('Model type is not supported.')

    vectors = model.wv.vectors
    vocabulary = model.wv.index2word

    # index 0 is reserved for the zero-vector <pad> token in all three exported files
    with open('index_to_vectors.txt', 'w+', encoding='utf-8') as vectors_file:
        content = '\n'.join([
            '{}\t{}'.format(i + 1, ','.join([str(item) for item in vector]))
            for i, vector in enumerate(vectors)
        ])
        content = '{}\t{}\n'.format(0, ','.join(['0'] * size)) + content
        vectors_file.write(content)

    with open('data/word_to_vectors.txt', 'w+', encoding='utf-8') as vectors_file:
        content = '\n'.join([
            '{}\t{}'.format(vocabulary[i], ','.join([str(item) for item in vector]))
            for i, vector in enumerate(vectors)
        ])
        content = '{}\t{}\n'.format('<pad>', ','.join(['0'] * size)) + content
        vectors_file.write(content)

    with open('data/vocabulary.txt', 'w+', encoding='utf-8') as vocab_file:
        content = '\n'.join([
            '{}\t{}'.format(i + 1, str(vocab))
            for i, vocab in enumerate(vocabulary)
        ])
        content = '{}\t{}\n'.format(0, '<pad>') + content
        vocab_file.write(content)

    model_path = 'word2vec.model'
    model.save(model_path)
    return model_path
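# A minimal sketch of calling build_model above, assuming gensim 3.x, a
# whitespace-tokenized one-sentence-per-line corpus at a hypothetical path,
# and an existing data/ directory for the exported vocabulary files.
path = build_model('corpus.txt', sg=1, size=100, min_count=2, model_type='fasttext')
model = FastText.load(path)  # the returned path points at the saved model file
print(model.wv.most_similar('example', topn=3))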
def test_online_learning_after_save(self):
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(testfile())
    model_neg = FT_gensim.load(testfile())
    self.assertEqual(len(model_neg.wv.vocab), 12)  # assertTrue(x, msg) would always pass
    self.assertEqual(len(model_neg.wv.ngrams), 202)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count,
                    epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
    self.assertEqual(len(model_neg.wv.ngrams), 271)
def fasttext_train_model():
    # read corpus: one sentence per line, split into lists of tokens
    # (gensim expects an iterable of token lists, not raw strings)
    with open(corpus_dir) as fp:
        corpus = [line.split() for line in fp]
    model = FastText(size=embed_size, window=3, min_count=1)  # instantiate
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=epoch)  # train
    model.save(model_dir)  # save model
def create_FastText_model(skip_gram, tokenized_sentences, model_path):
    try:
        model = FastText.load(model_path)
    except Exception:
        # vector_size belongs in the constructor, not in train()
        model = FastText(vector_size=5, min_count=1, window=5, sg=skip_gram)
        model.build_vocab(sentences=tokenized_sentences)
        model.train(sentences=tokenized_sentences,
                    total_examples=len(tokenized_sentences), epochs=10)
        model.save(model_path)
    return model
def train_word_vectors(modeltype, sentences, path, sg, size, mincount):
    print(path)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if modeltype == 'word2vec':
        model = Word2Vec(sentences, sg=sg, size=size, min_count=mincount)
        model.save(path)
    elif modeltype == 'fasttext':
        model = FastText(sentences, sg=sg, size=size, min_count=mincount)
        model.save(path)
    else:
        raise Exception("%s is not a valid model type" % modeltype)
def test_persistence(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.save(tmpf)
    self.models_equal(model, FT_gensim.load(tmpf))

    # test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(tmpf)
    loaded_wv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def test_persistence(self):
    model = FT_gensim(sentences, min_count=1)
    model.save(testfile())
    self.models_equal(model, FT_gensim.load(testfile()))

    # test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(testfile())
    loaded_wv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
    self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams))
def test_norm_vectors_not_saved(self):
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(testfile())
    loaded_model = FT_gensim.load(testfile())
    self.assertTrue(loaded_model.wv.syn0norm is None)
    self.assertTrue(loaded_model.wv.syn0_ngrams_norm is None)

    wv = model.wv
    wv.save(testfile())
    loaded_kv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(loaded_kv.syn0norm is None)
    self.assertTrue(loaded_kv.syn0_ngrams_norm is None)
def generate_model(lang):
    model_gensim = FT_gensim(size=300)

    # build the vocabulary
    model_gensim.build_vocab(corpus_file='embedding/corpus_' + lang)

    # train the model
    model_gensim.train(corpus_file='embedding/corpus_' + lang,
                       epochs=model_gensim.epochs,
                       total_examples=model_gensim.corpus_count,
                       total_words=model_gensim.corpus_total_words)

    # note: model.save() writes gensim's own pickle format, despite the .vec extension
    model_gensim.save('embedding/fasttext_' + lang + '.vec')
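# A minimal sketch of reloading the model saved above, assuming gensim 3.x;
# the language code is hypothetical. Because model.save() used gensim's own
# format, FT_gensim.load() (not a .vec text loader) reads it back, and
# out-of-vocabulary words still get vectors from character n-grams.
model = FT_gensim.load('embedding/fasttext_en.vec')
print(model.wv['unseenword'][:5])  # works even if 'unseenword' never occurred in the corpus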
def main():
    print('Instantiating the model')
    model = FT_gensim(size=100, window=5, min_count=5)  # instantiate the model

    print('Building the vocabulary')
    model.build_vocab(sentences=MyIter())
    total_examples = model.corpus_count

    print('Training the model')
    model.train(sentences=MyIter(), total_examples=total_examples, epochs=5)  # train the model

    # Save the model (can be loaded using gensim)
    print('Saving the model to specified filepath')
    save_file = sys.argv[2]
    model.save(save_file)
def test_norm_vectors_not_saved(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(tmpf)
    loaded_model = FT_gensim.load(tmpf)
    self.assertTrue(loaded_model.wv.vectors_norm is None)
    self.assertTrue(loaded_model.wv.vectors_ngrams_norm is None)

    wv = model.wv
    wv.save(tmpf)
    loaded_kv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(loaded_kv.vectors_norm is None)
    self.assertTrue(loaded_kv.vectors_ngrams_norm is None)
def word_embedding(tokens):
    """
    Train word vectors and save the model.

    Args:
        tokens (list): tokens from word segmentation
    """
    # input must be 2D even if there is only one piece of data
    # tokens = [['xx', 'xxx']]
    model = FastText([tokens], min_count=1, size=3)
    get_vector = model.wv['半導體']
    model.save('./data/wb.model')
    print(get_vector)
    print(model.wv.vocab)
def test_persistence_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(corpus_file=corpus_file, min_count=1)
        model.save(tmpf)
        self.models_equal(model, FT_gensim.load(tmpf))

        # test persistence of the KeyedVectors of a model
        wv = model.wv
        wv.save(tmpf)
        loaded_wv = FastTextKeyedVectors.load(tmpf)
        self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
        self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def createFastTextModel(data, isSG, vectorSize, maxNgram, modelFilePath):
    # train model
    model_gensim = FT_gensim(sg=isSG, size=vectorSize, min_count=1, min_n=1, max_n=maxNgram)

    # build the vocabulary
    model_gensim.build_vocab(data)

    # train the model
    model_gensim.train(data, total_examples=model_gensim.corpus_count,
                       epochs=model_gensim.iter)

    # save
    model_gensim.save(modelFilePath)
def test_online_learning_after_save_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42,
                              hs=0, negative=5)
        model_neg.save(tmpf)
        model_neg = FT_gensim.load(tmpf)
        self.assertEqual(len(model_neg.wv.vocab), 12)  # assertTrue(x, msg) would always pass
        model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        model_neg.train(corpus_file=new_corpus_file,
                        total_words=model_neg.corpus_total_words,
                        epochs=model_neg.iter)
        self.assertEqual(len(model_neg.wv.vocab), 14)
def embedding_creation(args, full_article):
    word_punctuation_tokenizer = nltk.WordPunctTokenizer()
    word_tokenized_corpus = [
        word_punctuation_tokenizer.tokenize(sent) for sent in full_article
    ]
    embedd_model = FastText(word_tokenized_corpus,
                            size=args.embedding_size,
                            window=args.window_size,
                            min_count=args.min_word,
                            sample=args.down_sampling,
                            sg=0,
                            iter=50)
    print('Finished and saving model at location', args.embeddingPath)
    embedd_model.save(args.embeddingPath + args.embedding_Model)
    return
def fast_text_trainer(df, model_path):
    list_of_tokens = list(df["description"])
    if isinstance(list_of_tokens[0], str):
        # list_of_tokens = [literal_eval(x) for x in list_of_tokens]
        tokenized_data = df[['description']].applymap(lambda s: word_tokenize(s))
        list_of_tokens = list(tokenized_data["description"])
    start_time = time.time()
    model = FastText(sentences=list_of_tokens, sg=1, size=60, window=5, min_count=1,
                     workers=3, iter=30)
    print("Time taken to train the fast_text model: "
          + str(int((time.time() - start_time) / 60)) + ' minutes\n')
    model.save(model_path)
def train_fasttext_model(output_model_path, iter_docs, size=300, window=8, min_count=5,
                         sg=1, epoch=5):
    """
    Parameters
    ----------
    output_model_path : string
        path of fastText model
    iter_docs : callable
        returns an iterator of documents, each a list of tokens
        (called twice: once for build_vocab and once for train)
    size : int
        size of word vector
    window : int
        window size of word2vec
    min_count : int
        minimum word count
    sg : int
        training algorithm (1: skip-gram, other: CBOW)
    epoch : int
        number of epochs
    """
    logging.info("build vocabularies")
    model = FastText(size=size, window=window, min_count=min_count, sg=sg,
                     workers=multiprocessing.cpu_count())
    model.build_vocab(iter_docs())
    logging.info("train fasttext")
    model.train(iter_docs(), total_examples=model.corpus_count, epochs=epoch)
    model.init_sims(replace=True)
    logging.info("save model")
    p = Path(output_model_path)
    if not p.parent.exists():
        p.parent.mkdir(parents=True)
    model.save(output_model_path)
    logging.info("done.")
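# A minimal sketch of calling train_fasttext_model above: iter_docs must be a
# callable that yields a fresh iterator of tokenized documents each time it is
# invoked. The corpus path and whitespace tokenization are hypothetical.
def iter_docs():
    with open('corpus.txt', encoding='utf-8') as f:
        for line in f:
            yield line.split()

train_fasttext_model('models/fasttext.model', iter_docs, size=100, epoch=5)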
def instantiateEmbenddingMatrix(corpus, tokenizer, vocabulary_size, sequence_length,
                                feature_size, window_context=3, min_word_count=10,
                                sample=1e-3, sg=0, overwrite=False, load=True):
    if load:
        try:
            embedding_matrix = None
            print("Loading embedding matrix...")
            embedding_matrix = np.genfromtxt('embedding.csv', delimiter=',')
            ft_model = FastText.load("ft_model.model")
        except Exception:
            embedding_matrix = None
    else:
        embedding_matrix = None

    if embedding_matrix is None or overwrite or not load:
        # feature_size: word vector dimensionality; window_context: context window size;
        # min_word_count: minimum word count; sample: downsample setting for frequent words;
        # sg selects skip-gram (1) or CBOW (0)
        ft_model = FastText(corpus, min_n=0, max_n=3, size=feature_size, window=window_context,
                            min_count=min_word_count, sample=sample, sg=sg, iter=1000)
        ft_model.save("ft_model.model")

        print('Preparing embedding matrix...')
        words_not_found = []
        nb_words = vocabulary_size
        word_index = tokenizer.word_index
        embedding_matrix = np.zeros((nb_words, feature_size))
        for word, i in word_index.items():
            if i >= nb_words:
                continue
            embedding_vector = ft_model.wv.get_vector(word)
            if (embedding_vector is not None) and len(embedding_vector) > 0:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
            else:
                words_not_found.append(word)
        print(embedding_matrix.shape)
        print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
        np.savetxt('embedding.csv', embedding_matrix, delimiter=',')

    return embedding_matrix, ft_model
def BuildFastText(reviews, label):
    # input: sentences already tokenized into lists of tokens
    model = FastText(size=200, window=3, min_count=4)
    model.build_vocab(sentences=reviews)
    model.train(sentences=reviews, total_examples=len(reviews), epochs=50)
    model.save("Model/fasttext.bin")
    print("Save model done!")

    a = 0  # index of the current review, used to look up its label
    b = 0  # number of reviews that failed vector lookup
    dataf = list()
    for review in reviews:
        len_sen = len(review)
        try:
            # average the word vectors of the review into one sentence vector
            vectors = model.wv[review]
            sumvec = 0
            for i in range(0, len_sen):
                sumvec = sumvec + vectors[i]
            sumvec = sumvec / len_sen
            dataf.append(sumvec)
        except Exception:
            # fall back to a label-dependent placeholder vector
            sumvec = 0
            b = b + 1
            if label[a] == 0:
                sumvec = model.wv["positive"]
            if label[a] == 1:
                sumvec = model.wv["negative"]
            dataf.append(sumvec)
        a = a + 1  # must advance with the loop so label[a] matches the review
    dataf = np.array(dataf)
    return dataf
def fast_text_training(merger_seg_path, model_path, min_count=5, embedding_size=200):
    '''
    Train embeddings with fastText.
    :param merger_seg_path: path to the segmented (tokenized) corpus
    :param model_path: where to save the trained model
    :param min_count: minimum word count
    :param embedding_size: word vector dimensionality
    :return:
    '''
    # use the keyword arguments instead of re-hardcoding their default values
    model = FastText(sentences=LineSentence(merger_seg_path), workers=8,
                     min_count=min_count, size=embedding_size)
    model.save(model_path)
    print("Words most similar to 奇瑞:\n", model.wv.most_similar(['奇瑞'], topn=10))
def create_FastText_model(model_path, skip_gram, epochs_count, tokenized_sentences):
    try:
        model = FastText.load(model_path)
    except Exception:
        model = FastText(min_n=1, hs=1, alpha=0.1, min_alpha=0.1, sg=skip_gram)
        model.build_vocab(sentences=tokenized_sentences)
        model.train(sentences=tokenized_sentences,
                    total_examples=len(tokenized_sentences), epochs=epochs_count)
        model.save(model_path)
    return model
def main(args):
    sentences = gensim.models.word2vec.Text8Corpus(args.corpus)
    my_model = FastText(sentences=sentences,
                        size=int(args.size),
                        window=int(args.window),
                        min_count=int(args.min_count),
                        workers=int(args.workers),
                        alpha=float(args.alpha),
                        sample=float(args.subsample),
                        negative=int(args.negative),
                        sorted_vocab=True,
                        iter=int(args.epochs))
    my_model.save(
        os.path.join(file_dir, '../allnews_am/models/', args.model_name))
    analogy_file = os.path.join(file_dir, '../allnews_am/data/yerevann_analogies.txt')
    my_model.wv.evaluate_word_analogies(analogy_file)