def load(self, embedding_fname, embedding_url=None, *args, **kwargs):
    """Load a pretrained fastText embedding model from file, downloading it if necessary.

    Args:
        embedding_fname: path to the pretrained fastText model file
        embedding_url: URL to download the model from if the file is missing

    Returns:
        Nothing
    """
    if not embedding_fname:
        raise RuntimeError('No pretrained fasttext intent_model provided')
    fasttext_model_file = embedding_fname
    if not Path(fasttext_model_file).is_file():
        emb_path = embedding_url
        if not emb_path:
            raise RuntimeError('No pretrained fasttext intent_model provided')
        embedding_fname = Path(fasttext_model_file).name
        try:
            download_path = './'
            download_untar(embedding_url, download_path)
        except Exception as e:
            raise RuntimeError('Looks like the `EMBEDDINGS_URL` variable is set incorrectly', e)
    self.model = FastText.load_fasttext_format(fasttext_model_file)
    return
def prepare(self):
    self.word2idx = defaultdict(int)
    # to make sure start_symbol, end_symbol, pad, and unk will be included
    self.word2idx[self.START_SYMBOL] = self.word2idx[self.END_SYMBOL] = \
        self.word2idx[self.UNK] = self.word2idx[self.PAD] = self.min_word_freq
    for dataset_type in ["train", "val"]:
        caps = dset.CocoCaptions(
            root=FilePathManager.resolve(f'data/{dataset_type}'),
            annFile=FilePathManager.resolve(f"data/annotations/captions_{dataset_type}2017.json"),
            transform=transforms.ToTensor())
        for _, captions in caps:
            for capt in captions:
                tokens = self.tokenize(capt)
                for token in tokens:
                    self.word2idx[token] += 1
    temp = {}
    embeddings = {}
    fast_text = FastText.load(FilePathManager.resolve("data/fasttext.model"), mmap="r")
    for k, v in self.word2idx.items():
        if v >= self.min_word_freq:
            temp[k] = len(temp)
            embeddings[k] = fast_text[k] if k in fast_text else fast_text[self.UNK]
    self.word2idx = temp
    # swap keys and values
    self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys()))
    self.fast_text = embeddings
def get_embedding_model():
    global _embedding_model
    if not _embedding_model:
        try:
            _embedding_model = FastText.load_word2vec_format(project.aux_dir + FASTTEXT_VOCAB)
        except IOError:
            _embedding_model = None
    return _embedding_model
def fit(self, X, y=None):
    X.to_csv(self.inputFile, index=False)
    corpus_file = datapath(self.inputFile)
    self.model_wrapper = FT_wrapper.train(self.ft_home, self.inputFile,
                                          model=self.model, size=self.size,
                                          word_ngrams=self.word_ngrams)
    return self
def load_fasttext_format(cls, *args, **kwargs):
    """Load a :class:`~gensim.models.fasttext.FastText` model from a format compatible
    with the original fasttext implementation.

    Parameters
    ----------
    fname : str
        Path to the file.

    """
    return Ft_Wrapper.load_fasttext_format(*args, **kwargs)
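# A hedged usage sketch (not part of the original snippet): the classmethod above simply
# delegates to the deprecated gensim wrapper, so it can be exercised directly through that
# wrapper. "wiki.simple" is a placeholder for the basename of a pretrained .bin/.vec pair.
from gensim.models.wrappers.fasttext import FastText as Ft_Wrapper

model = Ft_Wrapper.load_fasttext_format("wiki.simple")
print(model.wv["night"].shape)  # vectors are composed from character n-grams, so OOV words work too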
def test_sg_hs_against_wrapper(self):
    if self.ft_path is None:
        logger.info("FT_HOME env variable not set, skipping test")
        return

    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_wrapper = FT_wrapper.train(
        ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
        output_file=tmpf, model='skipgram', size=50, alpha=0.025, window=5,
        min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0,
        iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

    model_gensim = FT_gensim(
        size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    orig0 = np.copy(model_gensim.wv.syn0[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
    self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all())  # vector should vary after training
    self.compare_with_wrapper(model_gensim, model_wrapper)
def make_w2v(vocab):
    d = {}
    if opt.lang == 'en_w2v':
        model = KeyedVectors.load_word2vec_format('../../../GoogleNews-vectors-negative300.bin', binary=True)
    if opt.lang == 'en_fast':
        model = KeyedVectors.load_word2vec_format('../../../wiki-news-300d-1M.vec')
    if opt.lang == 'es':
        model = FastText.load_fasttext_format('../../../cc.es.300.bin')
    if opt.lang == 'fr':
        model = FastText.load_fasttext_format('../../../cc.fr.300.bin')
    for i in range(4, vocab.size()):
        word = vocab.idxToLabel[i]
        # if opt.lang == 'en_w2v':
        #     if model.emb(word)[0] != None:
        #         d[i] = model.emb(word)
        # d[i] = model[word]
        if word in model:
            d[i] = model[word]
    return d
def reduce_fasttext_embedding(fasttext_path, words):
    model = FT_wrapper.load(fasttext_path)
    print(model)
    word_to_embedding = {}
    coverage = 0
    for word in words:
        key = word.lower()
        if word in model:
            coverage = coverage + 1
            word_to_embedding[key] = model[key]
        else:
            word_to_embedding[key] = None
    print('fastText cache: {}/{} words'.format(coverage, len(words)))
    return word_to_embedding
def make_embedding_matrix(train_captions):
    tokenizer.fit_on_texts(train_captions)
    model = FastText.load_fasttext_format(cfg.fasttext)

    # ---------- build the embedding matrix ----------
    vocab_size = len(tokenizer.word_index)
    embedding_matrix = np.random.random((vocab_size, 256))
    for word, i in tokenizer.word_index.items():  # indices start from 1
        try:
            embedding_vector = model[word]
        except KeyError:
            # word appears fewer times than min count
            # print(word, 'not found')
            embedding_vector = None
        if embedding_vector is not None:
            embedding_matrix[i - 1] = embedding_vector
    return embedding_matrix
def load_pretrained_fasttext():
    # Set FastText home to the path to the FastText executable
    ft_home = '/home/dev/fastText/fasttext'

    # Set file names for train and test data
    train_file = config.pos_path

    # Use Facebook corpus
    # model = FastText.load_word2vec_format('/home/dev/wiki.ko.vec')
    model = FastText.train(ft_home, train_file, min_count=1)
    print(model)

    result = model.most_similar(positive=['김승우'])
    print(result)
    return model
def test_sg_hs_against_wrapper(self):
    if self.ft_path is None:
        logger.info("FT_HOME env variable not set, skipping test")
        return

    model_wrapper = FT_wrapper.train(
        ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
        output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=5,
        min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0,
        iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

    model_gensim = FT_gensim(
        size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    orig0 = np.copy(model_gensim.wv.syn0[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
    self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all())  # vector should vary after training
    self.compare_with_wrapper(model_gensim, model_wrapper)
def embedding_weights_load(words_map, embedding_weights_path):
    pre_trained_embedding = None
    try:
        model = FastText.load_fasttext_format(embedding_weights_path)
        pre_trained_embedding = "bin"
    except:
        print("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")

        print("Loading vectors...")
        if os.path.exists("./Word_embedding_model.pkl"):
            with open("./Word_embedding_model.pkl", mode="rb") as f:
                model = pickle.load(f)
        else:
            model = KeyedVectors.load_word2vec_format('./Word_embedding/wiki.en.vec')
            with open("Word_embedding_model.pkl", mode="wb") as f:
                pickle.dump(model, f)
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]
    w = np.zeros((vocab_size, word_dimension), dtype=np.float32)
    for k, v in words_map.items():
        word = k
        word_number = v
        try:
            w[word_number][:] = model[word]
        except KeyError as e:
            if pre_trained_embedding == "bin":
                w[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                w[word_number][:] = np.random.uniform(-0.25, 0.25, word_dimension)
    return w
def embedding_weights_load(words_map, embeddingWeights_path):
    pre_trained_embedding = None
    try:
        # load the .bin file if one is available
        model = FastText.load_fasttext_format(embeddingWeights_path)
        pre_trained_embedding = "bin"
    except:
        # otherwise fall back to the Wikipedia word vectors
        print("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")

        print("Loading vectors...")
        model = KeyedVectors.load_word2vec_format('./Word_embedding/wiki.en.vec')
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]  # embedding dimensionality
    W = np.zeros((vocab_size, word_dimension), dtype=np.float32)  # matrix holding the embeddings

    for k, v in words_map.items():  # k is the word, v is the word ID
        word = k
        word_number = v
        # words missing from the model get a random embedding
        try:
            W[word_number][:] = model[word]
        except KeyError as e:
            if pre_trained_embedding == "bin":
                W[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                W[word_number][:] = np.random.uniform(-0.25, 0.25, word_dimension)
    return W
def load(self):
    self.logger.info("Loading model: {}".format(self.model_path))
    self.model = FT_wrapper.load(self.model_path)
    self.logger.info("Model loaded")
def get_fasttext():
    global _fasttext
    if _fasttext is None:
        log.debug("Loading fasttext model..")
        _fasttext = FastText.load_fasttext_format(FASTTEXT_PATH)
    return _fasttext
# print('Training gensim fasttext model...')
# tstart = time.time()
# model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
# tend = time.time()
# print('Time elapsed for training gensim model %.2f' % (tend - tstart))
# print(model_gensim)

with open(data_dir + 'questions_file.txt', 'w') as fw:
    for line in train_data:
        fw.write(line + '\n')
print('Text saved to %s' % (data_dir + 'questions_file.txt'))

# train the model
print('Training wrapper fasttext model...')
tstart = time.time()
model_wrapper = FT_wrapper.train(ft_home, data_dir + 'questions_file.txt')
tend = time.time()
print('Time elapsed for training wrapper model %.2f' % (tend - tstart))
print(model_wrapper)

# # saving a model trained via gensim's fastText implementation
# print('Saving fasttext gensim model...')
# model_gensim.save(output_dir + 'saved_model_gensim')
# loaded_model = FT_gensim.load(output_dir + 'saved_model_gensim')
# print(loaded_model)

# saving and reloading a model trained via the fastText wrapper
print('Saving fasttext wrapper model...')
model_wrapper.save(output_dir + 'saved_model_wrapper')
loaded_model = FT_wrapper.load(output_dir + 'saved_model_wrapper')
print(loaded_model)
def load(self, fname):
    self.fasttext = FastText.load_fasttext_format(fname)
# In[1]:

from gensim.models import KeyedVectors
from gensim.models.wrappers.fasttext import FastText

# In[2]:

## load models
word2vec = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/word2vec200.model")
word2vec_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/word2vec_weighted.model")
fasttext = FastText.load_fasttext_format(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/fasttext_model_200dim")
fasttext_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/fasttext_weighted.model")
poincare_vec = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/poincare_vec.model")
poincare_vec_weighted = KeyedVectors.load(
    "../japanese-dataset/livedoor-news-corpus/model/vector-response-test/poincare_vec_weighted.model")

# In[3]:

len(word2vec.most_similar("独身"))
def load_fasttext_format(cls, *args, **kwargs):
    return Ft_Wrapper.load_fasttext_format(*args, **kwargs)
start = time.time()
clf = lgb.LGBMClassifier(objective="multiclass")
clf.fit(plain_fasttext, train["class"])
Y_true, Y_pred = test["class"], clf.predict(plain_fasttext_test)
print("Report")
print(classification_report(Y_true, Y_pred, digits=6))
print("Accuracy: ", clf.score(plain_fasttext_test, test["class"]))
print("Time taken:", time.time() - start, "\n")

# In[ ]:

## SCDV based fasttext
from gensim.models.wrappers.fasttext import FastText

fasttext_model_200 = FastText.load_fasttext_format(
    '../japanese-dataset/livedoor-news-corpus/for-fasttext/fasttext_model_200dim')

# In[ ]:

# Get word vectors for all words in vocabulary.
word_vectors = fasttext_model_200.wv.syn0

# Set number of clusters.
num_clusters = 60

# Uncomment below line for creating new clusters.
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)

# Uncomment below lines for loading saved cluster assignments and probability of cluster assignments.
# idx_name = "gmm_latestclusmodel_len2alldata.pkl"
# idx_proba_name = "gmm_prob_latestclusmodel_len2alldata.pkl"
                  total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)
print(model_gensim)

# ### Using wrapper for fastText's C++ code

# In[*]

from gensim.models.wrappers.fasttext import FastText as FT_wrapper

# Set FastText home to the path to the FastText executable
ft_home = '/usr/local/bin/fasttext'

# train the model
model_wrapper = FT_wrapper.train(ft_home, lee_train_file)

print(model_wrapper)

# ### Training hyperparameters

# Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the following parameters from the original word2vec (see the sketch below):
# - model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)
# - size: Size of embeddings to be learnt (Default 100)
# - alpha: Initial learning rate (Default 0.025)
# - window: Context window size (Default 5)
# - min_count: Ignore words with number of occurrences below this (Default 5)
# - loss: Training objective. Allowed values: `ns`, `hs`, `softmax` (Default `ns`)
# - sample: Threshold for downsampling higher-frequency words (Default 0.001)
# - negative: Number of negative words to sample, for `ns` (Default 5)
# - iter: Number of epochs (Default 5)
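# In[*]

# A hedged sketch (not from the original notebook): the hyperparameters listed above can be
# passed directly to `FT_wrapper.train`; `ft_home` and `lee_train_file` are reused from the
# cells above, and the values shown are just the documented defaults with skip-gram enabled.
model_wrapper_sg = FT_wrapper.train(ft_home, lee_train_file,
                                    model='skipgram', size=100, alpha=0.025, window=5,
                                    min_count=5, loss='ns', sample=0.001, negative=5, iter=5)
print(model_wrapper_sg)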
def embdReader(embd_path, embd_dim, word_index, max_nb_words,
               fasttext_source='', ft_dim=0,
               ft_home='/data2/tonyq/fastText/fasttext',
               output_dir='/data2/tonyq/quora-output/',
               skip_header=False, initializer='glorot'):
    ########################################
    ## index word vectors
    ########################################
    if not embd_path == '':
        logger.info('Indexing word vectors...')
        embeddings_index = {}
        with open(embd_path, 'r', encoding='utf8') as f:
            if skip_header or embd_path.endswith('.vec'):
                next(f)
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        logger.info('Found %d word vectors in embedding file.' % len(embeddings_index))

    ########################################
    ## prepare fasttext
    ########################################
    if not fasttext_source == '':
        from gensim.models.wrappers.fasttext import FastText as FT_wrapper
        if fasttext_source.endswith('.bin'):
            # load a previously trained wrapper model (assigned to model_wrapper so the
            # embedding lookup below works for both branches)
            model_wrapper = FT_wrapper.load(fasttext_source)
            print(model_wrapper)
        else:
            _, train_question1, train_question2 = get_pdTable(fasttext_source, notag=True)
            train_question1, train_maxLen1 = text_cleaner(train_question1)
            train_question2, train_maxLen2 = text_cleaner(train_question2)
            train_data = train_question1 + train_question2
            print('Train data lines %d' % len(train_data))
            with open(output_dir + 'questions_file.txt', 'w') as fw:
                for line in train_data:
                    fw.write(line + '\n')
            print('Text saved to %s' % (output_dir + 'questions_file.txt'))

            # train the model
            print('Training wrapper fasttext model...')
            tstart = time.time()
            model_wrapper = FT_wrapper.train(ft_home, output_dir + 'questions_file.txt', size=ft_dim)
            tend = time.time()
            print('Time elapsed for training wrapper model %.2f' % (tend - tstart))
            print(model_wrapper)

            # saving a model trained via fastText wrapper
            print('Saving fasttext wrapper model...')
            model_wrapper.save(output_dir + 'saved_model_wrapper.bin')

    ########################################
    ## prepare embeddings
    ########################################
    logger.info('Preparing embedding matrix based on given word list...')
    nb_words = min(max_nb_words, len(word_index)) + 1
    w2v_oov = 0
    ft_oov = []
    if initializer == 'zero':
        # zero initialization of embedding matrix
        embedding_matrix = np.zeros((nb_words, embd_dim + ft_dim))
    elif initializer == 'glorot':
        # glorot uniform initialization of embedding matrix
        scale = 1 / nb_words   # fan_in
        # scale = 1 / (embd_dim + ft_dim)   # fan_out
        limit = np.sqrt(3. * scale)
        embedding_matrix = np.random.uniform(low=-limit, high=limit, size=(nb_words, embd_dim + ft_dim))
    else:
        raise NotImplementedError

    reverseDict = [''] * nb_words
    for word, i in tqdm(word_index.items()):
        if not embd_path == '':
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i][:embd_dim] = embedding_vector
                reverseDict[i] = word
            else:
                reverseDict[i] = '<' + word + '>'
                w2v_oov += 1
        if not fasttext_source == '':
            try:
                embedding_matrix[i][embd_dim:] = model_wrapper[word]
                reverseDict[i] = word
            except KeyError:
                reverseDict[i] = '<' + word + '>'
                ft_oov.append(word)

    logger.info('Word embeddings shape: %r (%d+%d)' % (embedding_matrix.shape, embd_dim, ft_dim))
    if not embd_path == '':
        logger.info('Word2Vec null embeddings: %d' % w2v_oov)
    if not fasttext_source == '':
        logger.info('FastText null embeddings: %d' % len(ft_oov))
        logger.info('FastText OOV: %r' % ft_oov)
    return embedding_matrix, reverseDict
def train_wikipedia(ft_home, input_path, output_path, iterations=5, min_n=3, max_n=3):
    model = FT_wrapper.train(ft_home, input_path, min_n=min_n, max_n=max_n, iter=iterations)
    model.save(output_path)
    # create vector
    v = np.array(v)
    print(np.shape(v))
    text_len = np.array([len(s) for s in text]).reshape(len(text), 1)
    X = np.concatenate((text_len, v), axis=1)
    print(np.shape(X))

elif mode == 'ft':
    # --------------------------------------------------------------------
    # FastText
    print('generating fasttext')
    text = [clean_text(s).split() for s in text]
    dim = 200
    model = FastText(size=dim, iter=1)
    # model.build_vocab(text)
    model.train(text)

    # then calculate word vector per paragraph
    print('generating paragraph vectors')
    v = []
    for s in text:
        ww = np.zeros((dim))
        n = 0
        for k, w in enumerate(s):
            if w in model.wv:
                ww += model.wv[w]
                n += 1
        if n > 0:
            v.append(ww / n)
        else:
import gensim
from gensim.models.wrappers.fasttext import FastText

from file_path_manager import FilePathManager

if __name__ == '__main__':
    model = FastText.load_fasttext_format(FilePathManager.resolve("data/wiki.en"))
    model.save(FilePathManager.resolve("data/fasttext.model"))
                ] and not c[1] in ["非自立", "代名詞"]:
            words.append(cols[0])
    return words


questions_src = []
questions = []
answers = []
for line in open(args.input, "r", encoding="utf-8", errors="ignore"):
    cols = line.strip().split('\t')
    # print(cols[0])
    questions_src.append(cols[0])
    questions.append(wakati(cols[0]))
    answers.append(cols[1])

model = FastText.load_fasttext_format(args.model)


def part_minus(v):
    # split the positive and negative components into separate vectors
    tmp_v = np.zeros(DIM * 2)
    for i in range(DIM):
        if v[i] >= 0:
            tmp_v[i] = v[i]
        else:
            tmp_v[i * 2] = -v[i]
    return tmp_v


questions_vec = []
tf_vecs = []
def train_fasttext(corpus_file, fasttext_path=None, save="../data/embeddings/", dim=300):
    """
    Input:
        corpus_file: path to the file that holds the embedding training dataset.
        fasttext_path: path to the FastText executable. If not given, the gensim
            reimplementation is used instead.
        save: the directory where the embeddings will be saved.
        dim: number of dimensions for the embeddings.

    Output:
        Files with the embeddings both in gensim format and in word2vec format.
        The function also returns the model itself.
    """
    print("Generating embeddings...")
    if fasttext_path is not None:
        # Run this if FastText is installed
        print("FastText wrapper loaded")
        # Set FastText home to the path to the FastText executable
        ft_home = fasttext_path
        print("\nCreating embeddings model...")
        # train the model (the wrapper selects skip-gram via model='skipgram')
        model = FT_wrapper.train(ft_home, corpus_file, model='skipgram', size=dim)
        print("Model created and trained")
    else:
        # Run this if using Windows or if FastText is not installed
        print("Gensim implementation loaded")
        print("\nCreating embeddings model...")
        model = FT_gensim(size=dim, sg=1)
        print("Model created")

        # build the vocabulary
        print("\nGenerating vocabulary...")
        model.build_vocab(corpus_file=corpus_file)
        print("Vocabulary generated")

        # train the model
        print("\nTraining embeddings model")
        model.train(corpus_file=corpus_file, epochs=model.epochs,
                    total_examples=model.corpus_count, total_words=model.corpus_total_words)
        print("Model trained:")
        print(model, "\n")

    # saving the model
    if save is not None:
        path = save + "ft_embeddings." + str(dim)
        model.save(path + ".model")
        model.wv.save_word2vec_format(path + ".vec")
        gg = open(path + ".txt", 'w', encoding="utf8")
        for token in model.wv.vocab.keys():
            string = token
            for value in model.wv[token]:
                string += " " + str(value)
            gg.write(string + '\n')
        gg.close()
        print("Embeddings saved\n")

    print("")
    return model
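# A minimal usage sketch (not from the original source): train 100-dimensional embeddings
# on a hypothetical corpus file via the pure-gensim path, then look up a vector. The file
# name "corpus.txt" and the query word are assumptions for illustration only.
ft_model = train_fasttext("corpus.txt", fasttext_path=None, save="../data/embeddings/", dim=100)
vector = ft_model.wv["example"]  # fastText composes vectors from n-grams, so OOV words also get a vector
print(vector.shape)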