def testLoadOldModel(self):
    """Test loading fasttext models from previous version"""
    model_file = 'fasttext_old'
    model = FT_gensim.load(datapath(model_file))
    self.assertTrue(model.wv.vectors.shape == (12, 100))
    self.assertTrue(len(model.wv.vocab) == 12)
    self.assertTrue(len(model.wv.index2word) == 12)
    self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))
    self.assertTrue(model.trainables.vectors_lockf.shape == (12, ))
    self.assertTrue(model.vocabulary.cum_table.shape == (12, ))
    self.assertEqual(len(model.wv.hash2index), 202)
    self.assertTrue(model.wv.vectors_vocab.shape == (12, 100))
    self.assertTrue(model.wv.vectors_ngrams.shape == (202, 100))

    # Model stored in multiple files
    model_file = 'fasttext_old_sep'
    model = FT_gensim.load(datapath(model_file))
    self.assertTrue(model.wv.vectors.shape == (12, 100))
    self.assertTrue(len(model.wv.vocab) == 12)
    self.assertTrue(len(model.wv.index2word) == 12)
    self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))
    self.assertTrue(model.trainables.vectors_lockf.shape == (12, ))
    self.assertTrue(model.vocabulary.cum_table.shape == (12, ))
    self.assertEqual(len(model.wv.hash2index), 202)
    self.assertTrue(model.wv.vectors_vocab.shape == (12, 100))
    self.assertTrue(model.wv.vectors_ngrams.shape == (202, 100))
def load_embeddings(self):
    self.input_dim = 0
    for name, e_conf in self.config['w2v_embeddings'].items():
        if not e_conf['enabled']:
            continue
        print("Loading w2v %s embedding ..." % name)
        self.w2v_embeddings[name] = Word2Vec.load(e_conf['path'])
        e_dim = self.w2v_embeddings[name].vector_size
        print("w2v %s embedding dim %s." % (name, e_dim))
        self.input_dim += e_dim * len(e_conf['attributes'])
    for name, e_conf in self.config['ft_embeddings'].items():
        if not e_conf['enabled']:
            continue
        print("Loading fasttext %s embedding ..." % name)
        self.ft_embeddings[name] = FastText.load(e_conf['path'])
        e_dim = self.ft_embeddings[name].vector_size
        print("fasttext %s embedding dim %s." % (name, e_dim))
        self.input_dim += e_dim * len(e_conf['attributes'])
    for i, (attr, dim) in enumerate(self.config['indexed_embeddings']):
        print(attr, "indexed embedding dim", dim)
        self.indexed_embedding_dicts[attr] = self.get_categorical_embedding_dict(self.indexed_features[attr], dim)
        self.input_dim += dim
    print('position_embedding_dim %s.' % self.config['position_embedding_dim'])
    self.position_embedding_dict = self.get_position_embedding_dict(self.window_size, self.config['position_embedding_dim'])
    self.input_dim = self.input_dim + self.config['position_embedding_dim']
def load_fasttext_model(model_name: str) -> FastText:
    if normpath(model_name).lower().endswith('.bin'):
        fasttext_model = load_facebook_model(datapath(normpath(model_name)))
    else:
        fasttext_model = FastText.load(datapath(normpath(model_name)))
    fasttext_model.init_sims(replace=True)
    return fasttext_model
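# Minimal usage sketch for load_fasttext_model above. The file names are
# hypothetical placeholders, not paths from the original project: a native
# Facebook .bin goes through load_facebook_model, anything else through
# gensim's FastText.load, and both are L2-normalized by init_sims.
native_model = load_fasttext_model("cc.en.300.bin")      # Facebook binary format
gensim_model = load_fasttext_model("my_corpus.model")    # gensim-saved model
print(native_model.wv.most_similar("example", topn=3))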
def get_embeddings(self, corpus_path: str, model_path: str = 'models', transfer: bool = False, overwrite: bool = False):
    """
    Build FastText embeddings for a given corpus if no embeddings exist yet or existing
    embeddings are to be overwritten. Loads and returns existing embeddings if they can
    be detected. TODO: implement that last bit.

    :param corpus_path: The path to the text corpus used to generate embeddings.
    :param model_path: The path where the word embeddings are to be stored.
    :param transfer: Encodes whether the new embeddings should be added "on top" of existing embeddings.
    :param overwrite: If a trained model already exists but the user still wants to train one from scratch, this is true.
    """
    if overwrite or len(os.listdir("models/transfer_learned")) == 0:
        print("Loading pretrained model...")
        # Forward slash instead of a bare backslash, so the path is not mangled by escape sequences.
        model = load_facebook_model("models/wiki.en.bin")
        model.build_vocab(sentences=self.corpus, update=True)
        print("Successfully loaded pretrained model!\nStart transfer-learning...")
        model.train(sentences=self.corpus, total_examples=len(self.corpus), epochs=5)
        print("Successfully finished transfer learning!")
        model.save("models/transfer_learned/big_model.model")
    else:
        print("Loading word embeddings...")
        model = FastText.load("models/transfer_learned/big_model.model")
        print("Word embeddings loaded!")
    return model
def load_ft_model(modelPath):
    print("Loading the model...")
    # path = os.getcwd() + modelPath
    model = FastText.load(modelPath)
    print("Loading done.")
    print_model_info(modelPath, model)
    return model
def init_app(app):
    dimension = 100
    annoy_index = AnnoyIndex(dimension)
    annoy_index.load(ANNOY_INDEX_PATH)
    fasttext_entity = FastText.load(FASTTEXT_ENTITY)

    @app.route("/", methods=['GET'])
    def front_page():
        return render_template("search.html")

    @app.route("/search/", methods=['POST'])
    def search():
        request_body = request.get_json()
        return get_search_results(request_body, fasttext_entity, annoy_index, annoy_index_collection)

    @app.route("/detail/", methods=['POST'])
    def detail():
        request_body = request.get_json()
        return get_detail(request_body, quote_collection, entity_keywords_collection,
                          fasttext_entity, annoy_index, annoy_index_collection)

    @app.route("/top_people/")
    def top_people():
        return jsonify(top_people=["person1", "person2", "person3"])

    @app.route('/health/', methods=['GET'])
    def health():
        return Response('OK', mimetype='text/plain')
def test_continued_training_from_existing_model():
    """ Test continuing training from a previously saved model """
    save_model_pth = 'temp.model'

    # Create a very small model, resulting in a small vocabulary
    reindex(num_sentences=5)
    fasttext_trainer = FastTextTrainer(INDEX_NAME, FIELD_NAME, save_model_pth=save_model_pth,
                                       num_training_epochs=1, init_from_pretrained=False)
    fasttext_trainer.train()
    assert os.path.exists(save_model_pth)
    previous_vocab = set(fasttext_trainer.model.wv.vocab.keys())

    # Re-index more data (resulting in larger vocabulary)
    reindex(num_sentences=50, create_new_index=False)

    # Continue training the model
    fasttext_trainer = FastTextTrainer(INDEX_NAME, FIELD_NAME, save_model_pth=save_model_pth,
                                       num_training_epochs=1, init_from_pretrained=True,
                                       pretrained_path=save_model_pth)
    fasttext_trainer.train()

    # Validate the updated model
    updated_model = FastText.load(save_model_pth)
    new_vocab = set(updated_model.wv.vocab.keys())
    assert len(new_vocab) > len(previous_vocab)
    assert previous_vocab.issubset(new_vocab)

    # Remove the created model files
    os.system('rm {}*'.format(save_model_pth))
def filter_skill(data, user):
    # Merge and pass
    data_pass = data[data['clean_skills'].str.len() == 0]
    user_indices_to_match = data[data['clean_skills'].str.len() > 0]["index"]
    fname = get_tmpfile("fasttext.model")
    model = FastText.load(fname)
    for each_skill in user["clean_skills"].values[0]:
        similar_users_list = similar_products(model[each_skill], model)
        # print(similar_users_list)
        # most_common() is assumed to update the module-level `most_comm` mapping
        # that is read below; it is not used for its return value here.
        most_common(similar_users_list)
    indices_found = {}
    for user_index in user_indices_to_match:
        if user_index in most_comm:
            indices_found[user_index] = most_comm[user_index]
    sorted_indices = [
        k for k, v in sorted(indices_found.items(), key=lambda item: item[1], reverse=True)
    ]
    print(sorted_indices)
    return reduce(pd.DataFrame.append, map(lambda i: data[data["index"] == i], sorted_indices))
def get_embedding(df, is_word):
    """
    Generates embeddings for the source codes provided through the dataframe.

    :param df: Dataframe containing source codes in the column 'sourcecode'.
    :param is_word: Whether to use fasttext word-embeddings or doc2vec document-embeddings.
    :return: List of embeddings as a numpy array.
    """
    if is_word:
        model = FastText.load("fasttext.model")
    else:
        model = Doc2Vec.load("doc2vec.model")

    def get_emb(sent):
        if is_word:
            AV = np.zeros(model.vector_size)
            for word in sent:
                AV += model.wv[word]
            # Average over the number of tokens, not the vector dimension.
            AV /= len(sent)
            return AV
        else:
            return model.infer_vector(sent)

    list_of_embeddings = []
    for code in df["sourcecode"]:
        list_of_embeddings.append(get_emb(preprocess_text(code)))
    return np.array(list_of_embeddings)
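# Hypothetical usage sketch for get_embedding above. The dataframe content is
# made up; "fasttext.model", "doc2vec.model" and preprocess_text are assumed to
# exist in the surrounding project as in the original code.
import pandas as pd

df = pd.DataFrame({"sourcecode": ["def add(a, b):\n    return a + b"]})
word_embs = get_embedding(df, is_word=True)    # fasttext mean-of-word-vectors
doc_embs = get_embedding(df, is_word=False)    # doc2vec inferred vectors
print(word_embs.shape, doc_embs.shape)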
def run(self) -> None:
    wiki_ft_model = FastText.load("./data/fasttext_300.model")
    while True:
        urls = self.provider.get_records()
        if len(urls) == 0:
            break
        bulk = websites_db.initialize_unordered_bulk_op()
        for document in tqdm(urls, desc="thread", leave=False):
            try:
                processed_text = document["processed_text"]
                id = document["_id"]
                encoded_processed_text = np.mean(
                    [wiki_ft_model.wv[vec] for vec in processed_text], axis=0)
                if len(processed_text) == 1:
                    encoded_processed_text = [encoded_processed_text]
                encoded = list([float(x) for x in encoded_processed_text])
                bulk.find({
                    "_id": id
                }).update_one({
                    "$set": {
                        "encoded_processed_text": encoded,
                        "encoded_processed_text_version": 1
                    }
                })
            except Exception as ex:
                print(ex, processed_text)
        bulk.execute()
def _load_gensim_format_embeddings(self):
    if not os.path.exists(self.word_embedding_file):
        raise Exception("{} is not found!".format(self.word_embedding_file))
    if self.word_embedding_mode.lower() == "fasttext":
        if self.word_embedding_file.endswith(".model"):
            model = FastText.load(self.word_embedding_file)
        else:
            model = FastText.load_fasttext_format(self.word_embedding_file)
    elif self.word_embedding_file.endswith(".bin"):
        model = KeyedVectors.load_word2vec_format(self.word_embedding_file, binary=True)
    else:
        model = Word2Vec.load(self.word_embedding_file)
    embedding_size = model["and"].size
    unknown_vec = np.random.uniform(-0.25, 0.25, embedding_size)
    embeddings = [unknown_vec] * (self.n_words())
    embeddings[0] = np.zeros(embedding_size)
    for word in self.word2index:
        try:
            embeddings[self.word2index[word]] = model[word]
        except:
            # self.word2index[word] = self.word2index[self.UNK_TOKEN]
            pass
    self.word_embedding_size = len(embeddings[0])
    embeddings = np.array(embeddings, dtype=np.float32)
    return embeddings
def create_embedding_matrix(X, word2index, mode='fasttext'):
    fasttext_fn = 'assets/embedding_models/ft_reviews_dim100_w5_min50/ft_reviews_dim100_w5_min50.ft_model'
    glove_fn = 'assets/embedding_models/glove/glove.twitter.27B.100d.txt'
    if mode == 'fasttext':
        model = FastText.load(fasttext_fn)
        dims = model.layer1_size
    elif mode == 'glove':
        model = loadGloveModel(glove_fn)
        dims = 100
    else:
        model = None
        dims = 0
    index2word = {ind: w for w, ind in word2index.items()}
    a = np.ndarray.flatten(X)
    indices = list(set(a))
    matrix = np.zeros((max(indices) + 1, dims), dtype=np.float32)
    j = 0
    for k, ind in enumerate(indices):
        try:
            word = index2word[ind]
            vec = model[word]
            j += 1
        except:
            vec = np.random.uniform(-1, 1, dims).astype(np.float32)
        matrix[ind] = vec
    print(' %s perc in model' % (j / max(indices)))
    return matrix
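# Hypothetical usage sketch for create_embedding_matrix above. X is a padded
# matrix of word indices and word2index is the corresponding vocabulary; both
# are made-up illustrations, while the embedding file paths inside the function
# come from the original code.
import numpy as np

X = np.array([[1, 2, 3, 0], [2, 4, 0, 0]])
word2index = {'good': 1, 'bad': 2, 'food': 3, 'service': 4}
embedding_matrix = create_embedding_matrix(X, word2index, mode='fasttext')
print(embedding_matrix.shape)   # (max index + 1, embedding dims)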
def read_embedding_matrix(vocab, device, data_path):
    """
    Read the embedding matrix passed as parameter and return it as a vocabulary
    of each word with the corresponding embeddings.

    Args:
        path ([type]): [description]
    # we need to use tensorflow embedding lookup here
    """
    model_path = Path.home().joinpath(
        "Projects", "Personal", "balobi_nini", 'models',
        'embeddings_one_gram_fast_tweets_only').__str__()
    embeddings_path = Path().cwd().joinpath('data', 'preprocess', "embedding_matrix.npy")
    model_gensim = FT_gensim.load(model_path)
    vectorized_get_embeddings = np.vectorize(model_gensim.wv.get_vector)
    embeddings_matrix = np.zeros(shape=(len(vocab), 50))  # should put the embedding size as a parameter
    print("starting getting the word embeddings ++++ ")
    vocab = vocab.ravel()
    for index, word in tqdm(enumerate(vocab)):
        vector = model_gensim.wv.get_vector(word)
        embeddings_matrix[index] = vector
    print("done getting the word embeddings ")
    with open(embeddings_path, 'wb') as file_path:
        np.save(file_path, embeddings_matrix)
    embeddings = torch.from_numpy(embeddings_matrix).to(device)
    return embeddings
def get_wiki_ft_model():
    global wiki_ft_model
    if wiki_ft_model is None:
        print("getting wiki model")
        wiki_ft_model = FastText.load(f"{base_path()}fasttext_300.model")
        print("Wiki model loaded")
    return wiki_ft_model
def synonym_noise(args, x_batch, maxlen, tokenizer):
    articles = tokenizer.sequences_to_texts(x_batch)
    changed_articles = []
    model_embedd = FastText.load(args.embeddingPath + 'embedding.model')
    for article in articles:
        word_array = article.split(' ')
        sent1 = []
        '''toss a coin and take a random decision on the data'''
        if np.random.binomial(1, args.synonym_noise_b1):
            for word in word_array:
                if word in model_embedd.wv.vocab:
                    most_similar = model_embedd.wv.most_similar(word)
                    # print(most_similar[0][0])
                    # Flip a coin to decide whether to change the word:
                    # heads -> replace it with its nearest neighbour, tails -> keep it.
                    # Change the p value to reduce or increase the edit rate.
                    if np.random.binomial(1, args.synonym_noise_b2):
                        sent1.append(most_similar[0][0])
                    else:
                        sent1.append(word)
                else:
                    sent1.append(word)
            joined_text = ' '.join(sent1)
        else:
            joined_text = ' '.join(word_array)
        changed_articles.append(joined_text)
    x_train_seq_n = tokenizer.texts_to_sequences(changed_articles)
    x_train_seq_n = sequence.pad_sequences(x_train_seq_n, maxlen=maxlen)
    x_train_seq_n = tf.convert_to_tensor(x_train_seq_n)
    return x_train_seq_n
def get_fasttext_model(self):
    '''load fasttext model'''
    FASTTEXT_MODEL = FastText.load(self.config.contract_model)
    print("contract fasttext_model loaded")
    return FASTTEXT_MODEL
def get_array_thems(title):
    ans = []
    path = 'Module-new-neural-network/aii/model.bin'
    model = FastText.load(path)
    for i in range(len(title)):
        ans.append(get_one_header_news_element(title[i], model))
    return ans
def __internal_new__(cls):
    cls._model_path = str(
        Path(str(current_app.root_path)) / "resources/fasttext_model/ft_sg.model"
    )
    cls._sg_model = FastText.load(cls._model_path)
    cls._logger = current_app.logger
    return super().__new__(cls)
def __init__(self):
    self.path = '../../data/'
    self.results = '../../data/results/'
    self.fname = 'df_metacyc_multilabel_1.pkl'
    self.kegg = 'df_kegg.pkl'
    self.model_name = 'model5-10-100.model'
    self.models = '../../data/models/'
    self.model = FT_gensim.load(self.models + self.model_name)
def load_embeddings(embedding_file):
    '''
    Load embeddings from file.
    input: path to the embedding file
    output: embeddings in a dict-like structure available for look-up,
            vocab covered by the embeddings as a set
    '''
    print('Using embeddings: ', embedding_file)
    if embedding_file.endswith('.txt') or embedding_file.endswith('.vec'):
        w2v = {}
        vocab = []
        try:
            f = open(embedding_file, 'r')
            for line in f:
                values = line.split()
                word = values[0]
                try:
                    float(values[1])
                except ValueError:
                    continue
                coefs = np.asarray(values[1:], dtype='float')
                w2v[word] = coefs
                vocab.append(word)
        except UnicodeDecodeError:
            f = open(embedding_file, 'rb')
            for line in f:
                values = line.split()
                word = values[0]
                try:
                    float(values[1])
                except ValueError:
                    continue
                coefs = np.asarray(values[1:], dtype='float')
                w2v[word] = coefs
                vocab.append(word)
        f.close()
    else:
        try:
            w2v = FT_gensim.load(embedding_file)
            vocab = w2v.wv.vocab.keys()
            print('using FastText gensim...')
        except:
            try:
                w2v = FT_gensim.load_fasttext_format(embedding_file)
                vocab = w2v.wv.vocab.keys()
                print('using gensim Facebook FastText...')
            except:
                w2v, vocab = load_vectors(embedding_file)
                print('using Facebook fastText')
    try:
        print("Done.", len(w2v), " words loaded!")
    except:
        pass
    return w2v, vocab
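# Hypothetical usage sketch for load_embeddings above. The file names are
# placeholders: a .txt/.vec file is parsed line by line into a dict, anything
# else is tried first as a gensim FastText model, then as a Facebook binary.
w2v, vocab = load_embeddings('glove.6B.100d.txt')         # plain-text vectors
ft, ft_vocab = load_embeddings('tweets_fasttext.model')   # gensim FastText model
print(len(vocab), 'plain-text words;', len(ft_vocab), 'fasttext words')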
def test_online_learning_after_save(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(tmpf)
    model_neg = FT_gensim.load(tmpf)
    self.assertEqual(len(model_neg.wv.vocab), 12)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
def load_model(model_path):
    model_type = decide_model_type(os.path.basename(model_path))
    if model_type == 'word2vec':
        model = Word2Vec.load(model_path)
    elif model_type == 'fasttext':
        model = FastText.load(model_path)
    else:
        model = []
    return model
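# Hypothetical usage sketch for load_model above; decide_model_type is assumed
# to infer the model family from the file name elsewhere in the project, and
# the path is a placeholder.
model = load_model('embeddings/news_fasttext.model')
if not isinstance(model, list):   # load_model returns [] for unknown types
    print(model.wv.most_similar('market', topn=5))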
def read_wv_model(self, model_name='embeddings_one_gram_fast_tweets_only'):
    """
    Read the word-vector embedding model whose name is passed as a parameter.

    Args:
        model_name: name of the saved gensim FastText model under the models directory
    """
    model_path = Path.cwd().joinpath('models', model_name).__str__()
    self.model_gensim = FT_gensim.load(model_path)
def _load_vector_data(self):
    try:
        self.vectorizer = (
            Word2Vec.load("train/word2vec.bin")
            if self.vectorizer_name == "word2vec"
            else FastText.load("train/fasttext.bin")
        )
    except FileNotFoundError:
        print("Vectorizer train data not found")
        self.vectorizer = None
def test_online_learning_after_save(self):
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(testfile())
    model_neg = FT_gensim.load(testfile())
    self.assertEqual(len(model_neg.wv.vocab), 12)
    self.assertEqual(len(model_neg.wv.ngrams), 202)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
    self.assertEqual(len(model_neg.wv.ngrams), 271)
def load_fasttext_model(model_path):
    ft_model = None
    start_time = time.time()
    if os.path.exists(model_path):
        logging.info("model load from: %s" % model_path)
        ft_model = ft.load(model_path)
    else:
        logging.error("model_path doesn't exist.")
    logging.info("cost time = %.4fs" % (time.time() - start_time))
    return ft_model
def load_fasttext_model(path):
    try:
        model = FastText.load(path).wv
    except Exception as e:
        try:
            model = FastText.load_fasttext_format(path).wv
        except Exception as e:
            model = gensim.models.KeyedVectors.load(path)
    return model
def __init__(self, index_name, es_text_field, save_model_pth, num_training_epochs, use_analyzed_field=False,
             init_from_pretrained=False, pretrained_path=None, bsize=128, emb_dim=300, window=5, min_count=1,
             workers=12, max_vocab_size=None, must_have_fields=None, must_not_have_fields=None):
    assert (not init_from_pretrained and not pretrained_path) or (init_from_pretrained and pretrained_path), \
        "If init_from_pretrained=True, pretrained_path must be provided"
    if must_have_fields is None:
        must_have_fields = []
    if must_not_have_fields is None:
        must_not_have_fields = []

    # General parameters
    self.index_name = index_name
    self.es_text_field = es_text_field
    self.bsize = bsize
    self.use_analyzed_field = use_analyzed_field
    self.save_model_pth = save_model_pth
    self.num_training_epochs = num_training_epochs
    self.must_have_fields = must_have_fields + [es_text_field]
    self.must_not_have_fields = must_not_have_fields

    # Training parameters
    self.emb_dim = emb_dim  # May be updated below if using a pretrained model
    self.window = window
    self.min_count = min_count
    self.workers = workers
    self.max_vocab_size = max_vocab_size

    # Initialize model
    if init_from_pretrained and pretrained_path:
        try:
            self.model = load_facebook_model(pretrained_path)
        except NotImplementedError:
            self.model = FastText.load(pretrained_path)
        except Exception as e:
            raise RuntimeError("Fasttext model is neither facebook nor gensim model: {}".format(e))
        self.emb_dim = self.model.vector_size
        print('Ignoring arg `emb_dim` since loading a pretrained model. Emb_dim is set to {}'.format(self.emb_dim))

        # Update parameters
        self.model.workers = self.workers

        # Start iterating corpus and building vocab
        print('Updating vocabulary with first pass over corpus')
        self.model.build_vocab(
            sentences=TextFieldIterator(
                index_name,
                es_text_field,
                self.must_have_fields,
                self.must_not_have_fields,
                bsize=self.bsize,
                use_analyzed_field=self.use_analyzed_field
            ),
            update=True)
    else:
        # More parameters can be exposed when creating a new model;
        # https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText
        # Instantiate model
        self.model = FastText(size=emb_dim, window=window, min_count=min_count, workers=workers,
                              max_vocab_size=max_vocab_size)
        # Start iterating corpus and building vocab
        self.model.build_vocab(
            sentences=TextFieldIterator(
                index_name,
                es_text_field,
                self.must_have_fields,
                self.must_not_have_fields,
                bsize=self.bsize,
                use_analyzed_field=self.use_analyzed_field,
            )
        )
def test_persistence(self):
    model = FT_gensim(sentences, min_count=1)
    model.save(testfile())
    self.models_equal(model, FT_gensim.load(testfile()))
    # test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(testfile())
    loaded_wv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
    self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams))
def load(filename):
    if '.bin' in filename:
        model = load_bin_vectors(filename, True)
    elif 'fasttext' in filename:
        model = FastText.load(filename)
    elif '.wtv' in filename:
        model = Word2Vec.load(filename)
    else:
        model = load_bin_vectors(filename, False)
    dim = model.vector_size
    return model, dim
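# Hypothetical usage sketch for load above; the file name is a placeholder and
# simply needs to contain 'fasttext' so the FastText.load branch is taken.
model, dim = load('vectors/fasttext_news.model')
print('loaded model with dimensionality', dim)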
def test_persistence(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.save(tmpf)
    self.models_equal(model, FT_gensim.load(tmpf))
    # test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(tmpf)
    loaded_wv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def _embedding_load_trained(path, hParams):
    try:
        if hParams.embedding_type == "fasttext":
            model_embedding = FastText.load(path + "_" + hParams.embedding_type)
        elif hParams.embedding_type == "word2vec":
            model_embedding = Word2Vec.load(path + "_" + hParams.embedding_type)
        print("[+] Embedding model successfully loaded from {}".format(path + "_" + hParams.embedding_type))
        return model_embedding
    except:
        raise FileNotFoundError("[!] Embedding model couldn't be loaded from {}".format(path + "_" + hParams.embedding_type))
def test_norm_vectors_not_saved(self):
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(testfile())
    loaded_model = FT_gensim.load(testfile())
    self.assertTrue(loaded_model.wv.syn0norm is None)
    self.assertTrue(loaded_model.wv.syn0_ngrams_norm is None)

    wv = model.wv
    wv.save(testfile())
    loaded_kv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(loaded_kv.syn0norm is None)
    self.assertTrue(loaded_kv.syn0_ngrams_norm is None)