def main(ft_src, ft_tgt, tm_model, corpus, hyp_num=1000):
    ft_src_model = FastText.load_fasttext_format(ft_src)
    ft_tgt_model = FastText.load_fasttext_format(ft_tgt)
    tm = pickle.load(open(tm_model, 'rb'))
    for i, line in enumerate(io.open(corpus, 'r', encoding='utf-8')):
        sent_pairs = line.strip().split(' ||| ')
        source = sent_pairs[0]
        targets = merge_subseq([tok.split() for tok in sent_pairs[1:]])
        G = nx.DiGraph()
        for tokens in targets:
            G = add_edges(G, handle_tokens(tokens))
        source_vec = get_vec(ft_src_model, source)
        source_vec_proj = tm.predict(source_vec.reshape(1, -1))[0]
        source_vec_proj = unitvec(source_vec_proj)
        candidates = Counter()
        for p in sorted(nx.all_simple_paths(G, ('<s>', 0), ('</s>', 0)),
                        key=lambda path: path_cost(G, path, weight='weight')):
            target_tokens = [t[0] for t in p[1:-1]]
            target_tokens = ' '.join(target_tokens).split()  # hack
            text = ' '.join(target_tokens)
            target_vec = unitvec(get_vec(ft_tgt_model, target_tokens))
            candidates[text] = np.dot(source_vec_proj, target_vec)
        for text, score in candidates.most_common(hyp_num):
            print('{0} ||| {1} ||| Cosine={2}'.format(i, text, score))
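# Hedged sketch of two helpers assumed by main() above. The originals are not
# shown in this collection, so these are plausible reconstructions under stated
# assumptions, not the actual implementations.
import numpy as np

def path_cost(G, path, weight='weight'):
    # total edge weight along a node path in a networkx graph
    return sum(G[u][v].get(weight, 1) for u, v in zip(path[:-1], path[1:]))

def unitvec(v):
    # L2-normalize a vector; leave the zero vector unchanged
    norm = np.linalg.norm(v)
    return v / norm if norm > 0 else v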
def _generate_word_embeddings(self, algo=EmbeddingsAlgorithm.WORD2VEC, use_morphs=False, min_count=2, dim=100):
    """Generates the word embeddings for the current language

    :param algo: The embedding algorithm to use
    :param use_morphs: If true, will use the morphed corpus to generate embeddings. If false, will use the raw corpus
    :param min_count: The minimum number of times a word must occur in order for it to be processed
    :param dim: The number of dimensions of the output vectors
    :return: The embeddings for the current language
    """
    _log.info('Learning word vectors...')
    if algo == EmbeddingsAlgorithm.WORD2VEC:
        # Note: the morphed and raw corpora are handled identically here;
        # morph splitting is only applied in the fastText branch below
        return Word2Vec(sentences=self._language_data, size=dim, min_count=min_count)
    elif algo == EmbeddingsAlgorithm.FASTTEXT:
        if use_morphs:
            self._split_corpus_into_morphs()
        self._save_language_data('fasttext_input.txt')
        return FastText.train('fastText/fasttext',
                              self._language_dir + 'fasttext_input.txt',
                              output_file=self._language_dir + 'ft_model',
                              size=dim, min_count=min_count)
    else:
        _log.error('Unknown algorithm %s' % algo)
def fasttext_model_from_file2(file_path):
    save_file_name = os.path.join(const.GENERATED_DATA_DIR,
                                  const.FASTTEXT_PREFIX + file_path.split('/')[-1])
    try:
        model = gensimFastText.load_fasttext_format(save_file_name + '.bin', encoding='utf-8')
        logging.info('model loaded: ' + save_file_name)
    except FileNotFoundError:
        fasttext_bin_path = os.path.join(const.ROOT_DIR, 'fasttext/fastText')
        model = gensimFastText.train(fasttext_bin_path, file_path, min_count=1)
    return model.wv
def computeCorpusSims(name, lg):
    if lg == "fr":
        model = FastText.load_fasttext_format(fr)
    else:
        model = FastText.load_fasttext_format(en)
    data = pd.read_csv(name, sep='\t')
    texts = data["text"]
    titles = data["title"]
    size = len(texts)
    sims = []
    for i in range(size):
        sims += [meanSim(ensText(texts[i], model), MWV(titles[i], model))]
    sims.sort()
    return sims
def main(ft_src, ft_tgt, corpus_src, corpus_tgt, out_fname):
    ft_src_model = FastText.load_fasttext_format(ft_src)
    ft_tgt_model = FastText.load_fasttext_format(ft_tgt)
    X = get_vec(ft_src_model, corpus_src)
    y = get_vec(ft_tgt_model, corpus_tgt)
    assert X.shape == y.shape, 'mismatched shapes'
    lr = LinearRegression()
    lr.fit(X, y)
    with io.open(out_fname, 'wb') as out:
        pickle.dump(lr, out, pickle.HIGHEST_PROTOCOL)
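# Hedged usage sketch for the mapping trained above. The paths 'src.bin',
# 'tgt.bin' and 'tm.pkl' are hypothetical, and get_vec/unitvec are the same
# assumed helpers used by the decoder in the first snippet.
import pickle
import numpy as np
from gensim.models.wrappers import FastText

ft_src_model = FastText.load_fasttext_format('src.bin')    # hypothetical path
ft_tgt_model = FastText.load_fasttext_format('tgt.bin')    # hypothetical path
with open('tm.pkl', 'rb') as f:
    tm = pickle.load(f)                                    # the pickled LinearRegression

src_vec = get_vec(ft_src_model, 'source sentence')
proj = unitvec(tm.predict(src_vec.reshape(1, -1))[0])      # project into the target space
tgt_vec = unitvec(get_vec(ft_tgt_model, ['target', 'sentence']))
print(np.dot(proj, tgt_vec))                               # cosine similarity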
def prepare_word_emb_matrices(experiment):
    """
    Initializes word embeddings for each word in the training vocabulary from
    pretrained or custom-trained embedding files
    :param experiment: the ID of the word embedding file
    :return: the training embedding matrix
    """
    with open("public_data/stats/stats_train.pkl", 'rb') as stats:
        stats = pickle.load(stats)

    vocab = stats["VOCAB"]
    stops = [word.lower() for word in set(stopwords.words('english'))]
    vocab = vocab + stops

    if experiment == "RANDOM":
        word_embs = np.random.uniform(low=-1.0, high=1.0,
                                      size=(len(vocab), PARAMS["SIZE"])).astype("float32")
    else:
        word_embs = []
        count_unk = 0
        count_kn = 0
        if experiment == "5":
            emb_model = KeyedVectors.load_word2vec_format(
                "public_data/models/experiment_5/embeddings_5.bin", binary=True)
        elif experiment == "6":
            emb_model = Word2Vec.load("public_data/models/experiment_6/embeddings_6")
        elif experiment in ["7", "8"]:
            emb_model = FastText.load_fasttext_format(
                "public_data/models/experiment_%s/embeddings_%s.bin" % (experiment, experiment))

        for word in vocab:
            if word in emb_model:
                word_embs.append(emb_model[word])
                count_kn += 1
            else:
                word_embs.append(np.random.uniform(low=-1.0, high=1.0, size=PARAMS["SIZE"]))
                count_unk += 1

        word_embs = np.array(word_embs).astype("float32")
        print(count_unk / (count_kn + count_unk))

    pad = np.zeros(shape=PARAMS["SIZE"]).astype("float32")
    unk = np.random.uniform(low=-1.0, high=1.0, size=PARAMS["SIZE"]).astype("float32")
    word_embs = np.insert(word_embs, 0, unk, axis=0)  # id 1
    word_embs = np.insert(word_embs, 0, pad, axis=0)  # id 0

    with open("public_data/embeddings/word_embeddings_%s.pkl" % experiment, 'wb') as out:
        pickle.dump(word_embs, out, protocol=4)

    return word_embs
def __init__(self, model_path, model_type='fasttext', **kwarg):
    if model_type == "fasttext":
        self._model = FastText.load_fasttext_format(model_path)
    elif model_type == "word2vec":
        self._model = Word2Vec.load_word2vec_format(model_path)
    else:
        raise NotImplementedError("other model is not supported")
def __init__(self):
    # add data imports
    self.data_df = pd.read_csv("sarcasm/train-balanced-sarcasm.csv")
    self.data_split = data_split
    # self.stop_words = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about',
    #                    'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be',
    #                    'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself',
    #                    'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each',
    #                    'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his',
    #                    'through', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should',
    #                    'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she',
    #                    'all', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have',
    #                    'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what',
    #                    'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he', 'you', 'herself',
    #                    'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i',
    #                    'after', 'few', 'whom', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by',
    #                    'doing', 'it', 'further', 'was', 'here']
    self.stop_words = []
    print("Loading Vectors")
    self.vec_model = FastText.load_fasttext_format(
        '../../Science_Fair_Project/vectors/cc.en.300.bin/cc.en.300.bin')
    # self.vec_model = {}
    print("Completed Loading Vectors")
    # ipdb.set_trace()
    self.data_df = self.data_df[["comment", "parent_comment", "label"]]
    self.data_df = shuffle(self.data_df)
    self.data_df["label"] = self.data_df["label"].astype(int)
    self.data_df["comment"] = self.data_df["comment"].astype(str)
    self.data_df["comment"] = self.data_df["comment"].str.lower()
    self.data_df["comment"] = self.data_df["comment"].str.strip(to_strip=".!?,")
    self.data_df["comment"] = self.data_df["comment"].str.split()
    self.data_df["parent_comment"] = self.data_df["parent_comment"].astype(str)
    self.data_df["parent_comment"] = self.data_df["parent_comment"].str.lower()
    self.data_df["parent_comment"] = self.data_df["parent_comment"].str.strip(to_strip=".!?,")
    self.data_df["parent_comment"] = self.data_df["parent_comment"].str.split()
def get_model(my_corpus, f_embeddings):
    """Get the appropriate model for the corpus"""
    global current_model
    f_output = ""
    if my_corpus == "Corola.300.20":
        current_model = KeyedVectors.load_word2vec_format(f_embeddings, binary=False)
        f_output = "Results/Corpus_Similarities/CoRoLa_300_20-cosine_similarity.txt"
    elif my_corpus == "Corola.400.5":
        current_model = KeyedVectors.load_word2vec_format(f_embeddings, binary=False)
        f_output = "Results/Corpus_Similarities/CoRoLa_400_5-cosine_similarity.txt"
    elif my_corpus == "Facebook":
        current_model = FastText.load_fasttext_format(f_embeddings)
        f_output = "Results/Corpus_Similarities/fastText-cosine_similarity.txt"
    elif my_corpus == "CONLL2017-Word2vec":
        current_model = KeyedVectors.load_word2vec_format(f_embeddings, binary=True)
        f_output = "Results/Corpus_Similarities/CoNLL-2017-cosine_similarity.txt"
    return f_output
def load_embeddings_file(file_name, embedding_type, lower=True):
    if not os.path.isfile(file_name):
        print(file_name, "does not exist")
        return {}, 0

    if embedding_type == "word2vec":
        model = KeyedVectors.load_word2vec_format(file_name, binary=True, unicode_errors="ignore")
        words = model.index2entity
    elif embedding_type == "fasttext":
        model = FastText.load_fasttext_format(file_name)
        words = [w for w in model.wv.vocab]
    else:
        print("Unknown Type")
        return {}, 0

    if lower:
        vectors = {word.lower(): model[word] for word in words}
    else:
        vectors = {word: model[word] for word in words}

    if "UNK" not in vectors:
        unk = np.mean([vectors[word] for word in vectors.keys()], axis=0)
        vectors["UNK"] = unk

    return vectors, len(vectors["UNK"])
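# Hedged usage sketch for load_embeddings_file above ('wiki.en.bin' is a
# hypothetical path). The function returns a plain {token: vector} dict and
# the embedding dimension, with a mean-vector "UNK" entry as the OOV fallback.
vectors, dim = load_embeddings_file('wiki.en.bin', 'fasttext')
vec = vectors.get('example', vectors['UNK'])
print(dim, vec[:5])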
def setup(self, log=True):
    hp = HP(base_dir=self.dir)
    if self.conf.trainer_type == "DST":
        for attr, val in self.conf.hyperparameter.items():
            if val is not None:
                setattr(hp, attr, val)
        self.trainer = DST(hp=hp)
    elif self.conf.trainer_type == "CDST":
        for attr, val in self.conf.hyperparameter.items():
            if val is not None:
                setattr(hp, attr, val)
        self.trainer = CDST(hp=hp)
    elif self.conf.trainer_type == "FT_DST":
        for attr, val in self.conf.hyperparameter.items():
            if attr != "embedding_dim" and val is not None:
                setattr(hp, attr, val)
        if not self.fast_text_model:
            print("Setting fast_text_model...", end="")
            self.fast_text_model = FastText.load_fasttext_format(hp.fast_text_model_file)
            print("Ok.")
        self.trainer = FT_DST(fast_text_model=self.fast_text_model, hp=hp)
    else:
        raise ValueError("Unknown Trainer")
    self.log = log
    if log:
        self.tee = Tee(str(self.trainer.hp.dir / "log.txt"))
        sys.stdout = self.tee
def get_embeddings_authors(self, authors_pondered_tokens: Dict[str, List[Tuple[str, float]]]):
    self.model_embeddings = FastTextWrapper.load_fasttext_format(TextProcessings.FAST_TEXT_PATH)
    authors_embeddings = []
    print('computing embeddings')
    with open('300_authors_embeddings.txt', 'w', encoding='utf-8') as f:
        for name, tokens_weights in authors_pondered_tokens.items():
            weighted_avg_author = np.float32([0] * TextProcessings.WORD_DIM)
            print(name, file=f)
            weights, tokens = [], []
            unique_tokens = set()
            for (token, weight) in tokens_weights:
                if token in self.model_embeddings.wv.vocab and token not in unique_tokens:
                    weights.append(weight)
                    tokens.append(token)
                    unique_tokens.add(token)
            # normalize weights so they form a probability distribution
            weights = normalize(np.float32([weights]), norm='l1')[0]
            # print(weights, file=f)
            # print(weights[0])
            for i, token in enumerate(tokens):
                weighted_avg_author += weights[i] * self.model_embeddings.wv[token]
                print(token, weights[i], file=f)
            authors_embeddings.append((name, weighted_avg_author))
    pickle.dump(authors_embeddings, open(TextProcessings.AUTHORS_EMBEDDINGS_FILE, "wb"))
def load_language_model(language_models_path, language_model_name, texts, self_train: bool,
                        save_self_model: bool, self_model_name):
    """
    Load or train a language model.
    :param language_models_path: directory with language models
    :param language_model_name: name of the language model
    :param texts: text corpus (when training our own model)
    :param self_train: whether to train our own model
    :param save_self_model: whether to save the self-trained model
    :param self_model_name: name of our own model
    :return:
    """
    if self_train:
        language_model = Word2Vec(texts, min_count=0, size=300)
        if save_self_model:
            language_model.save(os.path.join(language_models_path, self_model_name + '.w2v'))
    else:
        if language_model_name[-3:] == 'w2v':
            language_model = gensim.models.Word2Vec.load(
                os.path.join(language_models_path, language_model_name))
        elif language_model_name[-3:] == 'bin':
            language_model = FastText.load_fasttext_format(
                os.path.join(language_models_path, language_model_name))
        else:
            language_model = Word2Vec.load(os.path.join(language_models_path, language_model_name))
    return language_model
def __init__(self, embedding_file, sequences, seq_type, k_mer,
             restricted_kmer=False, use_idf=False, norm=None):
    '''
    Class constructor
    '''
    SequenceKmerRep.__init__(self, sequences, seq_type, k_mer,
                             restricted_kmer=restricted_kmer, use_idf=use_idf,
                             norm=norm, delete_empty_col=True)
    print('loading embedding..')
    if embedding_file.split('.')[-1] == 'txt':
        self.model = KeyedVectors.load_word2vec_format(embedding_file, binary=False)
    else:
        self.model = FastText.load_fasttext_format(embedding_file)
    self.emb_trans = [self.model[x.lower()] for x in self.vocab]
    # summation vector
    self.embeddingX = self.X.dot(self.emb_trans)
    self.emb_kmer_concat = np.concatenate((self.embeddingX, self.X.toarray()), axis=1)
def read_model(self, tipe):
    start = time.time()
    if tipe == 'own-model1':
        print('Loading {} '.format(str(tipe)))
        embeddings_path = 'word_embedding_cbow.bin'
        word2vec_model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False,
                                                           unicode_errors="ignore")
    elif tipe == 'own-model2':
        print('Loading {} '.format(str(tipe)))
        embeddings_path = '/home/pras/Embeddings/model_arif'
        word2vec_model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False,
                                                           unicode_errors="ignore")
    elif tipe == 'own-model3':
        print('Loading {} '.format(str(tipe)))
        embeddings_path = '/home/adrian/new_cnn/modelapik_cbows.bin'
        word2vec_model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False,
                                                           unicode_errors="ignore")
    elif tipe == 'bojanowski':
        print('Loading {} model'.format(str(tipe)))
        embeddings_path = '/home/pras/Embeddings/wiki.bin'
        word2vec_model = FastText.load_fasttext_format(embeddings_path)
    else:
        raise ValueError('Unknown embeddings type: {}'.format(tipe))
    end = time.time()
    print('Loading {} done in {} Seconds'.format(str(tipe), (end - start)))
    print('')
    self.word2vec = word2vec_model
    self.embed_dim = 300
    return word2vec_model
def load_embeddings(self, file_path):
    # Embeddings must be in fastText format, either .bin or word2vec text
    print('Loading embeddings...')
    if file_path.endswith('.bin'):
        from gensim.models.wrappers import FastText
        embeddings = FastText.load_fasttext_format(file_path)
    else:
        pre_trained_embeddings_dict = dict()
        with open(file_path) as f:
            _ = f.readline()  # skip the header line (vocab size, dim)
            for line in f:
                token, *embedding = line.split()
                embedding = np.array([float(val_str) for val_str in embedding])
                if token in self.token_dict:
                    pre_trained_embeddings_dict[token] = embedding
        print('Read')
        # initialize all rows with noise matching the pre-trained std,
        # then overwrite the rows of tokens that have pre-trained vectors
        pre_trained_std = np.std(list(pre_trained_embeddings_dict.values()))
        embeddings = pre_trained_std * np.random.randn(len(self.token_dict), len(embedding))
        for idx in range(len(self.token_dict)):
            token = self.token_dict.idx2tok(idx)
            if token in pre_trained_embeddings_dict:
                embeddings[idx] = pre_trained_embeddings_dict[token]
    return embeddings
def __init__(self, lang='eng', dim=200):
    self.dim = dim
    # self.file_path = 'ko.bin'
    self.file_path = 'wiki.ko.bin'
    self.model = FastText.load_fasttext_format(self.file_path)
def load_word_vectors(wordchar2vector_path, word2vector_path):
    """
    Factory providing convenient access to both word embedding models: the
    character-level morphological one and the word-level syntactic-semantic one.
    :param wordchar2vector_path: path to the file with word vectors of the
        character-level embedding model.
    :param word2vector_path: path to the file with word vectors of a word2vec,
        fasttext or glove model.
    :return: an instance of a class that provides an indexer method returning
        the concatenated embedding vector for a word.
    """
    print('Loading the wordchar2vector model {} '.format(wordchar2vector_path), end='')
    # Load the precomputed word vectors of the wordchar2vector embedding
    # model (see wordchar2vector.py)
    wc2v = gensim.models.KeyedVectors.load_word2vec_format(wordchar2vector_path, binary=False)
    wc2v_dims = len(wc2v.syn0[0])
    print('wc2v_dims={0}'.format(wc2v_dims))

    if os.path.basename(word2vector_path).startswith('fasttext'):
        print('Loading FastText model {} '.format(word2vector_path), end='')
        WordEmbeddings._flush_print()
        w2v = FastText.load_fasttext_format(word2vector_path)
        w2v_dims = w2v.vector_size
        print('w2v_dims={0}'.format(w2v_dims))
        return WordEmbeddings_FastText(wc2v, wc2v_dims, w2v, w2v_dims)
    else:
        print('Loading w2v model {} '.format(word2vector_path), end='')
        WordEmbeddings._flush_print()
        w2v = gensim.models.KeyedVectors.load_word2vec_format(
            word2vector_path, binary=not word2vector_path.endswith('.txt'))
        w2v_dims = len(w2v.syn0[0])
        print('w2v_dims={0}'.format(w2v_dims))
        return WordEmbeddings_W2V(wc2v, wc2v_dims, w2v, w2v_dims)
def evaluatePool(corpus_words, pool, POS_weights):
    print(str(datetime.now()) + ' start load_fasttext_format')
    embedding = FastText.load_fasttext_format('embedding.bin', encoding='utf-8')
    print(str(datetime.now()) + ' finish load_fasttext_format')
    for sentence in pool:
        # lexic
        n = sentence._words
        if n == 0:
            continue
        oovw = (sentence._oovw / n)
        rep = 1 / (1 + sentence._repetitions / n)
        lexic = (oovw + rep) / 2
        # gramatic
        gramatic = 0
        for pos, pos_weight in POS_weights.items():
            gramatic += pos_weight * (getattr(sentence, '_POS_' + pos) / sentence._words)
        gramatic = gramatic / len(POS_weights)
        # print('lexic: ' + str(lexic) + ', gramatic: ' + str(gramatic))
        # semantics
        synonyms = 0
        for pool_word in sentence.text.split():
            for corpus_word in corpus_words:
                try:
                    synonyms += bool(embedding.similarity(pool_word, corpus_word) >= similarity_threshold)
                except:
                    pass
        semantic = 1 - (n / (n + synonyms))
        sentence.score = w * lexic + w * gramatic + w * semantic
    session.commit()
def get_fasttext_model(dataset="tweet", model_type="bin"):
    w2v_rootdir = os.path.join(res_basedir, "word2vecs")
    tweets_rootdir = os.path.join(resources_rootdir, "tweet_w2v", "tweet_fasttext")
    ds_rootdir = os.path.join(resources_rootdir, "ds_aa", "fasttext_embs")
    amazon_rootdir = os.path.join(resources_rootdir, "amazon", "fasttext_embs")
    # amazon_rootdir = os.path.join(resources_rootdir, "amazon", "fasttext_embs_50_eps")
    if dataset == "tweet":
        model_path = os.path.join(tweets_rootdir, "tweet_fasttext.{}".format(model_type))
    elif dataset == "ds":
        model_path = os.path.join(ds_rootdir, "ds_fasttext.{}".format(model_type))
    elif dataset == "amazon":
        model_path = os.path.join(amazon_rootdir, "amazon_fasttext.{}".format(model_type))
    elif dataset == "wiki":
        model_path = os.path.join(w2v_rootdir, "wiki.en/wiki.en.{}".format(model_type))
    elif dataset == "simple":
        model_path = os.path.join(w2v_rootdir, "wiki.simple/wiki.simple.{}".format(model_type))
    print("fasttext model: ", model_path)
    if model_type == "bin":
        model = FastText.load_fasttext_format(model_path)
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=False)
    return model
def load_ft():
    w2v_model = FastText.load_fasttext_format('../embedding/cc.zh.300.bin')
    print("Finish Load")
    dim = len(w2v_model['好'])
    fw1 = codecs.open("../embedding/embedding_all_ftoov_%d.txt" % (dim), 'w', encoding='utf-8')
    vocab_dict = pickle.load(open('../data/vocabulary.pkl', 'rb'))
    word_list = ['unk' for i in range(len(vocab_dict))]
    for k, v in vocab_dict.items():
        word_list[v] = k
    # print(word_list)
    embedding_matrix = np.zeros((len(vocab_dict), dim))
    miss = 0
    for index, w in enumerate(word_list):
        if index % 1000 == 0:
            print(index)
        try:
            # in_set.add(w)
            embeds = np.asarray(w2v_model[w])
        except KeyError:
            # w2v_model.most_similar(w)  # would raise again for a true OOV word
            miss += 1
            print(w)
            embeds = np.random.uniform(-0.25, 0.25, dim)
        embedding_matrix[index] = embeds
    fw1.write(str(len(word_list)) + ' ' + str(dim) + '\n')
    for index, w in enumerate(word_list):
        fw1.write(w)
        for i in embedding_matrix[index]:
            fw1.write(' ' + str(i))
        fw1.write('\n')
    pickle.dump(vocab_dict, open('../data/vocabulary2.pkl', 'wb'))
    print(len(word_list))
    print("miss:%d" % miss)
def train_fasttext(data_dir='./data', dim=300, epoch=5, ft_model='skipgram', ft_lr=0.05, ft_window=5):
    data_dir = Path(data_dir)

    import fasttext
    model = fasttext.train_unsupervised(
        str(data_dir / 'ocb_and_wikisource.w2v_tokens.txt'),
        model=ft_model,
        lr=ft_lr,      # learning rate [0.05]
        dim=dim,       # size of word vectors [100]
        ws=ft_window,  # size of the context window [5]
        epoch=epoch,   # number of epochs [5]
        # thread      # number of threads [number of cpus]
    )
    model.save_model(str(data_dir / 'ocb_and_wikisource.fasttext.bin'))

    from gensim.models.wrappers import FastText
    ft_model = FastText.load_fasttext_format(str(data_dir / 'ocb_and_wikisource.fasttext.bin'))
    ft_model.wv.save_word2vec_format(str(data_dir / 'ocb_and_wikisource.fasttext.w2v.txt'))
    logger.info('done')
def train_fasttext(hf_dataset, output_dir):
    """
    Run with:
    $ ./data_cli.py train_fasttext paperswithcode_aspects ./output

    :return:
    """
    tokens_fp = os.path.join(output_dir, 'tokens.txt')
    fasttext_bin_fp = os.path.join(output_dir, 'fasttext.bin')
    fasttext_w2v_fp = os.path.join(output_dir, 'fasttext.w2v.txt')

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Tokenized text
    doc_delimiter = '\n'
    token_delimiter = ' '
    tokens_count = 0

    with open(tokens_fp, 'w') as f:
        for doc in docs_ds:
            # Extract plain text
            text = doc['title'] + ': ' + doc['abstract']
            for token in gensim.utils.simple_preprocess(text, min_len=2, max_len=15):
                f.write(token + token_delimiter)
                tokens_count += 1
            f.write(doc_delimiter)

    logger.info(f'Total tokens: {tokens_count:,}')

    # Train actual fastText model
    logger.info('Train fastText model...')
    model = fasttext.train_unsupervised(
        tokens_fp,
        model='skipgram',
        lr=0.05,   # learning rate [0.05]
        dim=300,   # size of word vectors [100]
        ws=5,      # size of the context window [5]
        epoch=5,   # number of epochs [5]
        # thread  # number of threads [number of cpus]
    )
    model.save_model(fasttext_bin_fp)
    del model

    ft_model = FastText.load_fasttext_format(fasttext_bin_fp)
    ft_model.wv.save_word2vec_format(fasttext_w2v_fp)

    logger.info(f'Output saved to: {fasttext_w2v_fp}')
    logger.info('Done')
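# Hedged follow-up sketch: the w2v-format text file written by train_fasttext
# above can be reloaded directly with gensim's KeyedVectors ('./output' repeats
# the example output_dir from the docstring).
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('./output/fasttext.w2v.txt')
print(wv.most_similar('model', topn=3))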
def load_embeddings_file(file_name, lower=False, type=None):
    if type is None:
        file_type = file_name.rsplit(".", 1)[1] if '.' in file_name else None
        if file_type == "p":
            type = "pickle"
        elif file_type == "bin":
            type = "word2vec"
        elif file_type == "vec":
            type = "fasttext"
        else:
            type = "word2vec"

    if type == "word2vec":
        model = KeyedVectors.load_word2vec_format(file_name, binary=True, unicode_errors="ignore")
        words = model.index2entity
    elif type == "fasttext":
        model = FastText.load_fasttext_format(file_name)
        words = [w for w in model.wv.vocab]
    elif type == "pickle":
        with open(file_name, 'rb') as fp:
            model = pickle.load(fp)
        words = model.keys()

    if lower:
        vectors = {word.lower(): model[word] for word in words}
    else:
        vectors = {word: model[word] for word in words}

    if "UNK" not in vectors:
        unk = np.mean([vectors[word] for word in vectors.keys()], axis=0)
        vectors["UNK"] = unk

    return vectors, len(vectors["UNK"])
def __init__(self, path, model, max_length, word_dim):
    self.model = FastText.load_fasttext_format(model)
    with open(path) as data_file:
        self.data = list(csv.reader(data_file))
    self.encoder = preprocessing.LabelEncoder()
    self.encoder.fit(list(map(lambda x: x[1], self.data)))
    self.max_length = max_length
    self.word_dim = word_dim
def load_test():
    model = fasttext.load_model("/home/zhoutong/nlp/data/cc.en.300.bin")
    vec1 = model.get_word_vector("china")
    vec2 = model.get_word_vector("america")
    similarity(vec1, vec2)
    sen_vec1 = model.get_sentence_vector("I come from china")
    sen_vec2 = model.get_sentence_vector("I am chinese")
    np.concatenate([model.get_word_vector(i) for i in ["I", "am", "chinese"]]) / 3
    similarity(sen_vec1, sen_vec2)

    gensim_model = FastText.load_fasttext_format('/home/zhoutong/nlp/data/cc.en.300.bin')  # 10min
    gensim_model.most_similar('teacher')
    gensim_model.similarity('teacher', 'teaches')
    gensim_model.init_sims(replace=True)
    gensim_model.save('/home/zhoutong/nlp/data/cc.en.300.bin.gensim')
    gensim_model_new = FastText.load('/home/zhoutong/nlp/data/cc.en.300.bin.gensim', mmap='r')
def __init__(self, *args, hp=None, fast_text_model=None, **kwargs):
    hp = hp or HP()
    hp.fast_text = True
    hp.embedding_dim = 300
    super().__init__(*args, hp=hp, **kwargs)
    self.decode = np.vectorize(lambda x: x.decode("utf-8"))
    assert fast_text_model or self.hp.fast_text_model_file
    self.fast_text_model = fast_text_model or FastText.load_fasttext_format(
        self.hp.fast_text_model_file)
def provide_fasttext_model():
    assure_fasttext_model_exists()
    print('providing fasttext model ...')
    model = FastText.load_fasttext_format(
        os.path.join(FASTTEXT_MODEL_BASE_DIR, FASTTEXT_MODEL_BIN_NAME))
    print('successfully provided fasttext model')
    return model.wv
def load_fasttext_model(path):
    """ Load a pre-trained FastText model.

    :param path: path of the file of the pre-trained FastText model
    :return: a pre-trained FastText model
    :type path: str
    :rtype: gensim.models.wrappers.fasttext.FastText
    """
    return FastText.load_fasttext_format(path)
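# Hedged usage sketch for load_fasttext_model above ('wiki.en.bin' is a
# hypothetical path). Because fastText composes vectors from subword n-grams,
# lookups also work for words outside the training vocabulary.
model = load_fasttext_model('wiki.en.bin')
print(model.wv['teacher'][:5])                    # in-vocabulary lookup
print(model.wv.similarity('teacher', 'teaches'))  # works even for rare forms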
def get_fasttext_embed(list_of_words):
    words_embed_dict = dict()
    ff_model = FastText.load_fasttext_format(fast_text_file)
    for w in list_of_words:
        try:
            words_embed_dict[w] = ff_model[w]
        except KeyError:
            # no vector could be composed for this word; skip it
            pass
    return words_embed_dict
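# Hedged usage sketch for get_fasttext_embed above (the word list is
# arbitrary). Words whose lookup raises are silently skipped, so the returned
# dict may be smaller than the input list.
embed_dict = get_fasttext_embed(['apple', 'banana', 'carrot'])
print(len(embed_dict), next(iter(embed_dict)))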
def load_embeddings(self, file_path):
    # Embeddings must be in fastText format, either .bin or word2vec text
    print('Loading embeddings...')
    if file_path.endswith('.bin'):
        from gensim.models.wrappers import FastText
        embeddings = FastText.load_fasttext_format(file_path)
    else:
        from gensim.models import KeyedVectors
        embeddings = KeyedVectors.load_word2vec_format(file_path)
    return embeddings
def printvec(train_path, vec_path):
    # 1. Create a preprocessed training file (lowercasing etc.)
    print('\nPreprocessing training data...')
    tmp_path = train_path[:-4] + '_cleaned.txt'
    with open(train_path) as f_in:
        with open(tmp_path, 'w') as f_out:
            for line in f_in:
                text = line.lower()
                text = re.sub(r"[^a-z ]", "", text)
                text = re.sub(r"[ ]+", " ", text)
                f_out.write(text)
    train_path = tmp_path

    # 2. Build the dictionary
    print('\nMake dic...')
    s = set()
    with open(train_path) as f:
        for line in f:
            text = line.lower()
            text = text.replace("\n", " ").replace('\r', '')
            text = re.sub(r"[ ]+", " ", text)
            text_list = text.split(" ")
            tmp_set = set(text_list)
            s.update(tmp_set)
    words = sorted(list(s))
    len_words = len(words)
    # note: index 0 is reserved for padding, so it is not used
    word_indices = dict((c, i + 1) for i, c in enumerate(words))
    indices_word = dict((i + 1, c) for i, c in enumerate(words))

    # 3. Train fastText
    myft_path = '/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
    ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path,
                              size=vec_size, window=5, min_count=0)
    ft_model.save(today_str + 'ft.model')

    # 4. Write the vectors to file (valid word IDs run from 1 to len_words)
    with open(vec_path, 'w') as file:
        for i in range(1, len_words + 1):
            word = indices_word[i]
            if word in ft_model.wv.vocab:
                vec = ft_model[word]
            else:
                vec = np.zeros((vec_size), dtype=np.float32)
            output = word + ' > ' + str(vec) + '\n'
            file.write(output)

    # 5. Reset the model
    ft_model.reset_weights()
# Return the dictionary ID for a word
def search_word_indices(word):
    if word in word_indices:
        return word_indices[word]
    else:
        return word_indices["#OTHER"]

# Train fastText
vec_size = 100
print('Learning fasttext...')
myft_path = '/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path,
                          size=vec_size, window=5, min_count=0)
ft_model.save(today_str + 'ft.model')
# FastText supports two training methods, cbow and skipgram; the default is cbow
print_time('FastText end')

# Get the word2vec vector for a word
# For unknown words, fall back to a zero vector [0, 0, 0, ..., 0] for now
# Unknown words are stored in a set and written to a file later
# needs improvement
KeyError_set = set()
def get_ft_vec(word):
    if word in ft_model.wv.vocab:
        return ft_model[word]
    else:
        KeyError_set.add(word)  # record the unknown word
        return np.zeros((vec_size,), dtype=np.float32)