def train_fasttext_fashionrec(dimensionality, context, train_model, epochs):
    """Train with FastText on IG corpora."""
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(
        total_count, vocab_size))
    start_time = datetime.now()
    model = train_unsupervised(input="data/clean2_corpus.txt",
                               dim=dimensionality,
                               ws=context,
                               model=train_model,
                               epoch=epochs)
    time_elapsed = datetime.now() - start_time
    output_path = "trained/fasttext_fashion_dim" + str(
        dimensionality) + "_c" + str(context) + "_" + str(train_model)
    model.save_model(output_path + ".bin")
    save_fasttext_bin_to_vec(load_model(output_path + ".bin"),
                             output_path + ".vec")
    fileName = "results/training/fasttext_fashion_epoch" + str(
        epochs) + "_d" + str(dimensionality) + "_c" + str(context) + "_" + str(
        train_model) + ".txt"
    notes = "FastText FashionData, " + str(epochs) + " epochs, " + str(
        dimensionality) + " dim, " + str(context) + " context, " + str(
        train_model) + " train mode\n" + "Training time: " + str(time_elapsed)
    save_to_file(fileName, notes)
def train_fasttext_skipgram(self, corpus_path, output_path, **kwargs):
    """
    input           training file path (required)
    output          output file path (required)
    lr              learning rate [0.05]
    lrUpdateRate    change the rate of updates for the learning rate [100]
    dim             size of word vectors [100]
    ws              size of the context window [5]
    epoch           number of epochs [5]
    minCount        minimal number of word occurrences [5]
    minCountLabel   minimal number of label occurrences [1]
    neg             number of negatives sampled [5]
    wordNgrams      max length of word ngram [1]
    loss            loss function {ns, hs, softmax} [ns]
    bucket          number of buckets [2000000]
    minn            min length of char ngram [3]
    maxn            max length of char ngram [6]
    thread          number of threads [12]
    t               sampling threshold [0.0001]
    """
    print("Training fastText model using the skipgram method")
    self.fasttext_model = fasttext.train_unsupervised(corpus_path,
                                                      model='skipgram',
                                                      **kwargs)
    self.fasttext_model.save_model(path=output_path)
    print("Model saved!")
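# --- Usage sketch (not from the original source) ---
# A minimal, hypothetical example of driving train_fasttext_skipgram above.
# The `FastTextTrainer` class name and the file paths are assumptions for
# illustration; any of the hyperparameters documented in the docstring can be
# passed through as kwargs.
trainer = FastTextTrainer()
trainer.train_fasttext_skipgram(corpus_path="data/corpus.txt",
                                output_path="models/skipgram.bin",
                                dim=100, ws=5, epoch=5, minCount=5)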
def buildW2VDict(method='FastText'):
    w2vdict = {}  # each value should be a (D,) ndarray
    if method == 'FastText':
        # https://fasttext.cc/docs/en/english-vectors.html (download word vectors)
        w2vmodel = train_unsupervised(embedding_txt_file, model='skipgram',
                                      lr=0.05, dim=D, ws=2, epoch=5,
                                      minCount=2, minCountLabel=0, minn=3,
                                      maxn=6, neg=5, wordNgrams=3, loss='ns',
                                      bucket=2000000, thread=5,
                                      lrUpdateRate=100, t=0.0001,
                                      label='__label__', verbose=2,
                                      pretrainedVectors='')
        # w2vmodel.save_model(ftmodel_file)
        print('fastText word vector model trained as ftmodel.bin')
        # Turn the model's vocabulary into a dict.
        (word, freq) = w2vmodel.get_words(include_freq=True)
        for w, f in zip(word, freq):
            w2vdict[w] = w2vmodel.get_word_vector(w)
    print('w2vdict built.')
    return w2vdict, w2vmodel
def train_fasttext(corpus, cut_func, vocabulary, embedding_dim=300):
    corpus = [' '.join(cut_func(sentence)) for sentence in corpus]
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(sentence + '\n')

    model = train_unsupervised(input=corpus_file_path, model='skipgram',
                               epoch=10, minCount=1, wordNgrams=3,
                               dim=embedding_dim)
    model_vocab = model.get_words()

    emb = np.zeros(shape=(len(vocabulary) + 1, embedding_dim), dtype='float32')
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in model_vocab:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = model.get_word_vector(w)
    print('Logging Info - Fasttext Embedding matrix created: {}, '
          'unknown tokens: {}'.format(emb.shape, nb_unk))
    os.remove(corpus_file_path)
    return emb
def train(corpus, model_file):
    model = fastText.train_unsupervised(input=corpus, model='skipgram',
                                        dim=100, epoch=50, bucket=200000,
                                        wordNgrams=2)
    model.save_model(model_file)
def get_embedding_model(file, model_path, **kwargs):
    # Load the word-vector model if it already exists; otherwise train and save it.
    if os.path.exists(model_path):
        model = ft.load_model(model_path)
    else:
        model = ft.train_unsupervised(file, **kwargs)
        model.save_model(model_path)
    return model
def build_unsupervised_model(data, kwargs):
    kwargs = default_kwargs(kwargs)
    with tempfile.NamedTemporaryFile(delete=False) as tmpf:
        for line in data:
            tmpf.write((line + "\n").encode("UTF-8"))
        tmpf.flush()
        model = train_unsupervised(input=tmpf.name, **kwargs)
    return model
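# --- Usage sketch (not from the original source) ---
# A minimal example of calling build_unsupervised_model above with an
# in-memory corpus. The sentences and hyperparameters are made up, and
# `default_kwargs` is assumed to fill in whatever defaults the snippet's
# codebase requires.
sentences = ["the quick brown fox", "jumps over the lazy dog"]
ft_model = build_unsupervised_model(sentences,
                                    {"model": "skipgram", "dim": 50,
                                     "minCount": 1})
print(ft_model.get_word_vector("fox")[:5])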
def __init__(self, pretrain=True, update=True, fasttext_corpus_path=None):
    if pretrain:
        self.model = fastText.load_model('resource/cc.en.300.bin')
        return
    model_path = re.sub(r'\.txt$', '.model', fasttext_corpus_path)
    if update or not os.path.exists(model_path):
        self.model = fastText.train_unsupervised(input=fasttext_corpus_path,
                                                 model='skipgram')
        self.model.save_model(model_path)
    else:
        self.model = fastText.load_model(model_path)
def train_embeddings(self, path_to_songs):
    """
    Train fastText embeddings based on an input file.

    :param path_to_songs: path to the training corpus
    :return:
    """
    self.model_ft = fastText.train_unsupervised(
        input=path_to_songs,
        model='skipgram',
    )
    self.model_ft.save_model(self.model_path + '1.bin')
def train_skipgram(processed_path):
    print("Train fastText...")
    model = train_unsupervised(input=processed_path + "/train_no_mark.txt",
                               model='skipgram', dim=100, minCount=5, ws=10,
                               lrUpdateRate=1000, epoch=50, thread=60)
    model.save_model(processed_path + '/fasttext.bin')
def unsupervised_trainer_for_germeval():
    """
    Trains fastText on the combined db + germeval data.
    :return:
    """
    input_file = os.path.join(*[os.path.curdir, 'dataset',
                                'combined_db_tweets_and_germeval_train.txt'])
    output_file = os.path.join(*[os.path.curdir, 'dataset',
                                 'germeval_100.de.bin'])
    # with open(input_file, 'r') as f:
    #     for line in f:
    #         print(fastText.tokenize(line))
    germeval_trained = fastText.train_unsupervised(input=input_file, lr=.05,
                                                   dim=100, epoch=100)
    germeval_trained.save_model(path=output_file)
def train_fasttext(data: pandas.DataFrame,
                   config: Optional[Mapping[str, Any]] = None) -> None:
    """
    Train a fasttext model on the given dataset of code identifiers.

    :param data: Dataframe with columns Columns.Split and Columns.Frequency.
    :param config: Parameters for training the model, options:
                   size: Number of identifiers to pick from the given data to \
                         train fasttext on.
                   corrupt: Value indicating whether to make random artificial \
                            typos in the training data. Identifiers are \
                            corrupted with `typo_probability`.
                   typo_probability: Token corruption probability if `corrupt == True`.
                   add_typo_probability: Probability of a second corruption in a \
                                         corrupted token. Used if `corrupt == True`.
                   path: Path where to store the trained fasttext model.
                   dim: Number of dimensions for embeddings in the new model.
                   bucket: Number of hash buckets to keep in the fasttext model: \
                           the fewer there are, the more compact the model gets.
                   adjust_frequencies: Whether to divide frequencies by the number \
                                       of tokens in the identifiers. Needs to be \
                                       done when the result of the `prepare` \
                                       function is used as data to have a true \
                                       identifiers distribution.
    """
    try:
        import fastText
    except ImportError:
        sys.exit("Please install fastText. "
                 "Run `pip3 install git+https://github.com/facebookresearch/fastText"
                 "@51e6738d734286251b6ad02e4fdbbcfe5b679382`")
    log = logging.getLogger("train_fasttext")
    if config is None:
        config = {}
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["fasttext"], config)
    tokens_number = data[Columns.Split].apply(lambda x: len(str(x).split()))
    if config["adjust_frequencies"]:
        weights = data[Columns.Frequency] / tokens_number
    else:
        weights = data[Columns.Frequency]
    train_data = data[tokens_number > 1].sample(config["size"], weights=weights,
                                                replace=True)
    if config["corrupt"]:
        train_data = corrupt_tokens_in_df(train_data,
                                          config["typo_probability"],
                                          config["add_typo_probability"])
    with tempfile.NamedTemporaryFile() as ids_file:
        with open(ids_file.name, "w") as f:
            for token_split in train_data[Columns.Split]:
                f.write(token_split + "\n")
        log.info("Training fasttext model...")
        model = fastText.train_unsupervised(ids_file.name, minCount=1, epoch=10,
                                            dim=config["dim"],
                                            bucket=config["bucket"])
    model.save_model(config["path"])
    log.info("fasttext model is saved to %s", config["path"])
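# --- Usage sketch (not from the original source) ---
# A hypothetical call to train_fasttext above. The `identifiers` dataframe is
# assumed to have the Columns.Split and Columns.Frequency columns the
# docstring requires, and the config values simply mirror the documented
# options; none of these concrete values come from the original snippet.
config = {
    "size": 100000,              # identifiers sampled for training
    "corrupt": True,             # inject artificial typos
    "typo_probability": 0.2,
    "add_typo_probability": 0.05,
    "path": "id_fasttext.bin",
    "dim": 100,
    "bucket": 200000,
    "adjust_frequencies": True,
}
train_fasttext(identifiers, config)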
def build_local_embeddings(self, corpus, path_to_resulting_embeddings):
    if not os.path.isdir('temp/'):
        os.mkdir('temp/')
    with open('temp/corpus.txt', 'w', encoding='utf-8') as f:
        f.writelines(corpus)
    ft = fastText.train_unsupervised('temp/corpus.txt', minCount=1)
    ft.save_model('temp/ft.bin')
    del ft
    self.compress_embeddings(corpus, path_to_resulting_embeddings, 'pca',
                             'temp/ft.bin')
    shutil.rmtree('temp/')
def train_w2v_model(data):
    """
    Train a w2v (skipgram) model using the fasttext package.

    Args:
        data: A path to the training data (String)
    """
    logger.info('Fasttext embeddings training...')
    try:
        model = train_unsupervised(input=data, model='skipgram', epoch=100,
                                   minCount=1, dim=100)
        model.save_model(str(path.join(data_dir, 'W2V_Models/model.bin')))
    except Exception as e:
        logger.error('Error: %s', str(e))
def search_hyperparameters(self, nb_trials: int, input_path: str,
                           props: str = 'w+l+t+m+n'):
    for trial in range(nb_trials):
        parameters = self.tuner.propose()
        pprint(parameters)
        # Construct and train the model.
        model = train_unsupervised(input=input_path, props=props, **parameters)
        # Track the results.
        score = fasttexteval.evaluate_model(
            model=model,
            word_pairs=self.train_word_pairs,
            gold_similarity=self.train_similarity)
        self.tuner.add(parameters, score)
        print(f'Evaluation score: {score}')
def test_unsup1(self):
    train, test, output = self.build_paths("fil9", "rw/rw.txt", "fil9")
    model = train_unsupervised(
        input=train,
        model="skipgram",
        lr=0.025,
        dim=100,
        ws=5,
        epoch=1,
        minCount=5,
        neg=5,
        loss="ns",
        bucket=2000000,
        minn=3,
        maxn=6,
        t=1e-4,
        lrUpdateRate=100,
        thread=self.num_thread(),
    )
    model.save_model(output)
    path_size = self.get_path_size(output)
    vectors = {}
    with open(test, 'r') as test_f:
        for line in test_f:
            query0 = line.split()[0].strip()
            query1 = line.split()[1].strip()
            vector0 = model.get_word_vector(query0)
            vector1 = model.get_word_vector(query1)
            vectors[query0] = vector0
            vectors[query1] = vector1
    dataset, correlation, oov = compute_similarity(None, test, vectors)
    correlation = np.around(correlation)
    self.assertTrue(correlation >= 41,
                    "Correlation: Want: 41 Is: " + str(correlation))
    self.assertEqual(oov, 0.0, "Oov: Want: 0 Is: " + str(oov))
    self.assertEqual(path_size, 978480868,
                     "Size: Want: 978480868 Is: " + str(path_size))
def train_embeddings(vocab, corpus, output_file, epoch=500, dim=100, min_count=1):
    ft = train_unsupervised('./data/' + corpus, epoch=epoch, dim=dim,
                            minCount=min_count)
    with open('./data/' + vocab, 'r', encoding='utf8') as f:
        words = f.read().splitlines()
    with open('./data/' + output_file, 'w', encoding='utf8') as f:
        f.write('%d %d\n' % (len(words), dim))
        for word in words:
            vec = ft.get_word_vector(word)
            out = word + ' ' + ' '.join(
                [str(x) for x in np.around(vec, decimals=4)]) + '\n'
            f.write(out)
def fit():
    """
    Train fastText on the corpus and save local_model.bin; use the
    preprocessed and tokenized pre_data.txt to build sentence vectors.
    :return: fit result
    """
    # Data preprocessing; returns id_url_df for joining results (id, url).
    id_url_df = data_process()
    print('fit: preprocessing and tokenization done')
    # fastText fit
    model = train_unsupervised(
        # input=os.path.join(config.jb_path, 'pre_data.txt'),
        input=os.path.join(config.jb_path, 'del_dp_text_data.txt'),
        model='skipgram',
        epoch=10,
        dim=300,
        # pretrainedVectors="{}/wiki.zh.vec".format(config.model_path),
        minCount=1
    )
    model.save_model("{}/local_model.bin".format(config.model_path))
    print('local_model.bin saved')

    vector_list = []
    with open(os.path.join('{}pre_data.txt'.format(config.jb_path))) as f:
        for line in f:
            line = line.replace('\n', '')
            vector = model.get_sentence_vector(line)
            vector_list.append(vector.tolist())

    # Assemble the pid -> vector dict.
    pid = id_url_df['id'].values
    pid_list = pid.tolist()
    id_vec_dict = dict(itertools.zip_longest(pid_list, vector_list))
    # Persist the pid/vector mapping as JSON.
    with open('{}id_vec_dict.json'.format(config.model_path), 'w') as outfile:
        json.dump(id_vec_dict, outfile)
    print('id_vec_dict.json saved')
    return 'fit success!!!'
def train_unsup():
    # train_file = open(fileConfig.dir_fasttext + fileConfig.file_fasttext_train_data,
    #                   mode='r', encoding='utf-8')
    # train_lines = []
    # for line in train_file:
    #     train_lines.append(line)
    print("start train unsupervised fasttext model")
    model = fastText.train_unsupervised(
        input=fileConfig.dir_fasttext + fileConfig.file_fasttext_unsup_train_data,
        model=fasttextConfig.choose_model,
        dim=128, minCount=3, wordNgrams=7, minn=2, maxn=6,
        lr=0.1, thread=8, epoch=25, loss='hs')
    model.save_model(
        fileConfig.dir_fasttext +
        fileConfig.file_fasttext_model.format(fasttextConfig.choose_model))
def train_fasttext(corpus, vocabulary, zero_init_indices=0,
                   rand_init_indices=1, embedding_dim=300):
    """Use fasttext to train on a corpus to obtain embeddings.

    Args:
        corpus: list of tokenized texts, corpus to train on
        vocabulary: dict, a mapping of words to indices
        zero_init_indices: int or a list, the indices which use
            zero-initialization. These indices usually represent the padding token.
        rand_init_indices: int or a list, the indices which use random
            initialization. These indices usually represent other special
            tokens, such as the "unk" token.
        embedding_dim: int, dimensionality of embedding

    Returns:
        np.array, a word embedding matrix.
    """
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(' '.join(sentence) + '\n')

    model = train_unsupervised(input=corpus_file_path, model='skipgram',
                               epoch=10, minCount=1, wordNgrams=3,
                               dim=embedding_dim)
    model_vocab = model.get_words()
    word_vectors = dict((w, model.get_word_vector(w)) for w in model_vocab)
    emb = filter_embeddings(word_vectors, embedding_dim, vocabulary,
                            zero_init_indices, rand_init_indices)
    os.remove(corpus_file_path)
    return emb
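# --- Usage sketch (not from the original source) ---
# A toy call to train_fasttext above; the corpus and vocabulary are made up,
# and `filter_embeddings` is assumed to come from the snippet's own codebase.
corpus = [['hello', 'world'], ['hello', 'fasttext']]
vocabulary = {'<pad>': 0, '<unk>': 1, 'hello': 2, 'world': 3, 'fasttext': 4}
emb_matrix = train_fasttext(corpus, vocabulary, zero_init_indices=0,
                            rand_init_indices=1, embedding_dim=100)
print(emb_matrix.shape)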
def compute_similarity(data_path):
    # (Reconstructed header: this fragment began mid-function in the source.
    # `model` and the `similarity` helper live at module level.)
    mysim = []
    gold = []
    with open(data_path, 'rb') as fin:
        for line in fin:
            tline = line.split()
            word1 = tline[0].lower()
            word2 = tline[1].lower()
            v1 = model.get_word_vector(word1)
            v2 = model.get_word_vector(word2)
            d = similarity(v1, v2)
            mysim.append(d)
            gold.append(float(tline[2]))
    corr = stats.spearmanr(mysim, gold)
    dataset = os.path.basename(data_path)
    correlation = corr[0] * 100
    return dataset, correlation, 0


if __name__ == "__main__":
    model = train_unsupervised(
        input=os.path.join(os.getenv("DATADIR", ''), 'fil9'),
        model='skipgram',
    )
    model.save_model("fil9.bin")
    dataset, corr, oov = compute_similarity('rw.txt')
    print("{0:20s}: {1:2.0f} (OOV: {2:2.0f}%)".format(dataset, corr, 0))
help="Output model filename", ) args = parser.parse_args() input_filename = args.input output_filename = args.output model = train_unsupervised( input=input_filename, model=model, lr=lr, dim=dim, ws=ws, epoch=epoch, minCount=minCount, minCountLabel=minCountLabel, minn=minn, maxn=maxn, neg=neg, wordNgrams=wordNgrams, loss=loss, bucket=bucket, thread=thread, lrUpdateRate=lrUpdateRate, t=t, verbose=verbose, ) model.save_model(output_filename)
def similarity(v1, v2):
    # (Reconstructed cosine-similarity helper: the source fragment began at
    # the `return` line, so the norm computations here are assumptions.)
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    return np.dot(v1, v2) / n1 / n2


def compute_similarity(data_path):
    mysim = []
    gold = []
    with open(data_path, 'rb') as fin:
        for line in fin:
            tline = line.split()
            word1 = tline[0].lower()
            word2 = tline[1].lower()
            v1 = model.get_word_vector(word1)
            v2 = model.get_word_vector(word2)
            d = similarity(v1, v2)
            mysim.append(d)
            gold.append(float(tline[2]))
    corr = stats.spearmanr(mysim, gold)
    dataset = os.path.basename(data_path)
    correlation = corr[0] * 100
    return dataset, correlation, 0


if __name__ == "__main__":
    model = train_unsupervised(
        input=os.path.join(os.getenv("DATADIR", ''), 'fil9'),
        model='skipgram',
    )
    dataset, corr, oov = compute_similarity('rw.txt')
    print("{0:20s}: {1:2.0f} (OOV: {2:2.0f}%)".format(dataset, corr, 0))
if __name__ == '__main__':
    import fastText
    from data_utils.constants import ALL_TEXTS, WORD_VEC_PATH

    model = fastText.train_unsupervised(ALL_TEXTS, model='cbow', lr=0.05,
                                        dim=300, ws=5, epoch=50, minCount=5,
                                        maxn=0)
    model.save_model(WORD_VEC_PATH)
print("CWD: {}".format(Path.cwd())) print("Creating temporary file.") temp_file = NamedTemporaryFile(mode="w+", delete=False) # type: TextIOWrapper print("Temporary file at: {}".format(temp_file.name)) print("Loading all programs' tokens") connection = sqlite3.connect(str(database_path)) cursor = connection.cursor() rows = cursor.execute("SELECT tokens FROM tagger").fetchall() print("Loading programs into temporary file") for sentence in rows: sentence = json.loads(sentence[0]) for token in sentence: # type: str temp_file.write(" " + token) temp_file.seek(0) print("Make fast-text model.") model = fastText.train_unsupervised( input=temp_file.name, #lr=0.1, epoch=500, minCount=4, model="skipgram", thread=18) # Save model model.save_model(str(Path(storage_folder, "model.bin")))
import fastText as ft


def make_wordsfile(document_list):
    """
    Build the text dataset used for fastText training.
    :param document_list: list of documents
    :return:
    """
    model_text = ""
    for document in document_list["news"]:
        model_text += " ".join(document) + "\n"
    with open("./model/for_papers/dataset.txt", "w") as w:
        w.write(model_text)

    model = ft.train_unsupervised(input="./model/for_papers/dataset.txt",
                                  model="skipgram", dim=200, ws=10,
                                  minCount=20, loss="ns", neg=10, epoch=25,
                                  thread=40, wordNgrams=1, t=1e-3)
    model.save_model("./model/for_papers/fastText.model")
    # model = ft.train_unsupervised(input="./model/for_papers/dataset.txt", model="skipgram", dim=200, ws=10,
    #                               minCount=20, loss="ns", neg=10, epoch=25, thread=40, wordNgrams=1, minn=3, maxn=6, t=1e-3)
    # model = ft.train_unsupervised(input="./data/fastText2013to2017_dataSet_2.txt", model="skipgram", dim=200, ws=10,
    #                               minCount=20, loss="ns", neg=10, epoch=25, thread=40, wordNgrams=2)
    # model = ft.train_unsupervised(input="./data/fastText2013to2017_dataSet.txt", model="skipgram", dim=100, ws=10,
    #                               minCount=20, epoch=25, thread=4, wordNgrams=2)
    # model.save("./model/fastText2013to2017.model")
import fastText

output = fastText.train_unsupervised("walk.txt", model='skipgram', lr=0.005,
                                     dim=100, ws=15, epoch=20, minCount=0,
                                     minCountLabel=0, minn=0, maxn=0, neg=10,
                                     wordNgrams=1, loss='ns', bucket=2000000,
                                     thread=4, lrUpdateRate=100, t=0.0001,
                                     label='__label__', verbose=2,
                                     pretrainedVectors='')
words = output.get_words()
output.save_model("embeddings_weights.bin")

with open("embeddings2_weights.txt", "w+") as f:
    for word in words:
        v = output.get_word_vector(word)
        vstr = ""
        for vi in v:
            vstr += " " + str(vi)
        f.write(word + vstr + "\n")
# Prediction
model = load_model("./cooking.bin")
print(model.predict("Which baking dish is best to bake a banana bread ?"))
print(model.predict("Why not put knives in the dishwasher?"))
# (('__label__baking',), array([0.35784602]))
# (('__label__equipment',), array([0.39477548]))

model = load_model("./cooking.ftz")
print(model.predict("Which baking dish is best to bake a banana bread ?"))
print(model.predict("Why not put knives in the dishwasher?"))
# (('__label__bread',), array([0.32475984]))
# (('__label__equipment',), array([0.49320737]))

# Unsupervised learning
model = train_unsupervised(input=train_data, model='skipgram')
model.save_model("cooking_uns.bin")
# Read 0M words
# Number of words: 2408
# Number of labels: 735
# Progress: 100.0% words/sec/thread: 82933 lr: 0.000000 loss: 2.764836 ETA: 0h 0m

# Inspect word vectors
model = load_model("cooking_uns.bin")
print("banana:", model.get_word_vector("banana"))
print("apple:", model.get_word_vector("apple"))
# The vectors are long; the format looks like:
# banana: [-1.87938347e-01 -4.34164740e-02  1.01463743e-01 -9.05684754e-02 ...]
# apple: [-1.83095217e-01 -4.92684692e-02  1.06943615e-01 -8.55036154e-02 ...]

# Inspect label frequencies
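# --- Supplementary sketch (not from the original source) ---
# The cooking.ftz file loaded above is a quantized (compressed) model. A
# hedged sketch of how such a file can be produced with fastText's quantize
# API; `cooking.train` is an assumed name for the supervised training file.
model = load_model("./cooking.bin")
model.quantize(input="cooking.train", qnorm=True, retrain=True, cutoff=100000)
model.save_model("./cooking.ftz")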
import os
import sys

import pandas as pd

import util.fast_text as ft
from fastText import train_unsupervised

if __name__ == "__main__":
    dataset = "data/unsupervised/seqlist_data_set.tsv"
    model_name = "model/rnasequences2vec.bin"

    print("Train...", dataset)
    model = train_unsupervised(input=dataset, model='cbow', lr=0.01, dim=200,
                               wordNgrams=4, minCount=1, epoch=10)
    print("Save...")
    model.save_model(model_name)
    print("Create .vec file...")
    ft.bin_to_vec(model_name)