def train_embedding_fasttext():
    """Train character-level and word-level embeddings with fasttext.

    Four trainings run in sequence: Skipgram and CBOW on
    ``CHAR_LEVEL_CORPUS``, then Skipgram and CBOW on ``WORD_LEVEL_CORPUS``.
    Each call is given an output path under ``MODEL_DIR``.  The returned
    model objects are not kept and the function returns ``None``.
    """
    # Hyperparameters shared by all four trainings.
    hyperparams = dict(word_ngrams=2, ws=5, min_count=10, dim=256)
    char_msg = 'generating CHAR embedding %s with fasttext using %s algorithm'
    word_msg = 'generating WORD embedding %s with fasttext using %s algorithm'

    # Character level, Skipgram.
    logging.info(char_msg, 'char2vec_fastskip256', 'Skipgram')
    fasttext.skipgram(CHAR_LEVEL_CORPUS,
                      os.path.join(MODEL_DIR, 'char2vec_fastskip256'),
                      **hyperparams)

    # Character level, CBOW.
    logging.info(char_msg, 'char2vec_fastcbow256', 'CBOW')
    fasttext.cbow(CHAR_LEVEL_CORPUS,
                  os.path.join(MODEL_DIR, 'char2vec_fastcbow256'),
                  **hyperparams)

    # Word level, Skipgram.
    logging.info(word_msg, 'word2vec_fastskip256', 'Skipgram')
    fasttext.skipgram(WORD_LEVEL_CORPUS,
                      os.path.join(MODEL_DIR, 'word2vec_fastskip256'),
                      **hyperparams)

    # Word level, CBOW.
    logging.info(word_msg, 'word2vec_fastcbow256', 'CBOW')
    fasttext.cbow(WORD_LEVEL_CORPUS,
                  os.path.join(MODEL_DIR, 'word2vec_fastcbow256'),
                  **hyperparams)
def execute():
    """Train a fastText model as described by the module-level ``args`` list.

    ``args`` must contain ``-i <training file>`` and ``-t <model type>``,
    where the model type is ``supervised``, ``skipgram`` or ``cbow``.  The
    optional ``--epoch``, ``--ngrams`` and ``--label`` settings are read
    through ``get_optional_param``.

    Returns:
        The name of the temporary file that was handed to fastText as the
        output path, or a string starting with ``"ERROR:"`` when a mandatory
        argument is missing or the model type is not recognised.
    """
    import os  # local import: only needed to close the mkstemp descriptor

    # Verify that mandatory arguments are present
    if "-i" not in args:
        return "ERROR: No input file was given"
    if "-t" not in args:
        return "ERROR: No model type was given"

    # Extract arguments
    train_file = args[args.index("-i") + 1]
    model_type = args[args.index("-t") + 1]

    # Reject unknown model types up front instead of returning the name of a
    # temporary file that no trainer ever wrote to.
    if model_type not in ("supervised", "skipgram", "cbow"):
        return "ERROR: Unknown model type '{}'".format(model_type)

    # Extract optional arguments
    epoch = get_optional_param('--epoch', 5)
    ngrams = get_optional_param('--ngrams', 1)
    label_prefix = get_optional_param('--label', '__label__')

    # mkstemp() returns an OPEN descriptor together with the path.  Only the
    # path is needed here, so close the descriptor immediately to avoid
    # leaking one per call.
    fd, modelname = tempfile.mkstemp()
    os.close(fd)

    # The three accepted type names match the fasttext function names.
    # NOTE(review): label_prefix is forwarded to skipgram/cbow exactly as the
    # original code did; confirm those trainers accept that keyword.
    trainer = getattr(fasttext, model_type)
    trainer(train_file, modelname, epoch=epoch, word_ngrams=ngrams,
            label_prefix=label_prefix)

    # Return the temporary file name
    return modelname
def train(self, txt_path, config=DEF_CONFIG):
    """Train the fastText model selected by ``self.mode`` and store it.

    Args:
        txt_path: Training text file passed to the fastText trainer.
        config: Keyword arguments forwarded unchanged to the trainer.

    ``self.mode`` may be ``"skipgram"``, ``"cbow"`` or ``"supervised"``; for
    any other value nothing is trained and ``self.model`` is left untouched.
    """
    mode = self.mode
    if mode == "skipgram":
        trainer = fasttext.skipgram
    elif mode == "cbow":
        trainer = fasttext.cbow
    elif mode == "supervised":
        trainer = fasttext.supervised
    else:
        # Unknown mode: silently do nothing, as before.
        return
    self.model = trainer(txt_path, self.model_path, **config)
def train_embedding_fasttext():
    """Train Skipgram and CBOW embeddings for the char and word corpora.

    Input and output paths are built by appending file names to the
    module-level ``model_dir`` string.  The trained model objects are not
    kept; the function returns ``None``.
    """
    # Hyperparameters shared by all four trainings.
    hyperparams = dict(word_ngrams=2, ws=5, min_count=10, dim=256)

    # Character corpus: Skipgram, then CBOW.
    fasttext.skipgram(model_dir + 'train_char.txt',
                      model_dir + 'char2vec_fastskip256',
                      **hyperparams)
    fasttext.cbow(model_dir + 'train_char.txt',
                  model_dir + 'char2vec_fastcbow256',
                  **hyperparams)

    # Word corpus: Skipgram, then CBOW.
    fasttext.skipgram(model_dir + 'train_word.txt',
                      model_dir + 'word2vec_fastskip256',
                      **hyperparams)
    fasttext.cbow(model_dir + 'train_word.txt',
                  model_dir + 'word2vec_fastcbow256',
                  **hyperparams)
def trainCBOW(inFile, mdlfile):
    """Train a fastText CBOW model on ``inFile`` with fixed hyperparameters.

    Args:
        inFile: Path passed as the trainer's ``input_file``.
        mdlfile: Path passed as the trainer's ``output``.

    The trained model object is not returned.
    """
    settings = {
        'lr': 0.1,
        'dim': 200,
        'epoch': 10,
        'word_ngrams': 3,
        'bucket': 5000000,
    }
    fasttext.cbow(input_file=inFile, output=mdlfile, **settings)
def fasttext_model_from_file(file_path):
    """Return the CBOW model for ``file_path``, loading a saved one if possible.

    The model location is ``const.GENERATED_DATA_DIR`` joined with
    ``const.FASTTEXT_PREFIX`` plus the base name of ``file_path``.  When
    ``fasttext.load_model`` raises ``ValueError`` for ``<location>.bin``, a new
    CBOW model is trained on ``file_path``.

    Args:
        file_path: Path of the training corpus.

    Returns:
        The loaded or freshly trained fastText model.
    """
    model_name = const.FASTTEXT_PREFIX + os.path.basename(file_path)
    save_file_name = os.path.join(const.GENERATED_DATA_DIR, model_name)
    try:
        model = fasttext.load_model(save_file_name + '.bin', encoding='utf-8')
        logging.info('model loaded:' + save_file_name)
    except ValueError:
        # Train to the SAME location the loader reads from.  The previous
        # code trained to the bare file name (without GENERATED_DATA_DIR), so
        # the saved model was never found on later calls and was retrained
        # every time.
        model = fasttext.cbow(file_path, save_file_name,
                              encoding='utf-8', min_count=1, lr=0.1)
    return model
def text2vec_fast(data_file, method='cbow', modelname='model', **kargs):
    """Clean ``data_file``, train fastText vectors on it and load them.

    Args:
        data_file: Raw input file handed to ``clean_data``.
        method: ``'skipgram'`` selects the Skipgram trainer; any other value
            selects CBOW.
        modelname: Output name passed to the trainer; the vectors are read
            back from ``modelname + '.vec'``.
        **kargs: Extra keyword arguments forwarded to the trainer.

    Returns:
        The ``(vector_dict, index_dict)`` pair produced by
        ``extract_word_vectors``.
    """
    clean_data(data_file, 'cleaned.txt')
    try:
        trainer = fasttext.skipgram if method == 'skipgram' else fasttext.cbow
        trainer('cleaned.txt', modelname, **kargs)
        vector_dict, index_dict = extract_word_vectors(modelname + '.vec')
    finally:
        # Remove the intermediate file even when training or vector
        # extraction fails; previously it was left behind on any error.
        os.remove('cleaned.txt')
    return vector_dict, index_dict
def run_model(db_path, model_path, model_type, model_params):
    """Train a fastText model on the ``preprocessed`` texts of an sqlite DB.

    Every non-NULL ``preprocessed`` value of table ``rss_data`` (newest
    ``cachedate`` first) is written, followed by ``'.'`` and a newline, to a
    temporary corpus file inside ``model_path``.  The model is trained on
    that file and the file is deleted afterwards.

    Args:
        db_path: Path of the sqlite database.
        model_path: Directory for the model output; created when missing.
        model_type: ``'cbow'`` selects CBOW; any other value selects Skipgram.
        model_params: Mapping with optional keys ``ngram``, ``dim``, ``ws``,
            ``epoch``, ``loss``, ``min_count`` and ``silent``.

    Returns:
        The trained fastText model.
    """
    print('Training FastText model...')
    # NOTE(review): `clock` comes from the enclosing module; if it is
    # `time.clock`, that function no longer exists on Python 3.8+ -- confirm.
    start_time = clock()

    temp_fpath = os.path.join(model_path, '__ft_temp.txt')
    model_fpath = os.path.join(model_path, 'fasttext_model')
    get_text_command = 'SELECT preprocessed FROM rss_data ORDER BY cachedate DESC'

    conn = sqlite3.connect(db_path)
    try:
        try:
            cursor = conn.cursor()
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            results = cursor.execute(get_text_command)
            with open(temp_fpath, 'w', newline='', encoding='utf-8') as tmpfile:
                for row in results:
                    if row[0] is not None:
                        tmpfile.write(row[0] + '.\n')
        finally:
            # Release the connection even if the query or the write fails.
            conn.close()

        train_kwargs = dict(
            word_ngrams=model_params.get('ngram', 1),
            dim=model_params.get('dim', 100),
            ws=model_params.get('ws', 5),
            epoch=model_params.get('epoch', 5),
            loss=model_params.get('loss', 'ns'),
            silent=model_params.get('silent', 1),
            min_count=model_params.get('min_count', 5),
        )
        trainer = fasttext.cbow if model_type == 'cbow' else fasttext.skipgram
        model = trainer(temp_fpath, model_fpath, **train_kwargs)
    finally:
        # Never leave the temporary corpus behind, whether training
        # succeeded or not.
        if os.path.exists(temp_fpath):
            os.remove(temp_fpath)

    end_time = clock()
    print('Model trained in {} seconds.'.format(int(end_time - start_time)))
    return model
def load_pretrained_fasttext():
    """Train a CBOW model, load a fastText-format model and query it.

    Returns:
        The result of ``most_similar(positive=['마법'])`` on the loaded model.

    Raises:
        Exception: Any error raised inside the ``try`` body is re-raised
            wrapped in a plain ``Exception``.
    """
    # Path to the FastText executable (not used further in this function).
    ft_home = '/home/dev/fastText/fasttext'
    # Training data path from the config (not used further in this function).
    train_file = config.pos_path

    corpus_path = '/home/dev/tensormsa_jupyter/chap05_nlp/wordembedding/data/test3.txt'
    pretrained_path = '/home/dev/tensormsa_jupyter/chap05_nlp/wordembedding/data/test3.model'
    try:
        print(help(fasttext))
        # NOTE(review): training writes to the output name 'model', while the
        # model loaded below comes from `pretrained_path` -- confirm that the
        # two are meant to be unrelated.
        fasttext.cbow(corpus_path, 'model')
        loaded = gft.FastText.load_fasttext_format(pretrained_path)
        return loaded.most_similar(positive=['마법'])
    except Exception as e:
        raise Exception(e)
def fasttext_vectoriser_cbow(data):
    """Train 200-dimensional CBOW vectors on ``data`` and return them as an array.

    Args:
        data: Training input passed straight to ``fasttext.cbow``.

    Returns:
        A NumPy array holding one vector per entry of ``model.words``,
        reshaped to the ``(n_sample_size, n_features)`` pair reported by
        ``size_of_vector_and_n_features_finder``.
    """
    model = fasttext.cbow(data, 'model', dim=200)
    # NOTE(review): `df` is not a parameter of this function, so it must be a
    # module-level name -- confirm that this is intended.
    rows, cols = size_of_vector_and_n_features_finder(df, model)

    vectors = np.asarray([model[word] for word in model.words])
    logger.info("previous array shape %s", vectors.shape)

    vectors = vectors.reshape(rows, cols)
    logger.info("reshaped array %s", vectors.shape)
    return vectors
def train(inp="wiki2.he.text", out_model="wiki.he.fasttext.model", alg="CBOW"):
    """Train a fastText model, print its vocabulary and the elapsed time.

    Args:
        inp: Training text file.
        out_model: Output name given to the trainer and to ``model.save``.
        alg: ``"skipgram"`` selects Skipgram; any other value, including the
            default ``"CBOW"``, selects CBOW.
    """
    started = time.time()
    trainer = fasttext.skipgram if alg == "skipgram" else fasttext.cbow
    model = trainer(inp, out_model)
    print(model.words)  # list of words in dictionary
    print(time.time() - started)
    model.save(out_model)
def create_embeddings(self, method='skipgram', data=None, overwrite=False,
                      dim=128, context_size=5, epochs=10, min_count=5):
    """Train word embeddings with fasttext, then load the resulting model.

    Training is skipped when ``self.ft_model_file`` already exists and
    ``overwrite`` is false; ``self.load_fasttext_model()`` is called either
    way.

    Args:
        method: Either ``'skipgram'`` or ``'cbow'``.
        data: Examples used to build the embeddings; defaults to
            ``self.data``.
        overwrite: Whether to retrain even if the model file exists.
        dim: Size of the word vectors.
        context_size: Size of the context window.
        epochs: Number of epochs.
        min_count: Minimal number of word occurrences needed for a vector.

    Raises:
        ValueError: If ``method`` is neither ``'skipgram'`` nor ``'cbow'``.
    """
    if data is None:
        data = self.data
    if method not in ('skipgram', 'cbow'):
        raise ValueError('Method must be skipgram or cbow')

    output_name = os.path.join(SAVE_LOCS['embeddings'], self.name)
    needs_training = overwrite or not os.path.isfile(self.ft_model_file)
    if needs_training:
        self._fasttext_preprocess(data)
        trainer = fasttext.skipgram if method == 'skipgram' else fasttext.cbow
        trainer(self.ft_input_file, output_name,
                dim=dim, ws=context_size, epoch=epochs, min_count=min_count)
    self.load_fasttext_model()
def fasttext_model(tweets, model="skipgram", tweets_col="clean_tweet", label_col="label"):
    """Write the labelled training file and train a fastText model on it.

    Args:
        tweets: Data handed to ``create_labeled_csv``.
        model: ``"skipgram"`` or ``"cbow"``.
        tweets_col: Text column name passed to ``create_labeled_csv``.
        label_col: Label column name passed to ``create_labeled_csv``.

    Returns:
        The trained model.  For any other ``model`` value nothing is trained
        and the argument itself is returned unchanged.
    """
    create_labeled_csv(tweets, tweets_col, label_col)
    if model == "skipgram":
        return fasttext.skipgram("data/labels_train.txt", 'model_skipgram')
    if model == "cbow":
        return fasttext.cbow("data/labels_train.txt", 'model_cbow')
    # Unrecognised value: hand the argument back as-is.
    return model
def load_fast_text_model(sentences):
    """Load ``fast_text_model.bin`` or train a CBOW model from ``sentences``.

    Args:
        sentences: Iterable of strings written to a temporary training file
            when no saved model can be loaded.

    Returns:
        The loaded or newly trained fastText model.
    """
    try:
        m = fasttext.load_model('fast_text_model.bin')
        print("trained model loaded")
        return m
    except Exception:
        # Was a bare `except:`, which also caught KeyboardInterrupt and
        # SystemExit and then started a training run.
        print("traning new model")

    # NOTE(review): sentences are written back to back with no separator;
    # presumably each one already ends with a newline -- confirm.
    with open('temp_file.txt', 'w') as temp_file:
        temp_file.writelines(sentences)
    try:
        m = fasttext.cbow('temp_file.txt', 'fast_text_model')
    finally:
        # Delete the temporary corpus even when training fails.
        remove('temp_file.txt')
    print('model trained')
    return m
def makeModel(self, method):
    """Train the Skipgram or CBOW model unless its ``.vec`` file already exists.

    Args:
        method: ``'skip'`` trains Skipgram into ``self.fileModelS`` and
            stores the model in ``self.modelSkip``; ``'cbow'`` trains CBOW
            into ``self.fileModelC`` and stores it in ``self.modelCbow``.
            Any other value trains nothing.

    Afterwards the ``.bin`` files next to both model paths are deleted when
    present, and the elapsed time is printed.
    """
    print('Creating ' + method + ' model...')
    time_begin = datetime.datetime.now()

    # Hyperparameters shared by both trainers.
    hyperparams = dict(lr=0.2, lr_update_rate=100, dim=600, ws=7, epoch=5,
                       min_count=3, neg=5, word_ngrams=3, bucket=2000000,
                       minn=3, maxn=6, thread=12, t=0.0001)

    if method == 'skip' and not os.path.isfile(self.fileModelS + '.vec'):
        self.modelSkip = fasttext.skipgram(self.fileData, self.fileModelS,
                                           **hyperparams)
    elif method == 'cbow' and not os.path.isfile(self.fileModelC + '.vec'):
        self.modelCbow = fasttext.cbow(self.fileData, self.fileModelC,
                                       **hyperparams)
    time_end = datetime.datetime.now()

    # The BIN files are not needed by this project, so drop them.
    for stem in (self.fileModelC, self.fileModelS):
        bin_path = stem + '.bin'
        if os.path.exists(bin_path):
            os.remove(bin_path)

    print("Finished Training %s in %s" % (method, str(time_end - time_begin)))
def createClassifier(self):
    """Build and train the fasttext classifier, stored in ``self.classifier``.

    Steps:
      1. Train CBOW vectors from ``self.combinedDataSaveFile`` unless
         ``self.saveVectorFile + '.vec'`` already exists.
      2. Create the classifier training file through ``DataSetCreator``
         unless ``self.trainClassifierFile`` already exists.
      3. Train a supervised model with output path ``self.classifierModel``,
         using the ``.vec`` file as pretrained vectors.
    """
    dataSetCreator = DataSetCreator()
    vector_file = self.saveVectorFile + '.vec'

    if not os.path.exists(vector_file):
        fasttext.cbow(self.combinedDataSaveFile, self.saveVectorFile,
                      dim=self.w2vdim, epoch=self.trainw2vepoch)

    if not os.path.exists(self.trainClassifierFile):
        dataSetCreator.dataSetforClassify(self.trainClassifierFile)

    self.classifier = fasttext.supervised(
        self.trainClassifierFile,
        self.classifierModel,
        dim=self.w2vdim,
        epoch=self.trainClassifierEpoch,
        label_prefix='__label__',
        pretrained_vectors=vector_file)
def test_train_cbow_model(self):
    """Train CBOW with explicit parameters and check the resulting model."""
    # Parameters, passed positionally to ft.cbow below.
    lr = 0.005
    dim = 10
    ws = 5
    epoch = 5
    min_count = 1
    neg = 5
    word_ngrams = 1
    loss = 'ns'
    bucket = 2000000
    minn = 3
    maxn = 6
    thread = 4
    lr_update_rate = 10000
    t = 1e-4
    silent = 1

    model = ft.cbow(input_file, output, lr, dim, ws, epoch, min_count,
                    neg, word_ngrams, loss, bucket, minn, maxn, thread,
                    lr_update_rate, t, silent)

    # The model must report the parameters it was trained with.
    expected_attributes = (
        ('dim', dim),
        ('ws', ws),
        ('epoch', epoch),
        ('min_count', min_count),
        ('neg', neg),
        ('loss_name', loss),
        ('bucket', bucket),
        ('minn', minn),
        ('maxn', maxn),
        ('lr_update_rate', lr_update_rate),
        ('t', t),
    )
    for attribute, value in expected_attributes:
        self.assertEqual(getattr(model, attribute), value)

    # Both output files must have been generated.
    for suffix in ('.bin', '.vec'):
        self.assertTrue(path.isfile(output + suffix))

    # Vectors must have the requested dimension.
    self.assertEqual(len(model['the']), dim)

    # Unicode words must be supported.
    unicode_str = 'Καλημέρα'
    self.assertTrue(unicode_str in model.words)
    self.assertTrue(unicode_str in model)
    self.assertEqual(len(model[unicode_str]), model.dim)
def train_fasttext_model():
    """Train Skipgram and CBOW models on one corpus and compare vocabularies."""
    corpus = "../data/880w_news_title_content_seg_sort_uniq_head_2.txt"

    # Skipgram model; per the original author this equals
    # `./fasttext skipgram -input <corpus> -output lxw_model_sg`
    # and generates ../data/lxw_model_sg.bin and ../data/lxw_model_sg.vec.
    sg_model = fasttext.skipgram(corpus, "../data/lxw_model_sg")
    print(sg_model.words)  # list of words in dictionary

    # CBOW model; per the original author this is the matching
    # `./fasttext cbow ...` invocation and generates
    # ../data/lxw_model_cbow.bin and ../data/lxw_model_cbow.vec.
    cbow_model = fasttext.cbow(corpus, "../data/lxw_model_cbow")
    print(cbow_model.words)  # list of words in dictionary
    print(type(cbow_model.words))  # original author's note: <class 'set'>

    # Original author's note: the two .vec files hold different vectors for
    # the same word.  The values printed below are set DIFFERENCES, although
    # the output label says "intersection".
    only_in_sg = sg_model.words - cbow_model.words
    print("intersection:{}".format(only_in_sg))
    only_in_cbow = cbow_model.words - sg_model.words
    print("intersection:{}".format(only_in_cbow))
def char_ft_main_more():
    """Train Skipgram and CBOW char embeddings for window sizes 3, 5 and 8.

    Each output name has the form ``data/char_ft_embed_<window>_<flag>.model``
    with flag ``1`` for Skipgram and ``0`` for CBOW.
    """
    import fasttext

    embed_file = 'data/char_embed_corpus.txt'
    # (flag used in the output name, fasttext trainer name), in run order.
    algorithms = ((1, 'skipgram'), (0, 'cbow'))
    for windows in [3, 5, 8]:
        for flag, trainer_name in algorithms:
            out_file = 'data/char_ft_embed_{}_{}.model'.format(windows, flag)
            getattr(fasttext, trainer_name)(embed_file, out_file, ws=windows,
                                            lr=0.1, dim=300, silent=0)
    print('done.')
def test_train_cbow_model_default(self):
    """CBOW trained without options must match the reference defaults.

    The reference values are read from ``params_txt`` through
    ``default_params.read_file``.
    """
    default_args = default_params.read_file(params_txt)
    model = ft.cbow(input_file, output)

    self.assertEqual(model.model_name, 'cbow')

    # (model attribute, key in the defaults, converter or None for raw value)
    checks = (
        ('dim', 'dim', int),
        ('ws', 'ws', int),
        ('epoch', 'epoch', int),
        ('min_count', 'minCount', int),
        ('neg', 'neg', int),
        ('word_ngrams', 'wordNgrams', int),
        ('loss_name', 'loss', None),
        ('bucket', 'bucket', int),
        ('minn', 'minn', int),
        ('maxn', 'maxn', int),
        ('lr_update_rate', 'lrUpdateRate', float),
        ('t', 't', float),
    )
    for attribute, key, convert in checks:
        reference = default_args[key]
        if convert is not None:
            reference = convert(reference)
        self.assertEqual(getattr(model, attribute), reference)
def train(self, input_filename=SENTENCES_FILENAME, model_filename=None,
          model_type='skipgram'):
    """Train a fasttext model.

    Parameters
    ----------
    input_filename : str, optional
        Filename for input file with sentences.
    model_filename : str, optional
        Filename for model output.  Defaults to the skipgram or cbow model
        filename constant that matches `model_type`.
    model_type : skipgram or cbow, optional
        Model type.

    Raises
    ------
    ValueError
        If `model_type` is neither 'skipgram' nor 'cbow'.

    """
    # Validate first.  Previously an invalid model_type combined with
    # model_filename=None reached self.full_filename(None) before the
    # ValueError below could be raised.
    if model_type == 'skipgram':
        trainer = fasttext.skipgram
    elif model_type == 'cbow':
        trainer = fasttext.cbow
    else:
        raise ValueError('Wrong argument to model_type')

    if model_filename is None:
        if model_type == 'skipgram':
            model_filename = FAST_TEXT_SKIPGRAM_MODEL_FILENAME
        else:
            model_filename = FAST_TEXT_CBOW_MODEL_FILENAME

    full_model_filename = self.full_filename(model_filename)
    full_input_filename = self.full_filename(input_filename)

    self.logger.info('Training fasttext %s model on %s to %s',
                     model_type, full_input_filename, full_model_filename)
    self.model = trainer(full_input_filename, full_model_filename)

    # Invalidate computed normalized matrix
    self._normalized_matrix = None
def train(self):
    """Train with Skipgram when ``self.sg`` is truthy, otherwise with CBOW.

    All hyperparameters are read from instance attributes and the trained
    model is stored in ``self.model``.
    """
    trainer = skipgram if self.sg else cbow
    self.model = trainer(
        self.src_file,
        self.model_file,
        dim=self.dim,
        ws=self.ws,
        min_count=self.min_count,
        lr=self.lr,
        lr_update_rate=self.lr_update_rate,
        epoch=self.epoch,
        word_ngrams=self.word_ngrams,
        thread=self.thread,
    )
def train_model(self, file_input='train_text.txt'):
    """Train the embedding model named by ``self.embedding_type``.

    :param file_input: file name of the training text inside ``DATA_DIR``
    :return: the trained model, which is also stored in ``self.model``
    :raises NotImplementedError: if ``self.embedding_type`` is neither
        ``'skipgram'`` nor ``'cbow'``
    """
    input_path = os.path.join(DATA_DIR, file_input)

    kind = self.embedding_type
    if kind == 'skipgram':
        trainer = fasttext.skipgram
    elif kind == 'cbow':
        trainer = fasttext.cbow
    else:
        raise NotImplementedError()

    model = trainer(input_path, self.model_path)
    print(model.words)  # list of words in dictionary
    self.model = model
    return model
# -*- coding: utf-8 -*- """ Created on Sat Sep 23 12:38:21 2017 @author: nabeel test classification with fast text """ import fasttext # Skipgram model model = fasttext.skipgram('data.txt', 'model') print model.words # list of words in dictionary # CBOW model model = fasttext.cbow('data.txt', 'model') print model.words # list of words in dictionary print model['king'] # get the vector of the word 'king' model = fasttext.load_model('model.bin') print model.words # list of words in dictionary print model['king'] # get the vector of the word 'king' classifier = fasttext.supervised('data.train.txt', 'model') classifier = fasttext.supervised('data.train.txt', 'model', label_prefix='__label__') result = classifier.test('test.txt') print 'P@1:', result.precision print 'R@1:', result.recall print 'Number of examples:', result.nexamples texts = ['example very long text 1', 'example very longtext 2']
def train(self):
    """Train a CBOW model on ``self.train_file`` and return it.

    ``self.model_path`` is passed to the trainer as the output path.
    """
    trainer = fasttext.cbow
    return trainer(self.train_file, self.model_path)
4: 'military', 5: 'sports' }
# (The mapping closed above starts before this fragment.)

# One pre-segmented sample document to classify
texts = [ '中新网 日电 2018 预赛 亚洲区 强赛 中国队 韩国队 较量 比赛 上半场 分钟 主场 作战 中国队 率先 打破 场上 僵局 利用 角球 机会 大宝 前点 攻门 得手 中国队 领先' ]
labels = classifier.predict(texts)
print(labels)
# Convert the first predicted label to int and look up its category name
print(label_to_cate[int(labels[0][0])])

# Top-K prediction results
labels = classifier.predict(texts, k=3)
print(labels)

import fasttext

# Skipgram model
model = fasttext.skipgram('unsupervised_train_data.txt', 'model')
print(model.words)

# CBOW model (same output name 'model' as the Skipgram run above)
model = fasttext.cbow('unsupervised_train_data.txt', 'model')
print(model.words)

# For comparison: gensim's word2vec
# NOTE(review): `Word2Vec` and `sentences` are not defined in this fragment.
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.save("gensim_word2vec.model")
model.wv['赛季']
model.wv.most_similar('赛季')
self.n_thread=th
self.samplet=t
self.silent=verbosec
self.enc=encoding

def fit(self, X, modelname='model'):
    """Train a CBOW model on ``X`` and store it in ``self.model``.

    Args:
        X: Training input passed to ``ft.cbow`` as the input file.
        modelname: Output name passed to ``ft.cbow``.

    Training only happens when ``csvflag`` is falsy.  Any error raised while
    training is reported with a printed message instead of propagating.
    Nothing is returned.

    To do: add option to feed list of X and Y or file; check options for the
    api call; write unit test.
    """
    try:
        # NOTE(review): `csvflag` is neither a parameter nor an attribute
        # here; it must exist at module level, otherwise the resulting
        # NameError is reported as a dataset format error below -- confirm.
        if not csvflag:
            self.model = ft.cbow(X, modelname,
                                 lr=self.lr,
                                 dim=self.dim,
                                 lr_update_rate=self.lr_update_rate,
                                 epoch=self.epoch,
                                 bucket=self.bucket,
                                 loss=self.loss,
                                 thread=self.n_thread)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit.
        print("Error in input dataset format")

def getproperties(self):
    """Print the model hyperparameters stored on this object.

    Returns:
        None.
    """
    print("The model has following hyperparameters as part of its specification")
    # Fixed: read the instance attribute instead of the undefined name `lr`.
    print("Learning rate :" + str(self.lr))
    print("Learning rate update after " + str(self.lr_update_rate) + "iterations")
    print("Embedding size: " + str(self.dim))
    # Fixed: missing closing parenthesis (a syntax error), and the attribute
    # is `epoch`, the name fit() reads.
    print("Epochs :" + str(self.epoch))
    # Fixed: wrap in str() so a numeric min_count can be concatenated.
    # NOTE(review): min_count and neg are assumed to be set in __init__,
    # which is not fully visible here -- confirm.
    print("minimal number of word occurences: " + str(self.min_count))
    print("number of negatives sampled :" + str(self.neg))
OUTPUT_PATH_CBOW = '/tmp/cbow' # Learn the word representation using skipgram model #skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM, lr=0.02, dim=300, ws=5, \ # epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6, \ # thread=4, t=1e-4, lr_update_rate=100) skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM) # Get the vector of some word print(skipgram['word']) # Learn the word representation using cbow model #cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW, lr=0.02, dim=300, ws=5, \ # epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6, \ # thread=4, t=1e-4, lr_update_rate=100) cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW) # Get the vector of some word print(cbow['word']) # Load pre-trained skipgram model SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin' skipgram = fasttext.load_model(SKIPGRAM_BIN) print(skipgram['word']) # Load pre-trained cbow model CBOW_BIN = OUTPUT_PATH_CBOW + '.bin' cbow = fasttext.load_model(CBOW_BIN) print(cbow['word'])
def buildEmbedding():
    """Write all texts to a corpus file and train 128-dim CBOW vectors on it.

    The texts come from ``getTextList`` applied to the four module-level
    data frames.  They are written one per line, UTF-8 encoded, to
    ``./dic&corp/text.txt``; the model output name is ``model.cbow``.
    Nothing is returned.
    """
    corpus_path = './dic&corp/text.txt'
    all_texts = getTextList(preTrainDF, preTestDF, semiTrainDF, semiTestDF)
    with codecs.open(corpus_path, 'w', 'utf-8') as corpus_file:
        for entry in all_texts:
            corpus_file.write(entry + '\n')
    fasttext.cbow(corpus_path, 'model.cbow', dim=128)
# Training corpus and the output names of the two models.
INPUT_TXT = '/path/to/file.txt'
OUTPUT_PATH_SKIPGRAM = '/tmp/skipgram'
OUTPUT_PATH_CBOW = '/tmp/cbow'

# Learn the word representation using skipgram model.
# Fixed: the output argument was the undefined name OUTPUT_PATH (NameError);
# the model is loaded back from OUTPUT_PATH_SKIPGRAM + '.bin' below.
skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM, lr=0.02, dim=300,
                             ws=5, epoch=1, min_count=5, neg=5, loss='ns',
                             bucket=2000000, minn=3, maxn=6, thread=4, t=1e-4,
                             lr_update_rate=100)

# Get the vector of some word.  print(...) with a single operand behaves the
# same on Python 2 and Python 3.
print(skipgram['word'])

# Learn the word representation using cbow model.
# Fixed: same undefined OUTPUT_PATH; the CBOW model is loaded back from
# OUTPUT_PATH_CBOW + '.bin' below.
cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW, lr=0.02, dim=300, ws=5,
                     epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000,
                     minn=3, maxn=6, thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(cbow['word'])

# Load pre-trained skipgram model
SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin'
skipgram = fasttext.load_model(SKIPGRAM_BIN)
print(skipgram['word'])

# Load pre-trained cbow model
CBOW_BIN = OUTPUT_PATH_CBOW + '.bin'
cbow = fasttext.load_model(CBOW_BIN)
print(cbow['word'])
# Build the fastText training corpus: one line per row of `merge`, made of
# the lower-cased category, description and name fields with non-ASCII
# characters dropped.
# NOTE(review): the fragment ends with a dangling ''' -- it may sit inside a
# triple-quoted string (disabled code); confirm before relying on it.
fonm = open('ftext_name.txt', 'w')
for nm, ct, ds in zip(merge.name_token.str.lower(),
                      merge.category_name_split.str.lower(),
                      merge.description_token.str.lower()):
    # NOTE(review): .encode() returns bytes on Python 3, where '%s' would
    # render them as b'...'; presumably written for Python 2 -- confirm.
    fonm.write(
        '%s %s %s\n' % (ct.encode('ascii', 'ignore').lower(), ds.encode(
            'ascii', 'ignore').lower(), nm.encode('ascii', 'ignore').lower()))
fonm.close()
print('[{}] Start fasttext training'.format(time.time() - start_time))

# Train 24-dimensional CBOW vectors; the output name is 'model'
model = fasttext.cbow('ftext_name.txt', 'model', dim=24, ws=4, lr=.05, min_count=1, thread=8, epoch=4, silent=0)
# Open 'model.bin' through the separate FastText class to get sentence vectors
modelcb = FastText('model.bin')
print('[{}] Start fasttext mat creation'.format(time.time() - start_time))

# One 24-dimensional sentence vector per row, built from "<category> <name>"
# with '/' in the category replaced by a space
ftmat = np.zeros((merge.shape[0], 24))
for c, vals in tqdm(enumerate(merge[['category_name', 'name']].values)):
    ftmat[c] = modelcb.get_numpy_sentence_vector(
        '%s %s' % (vals[0].replace('/', ' '), vals[1]))
ftmat = pd.DataFrame(ftmat)
print('[{}] Finished fasttext mat creation'.format(time.time() - start_time))
ftmat.head()
'''