def train(self, trainingfile):
    """Starts model building."""
    logger.info(
        f'Training started with: learningRate:{self.config.learningRate!s}, '
        f'epoch:{self.config.epoch!s}, ngrams:{self.config.ngrams!s}'
    )
    model = FastText()
    if self.supervised:
        model.supervised(input=trainingfile, output=self.filepath,
                         epoch=self.config.epoch, lr=self.config.learningRate,
                         wordNgrams=self.config.ngrams, verbose=2, minCount=1)
    elif self.config.method == "cbow":
        model.cbow(input=trainingfile, output='model',
                   epoch=self.config.epoch, lr=self.config.learningRate)
    else:
        model.skipgram(input=trainingfile, output='model',
                       epoch=self.config.epoch, lr=self.config.learningRate)
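# Usage sketch (not part of the class above): pyfasttext's supervised() writes
# `<output>.bin`, so the model saved by train() can be loaded back and used to
# score a labelled test file with predict_proba_file(), the same call used
# further below. The 'model.bin' / 'test.txt' paths here are assumptions.
from pyfasttext import FastText

classifier = FastText('model.bin')  # or: classifier = FastText(); classifier.load_model('model.bin')
print(classifier.labels)  # labels seen during supervised training
print(classifier.predict_proba_file('test.txt', k=2))  # top-2 (label, probability) pairs per line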
def train_pyfasttext_model():
    # Skipgram model
    model_sg = FastText()
    # equivalent to: `./fasttext skipgram -input ../data/880w_news_title_content_seg_sort_uniq_head_2.txt -output lxw_model_sg_pyfasttext`
    model_sg.skipgram(
        input="../data/880w_news_title_content_seg_sort_uniq_head_2.txt",
        output="../data/lxw_model_sg_pyfasttext")
    # automatically generates the files ../data/lxw_model_sg_pyfasttext.bin and ../data/lxw_model_sg_pyfasttext.vec
    print(model_sg.words)  # list of words in dictionary

    # CBOW model
    model_cbow = FastText()
    # equivalent to: `./fasttext cbow -input ../data/880w_news_title_content_seg_sort_uniq_head_2.txt -output lxw_model_cbow_pyfasttext`
    model_cbow.cbow(
        input="../data/880w_news_title_content_seg_sort_uniq_head_2.txt",
        output="../data/lxw_model_cbow_pyfasttext")
    # automatically generates the files ../data/lxw_model_cbow_pyfasttext.bin and ../data/lxw_model_cbow_pyfasttext.vec
    print(model_cbow.words)  # list of words in dictionary
    print(type(model_cbow.words))  # <class 'list'>
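# Usage sketch: the skipgram/cbow calls above leave `.bin` and `.vec` files on
# disk, so a later process can reload the `.bin` and query it with the same
# calls used elsewhere in this file. The path below assumes the skipgram output
# written by train_pyfasttext_model().
from pyfasttext import FastText

model = FastText("../data/lxw_model_sg_pyfasttext.bin")
print(len(model.words))                      # vocabulary size
print(model['贷款'])                          # word vector as a plain list of floats
print(model.nearest_neighbors('贷款', k=5))   # (word, similarity) pairs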
# for word in skip_gram_model.words:
#     print(word, skip_gram_model[word])
print(skip_gram_model.nearest_neighbors('贷款', k=2))
# if the test data is stored inside a file, use this:
# skip_gram_model.predict_proba_file('./test.txt', k=2)
print("\n")

#############################
# Train with the CBOW model #
#############################
cbow_model = FastText()
cbow_model.cbow(input='./train.txt', output='cbow_model', epoch=100, lr=0.7)
print(cbow_model['贷款'])
# print(cbow_model.get_numpy_vector('贷款'))
# print(cbow_model.get_numpy_vector('贷款', normalized=True))
var1 = cbow_model.get_numpy_vector('人民币')
var2 = cbow_model.get_numpy_vector('贷款')
var3 = cbow_model.get_numpy_vector('外币')
# word analogy: which word's vector is closest to 人民币 + 贷款 - 外币?
print(cbow_model.words_for_vector(var1 + var2 - var3, k=1))
# for word in cbow_model.words:
#     print(word, cbow_model[word])
print(cbow_model.nearest_neighbors('贷款', k=2))
# if the test data is stored inside a file, use this:
# cbow_model.predict_proba_file('./test.txt', k=2)
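# Illustrative sketch (an approximation, not the library's exact implementation):
# words_for_vector(var1 + var2 - var3, k=1) returns the vocabulary word whose
# vector is closest to the composed vector 人民币 + 贷款 - 外币. The same ranking
# can be reproduced by hand with numpy, using only calls already shown above and
# assuming cbow_model, var1, var2, var3 from the preceding snippet are in scope.
import numpy as np

target = var1 + var2 - var3

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-9))

scores = [(w, cosine(cbow_model.get_numpy_vector(w), target)) for w in cbow_model.words]
print(sorted(scores, key=lambda s: s[1], reverse=True)[:3])  # top-3 closest words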