def train_fasttext_win(inputPath='news_fasttext/news_fasttext_train.txt', savePath='model.m', label='__label__'):
    """Train a supervised fastText classifier, or load a cached one.

    Args:
        inputPath: path to the fastText-formatted training file.
        savePath: path where the trained model is cached / looked up.
        label: label prefix used in the training file.

    Returns:
        The trained (or loaded) fastText classifier.
    """
    if not os.path.exists(savePath):
        print('train model...')
        classifier = ff.train_supervised(inputPath, label=label)
        classifier.save_model(savePath)  # cache the model for later runs
    else:
        # BUG FIX: previously loaded the hard-coded 'model.m' regardless of
        # savePath; now the same path that was saved to is loaded.
        classifier = ff.load_model(savePath)
        print('loaded model...')
    return classifier
def train_fasttext(inputPath='train.txt', savePath='./model.m', label='__label__'):
    """Return a supervised fastText classifier.

    Loads the previously saved model when a file exists at ``savePath``;
    otherwise trains a new one from ``inputPath`` and saves it there.
    """
    if os.path.exists(savePath):
        classfication = ft.load_model(savePath)
        print('load model...')
    else:
        print('train...')
        classfication = ft.train_supervised(inputPath, label=label)
        classfication.save_model(savePath)
    return classfication
def create_predict(HudongItem_csv):
    """Append a fastText vector line to vector.txt for each unique title.

    Reads titles from the CSV's 'title' column, resolves each through the
    Neo4j database, looks the title up in the pre-trained wiki.zh model,
    and writes one space-separated "title v1 v2 ..." line per unique title.
    Each vector component is truncated to its first 7 characters of text.
    """
    db = Neo4j()
    db.connectDB()
    predict_List = readCSVbyColumn(HudongItem_csv, 'title')
    file_object = open('vector.txt', 'a')
    model = FastText('wiki.zh.bin')
    count = 0
    vis = set()  # titles already written, to skip duplicates
    for name in predict_List:
        cur = HudongItem(db.matchHudongItembyTitle(name))
        count += 1
        title = cur.title
        if title in vis:
            continue
        vis.add(title)
        wv_list = model[title]
        strr = str(title)
        print('===============')
        print(strr)
        for component in wv_list:
            strr += ' ' + str(component)[:7]
        print('===============')
        print(strr)
        print('===============')
        file_object.write(strr + "\n")
        print(str(count) + ' / ' + str(len(predict_List)))
    file_object.close()
def test():
    """Evaluate the saved jieba/fastText model on the v3 test set and print P@1."""
    source = 'jieba_v3_ft_model_2.ft'
    target = 'jieba_test_fasttext_v3.txt'
    model_file = os.path.join(const.MODELPATH, source)
    data_file = os.path.join(const.DATAPATH, target)
    clf = FT.load_model(model_file)
    size, precision, recall = clf.test(data_file)
    print('accuracy score : ', precision)
def get_fasttext_tweet_embedding(
        tweet_word_list: List[str],
        fasttext_model: fastText.FastText) -> np.ndarray:
    """Average the fastText word vectors of every token in a tweet.

    Args:
        tweet_word_list: tokenized tweet.
        fasttext_model: loaded fastText model exposing get_word_vector().

    Returns:
        Element-wise mean of the per-word embeddings (axis 0).
    """
    all_embeddings = [fasttext_model.get_word_vector(word)
                      for word in tweet_word_list]
    return np.mean(all_embeddings, axis=0)
def train_classifier(self):
    """Train the level-2 fastText classifier, save it, and log test metrics.

    Trains on self.data_path with the experimentally tuned hyper-parameters,
    saves the model under self.model_save_path, evaluates on the held-out
    file under self.test_save_path, prints the result, and appends it plus
    the elapsed time to a result file under self.result_save_path.
    """
    start = time.time()
    # Best hyper-parameters found experimentally
    # (earlier runs: epoch=20 -> 0.91, epoch=50 -> 0.93).
    classifier = ff.train_supervised(
        self.data_path, lr=0.1, loss='hs', wordNgrams=2, epoch=300)
    model = classifier.save_model(
        self.model_save_path + 'level_2_fasttext_classifier_big_big.model'
    )  # persist the model (earlier runs: all:0.91, all_2:0.93)
    classifier.get_labels()  # enumerate the labels
    # Evaluate the freshly trained model on the held-out test file.
    test_result = classifier.test(self.test_save_path + 'test_big.txt')
    result_str = 'test precision:{}\n'.format(test_result)
    print(result_str)
    load_time = round(time.time() - start, 3)
    train_time_str = 'train and test model time %fs' % load_time
    print(train_time_str)
    save_file(self.result_save_path + 'fasttext_result_big.txt',
              result_str + train_time_str + '\n', 'a')
def mode_predict_fasttext(config, input_path):
    """Predict labels for data_predict.txt with a trained fastText model.

    Writes one "label<TAB>probability" line per input line to
    output/predict_result.txt, with the __label__ prefix stripped.
    """
    input_path = os.path.join(input_path, 'data_predict.txt')
    ftmodel_path = os.path.join(config.log_root, config.model_name, "train", "ftmodel.bin")
    # Final destination of the predictions.
    output_path = os.path.join("output", "predict_result.txt")
    # Load the trained model.
    ftmodel = fasttext.load_model(ftmodel_path)
    print('开始预测数据')
    with open(input_path, 'r', encoding='utf8') as fin:
        lines = [line.strip() for line in fin.readlines()]
    if config.is_python_package:
        # The pure-python package exposes probabilities via predict_proba only.
        pred_pairs = ftmodel.predict_proba(lines)
        predict_list = [pair[0] for pair in pred_pairs]
    else:
        # Facebook's bindings return two parallel lists.
        pred_list, pred_prob_list = ftmodel.predict(lines)
        predict_list = [(labels[0], probs[0])
                        for labels, probs in zip(pred_list, pred_prob_list)]
    # predict_list now holds (label, probability) tuples.
    with open(output_path, "w", encoding="utf8") as fout:
        for pred, pred_prob in predict_list:
            label = re.split("__", pred)[-1]  # drop the __label__ prefix
            fout.write("%s\t%f\n" % (label, pred_prob))
    print('预测完成,已将预测值写入输出文件:', output_path)
def load_fasttext(self, model_name):
    """Load a saved fastText classifier from self.model_save_path and time it."""
    start = time.time()
    classifier = ff.load_model(self.model_save_path + model_name)
    elapsed = round(time.time() - start, 3)
    print('加载fasttext模型时间: %f' % elapsed)
    return classifier
def pre_load_fastText_model(self):
    """Pre-load the fastText classifier shipped next to this module.

    Switches the working directory to this file's directory first so the
    relative model path resolves, then stores the classifier on self.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(module_dir)
    self.classifier = ff.load_model("model_w2_e24")
def train():
    """Train a supervised fastText model, save it, and print test metrics.

    NOTE(review): this assumes the object returned by test() exposes
    .precision/.recall attributes; the official fastText bindings return a
    plain (N, precision, recall) tuple — confirm which package `ft` is.
    """
    classifier = ft.train_supervised(train_path)
    saved = classifier.save_model(model_path)
    metrics = classifier.test(test_path)
    print("准确率:", metrics.precision)
    print("回归率:", metrics.recall)
    classifier.get_labels()
def load_model(model_path):
    """Load a fastText classifier saved on Windows.

    Args:
        model_path: path to the saved model file.

    Returns:
        The loaded classifier.
    """
    # Windows build of the fastText bindings.
    import fastText.FastText as ff
    return ff.load_model(model_path)
def load_words_model(self, model_filename):
    """Load a fastText .bin model from self.data_dir and cache its vocabulary.

    Sets self.model, self.word_dim and self.words_list as side effects.
    """
    path = self.data_dir + model_filename + '.bin'
    self.model = fasttext.load_model(path)
    self.word_dim = self.model.get_dimension()
    self.words_list = list(self.model.get_words())
    print('Loaded. Dictionary size:%s' % '{:,}'.format(len(self.words_list)))
def trainUnSpv(path: str, n=1, model='skipgram'):
    """Train an unsupervised fastText embedding model.

    Args:
        path: training corpus file.
        n: wordNgrams setting.
        model: architecture name, e.g. 'skipgram' or 'cbow'.

    Returns:
        The trained model.
    """
    return FT.train_unsupervised(path, epoch=100, dim=100,
                                 wordNgrams=n, model=model)
def fastText_classifier(train_data, model_save_path):
    """Train the level-1 book-subject fastText classifier and print its result.

    Note: the model is evaluated on the same file it was trained on, so the
    printed metrics are training-set metrics.
    """
    train_file = train_data + 'book_sub_level_1_train.txt'
    classifier = ff.train_supervised(train_file, lr=0.1, loss='hs',
                                     wordNgrams=2, epoch=300)
    model = classifier.save_model(model_save_path + 'book_sub_classifier.model')
    classifier.get_labels()  # enumerate the labels
    result = classifier.test(train_file)
    print(result)
def trainFT(path: str, n=1):
    """Train a supervised fastText classifier with softmax loss.

    Args:
        path: fastText-formatted training file (labels prefixed __label__).
        n: wordNgrams setting.

    Returns:
        The trained classifier.
    """
    return FT.train_supervised(path, epoch=100, dim=100, wordNgrams=n,
                               label='__label__', loss='softmax')
def fasttext_train(self):
    """Train (or load) a fastText classifier and log its test metrics.

    Loads the cached model under self.home_data when it already exists,
    otherwise trains on fasttext_train.txt and saves it; then evaluates on
    fasttext_test.txt and prints/logs example count, precision and recall.
    """
    # BUG FIX: the import used to live inside the else-branch, so the
    # "model already exists" path raised NameError on `ft`.
    import fastText.FastText as ft
    model_file = os.path.join(self.home_data, 'fasttext.model')
    if os.path.exists(model_file):
        print("Fasttext模型已经存在,直接载入")
        # BUG FIX: use the same model_file path as the existence check
        # (previously rebuilt by string concatenation).
        classifier = ft.load_model(model_file)
    else:
        print("Fasttext模型不存在,训练")
        classifier = ft.train_supervised(self.home_data + "fasttext_train.txt")
        classifier.save_model(model_file)
        classifier = ft.load_model(model_file)  # reload the saved model
    result = classifier.test(self.home_data + "fasttext_test.txt")
    labels = classifier.get_labels()  # available labels
    print("测试实例数", result[0])   # number of examples
    print("准确率", result[1])       # precision
    print("召回率", result[2])       # recall
    logging.info('测试实例数 %s' % str(result[0]))
    logging.info('准确率 %s' % str(result[1]))
    logging.info('召回率 %s' % str(result[2]))
def load_model(model_path, os_name):
    """Load a fastText classifier with the right package for the current OS.

    Args:
        model_path: path to the saved model.
        os_name: os.name value; 'nt' selects the Windows fastText build.

    Returns:
        The loaded classifier.
    """
    if os_name == 'nt':
        # Windows build.
        import fastText.FastText as ff
        classifier = ff.load_model(model_path)
    else:
        # Linux build.
        import fasttext
        classifier = fasttext.load_model(model_path, label_prefix='__label__')
    return classifier
def fast_text_model(X_test):
    """Train a fastText text classifier and evaluate it.

    Args:
        X_test: texts to predict labels for.

    Returns:
        (label, prob, result): predicted labels, their probabilities, and
        the evaluation of the model on test.txt (size/precision/recall).
    """
    # Supervised training on the fastText-formatted training file.
    classifier = ff.train_supervised('train.txt', label='__label__')
    # Predicted labels and their probabilities for the given texts.
    label, prob = classifier.predict(X_test)
    # Held-out evaluation.
    result = classifier.test('test.txt')
    return label, prob, result
def mode_train_fasttext(config, _input_path):
    """Train a fastText classifier on data_train.txt and save the model.

    Uses either the python package or the compiled fasttext binary,
    depending on config.is_python_package; pretrained vectors come from
    word_vector.vec under the model's train directory.
    """
    input_path = os.path.join(_input_path, 'data_train.txt')
    train_dir = os.path.join(config.log_root, config.model_name, "train")
    output_path = os.path.join(train_dir, "ftmodel")
    pretrained_path = os.path.join(train_dir, "word_vector.vec")
    print('开始训练fasttext模型')
    if config.is_python_package:
        # The python package requires an explicit bucket size (known issue);
        # the saved model grows roughly linearly with the bucket count.
        fasttext.supervised(input_file=input_path,
                            output=output_path,
                            epoch=config.train_epoch,
                            lr=config.learning_rate,
                            dim=config.dim,
                            word_ngrams=config.wordNgrams,
                            loss=config.loss,
                            pretrained_vectors=pretrained_path,
                            bucket=200000)
    else:
        # Shell out to the compiled fasttext binary.
        train_command = (
            f'{fasttext_path}fasttext supervised '
            f'-input {input_path} -output {output_path} '
            f'-epoch {config.train_epoch} -lr {config.learning_rate} '
            f'-dim {config.dim} -wordNgrams {config.wordNgrams} '
            f'-loss {config.loss} -verbose {config.verbose} '
            f'-pretrainedVectors {pretrained_path}'
        )
        os.system(train_command)
    print('训练fasttext模型完毕')
def test_fasttext(train_path, test_path, model_save_path):
    """Train the A-set fastText classifier, save it, and print test metrics."""
    classifier = ff.train_supervised(train_path + 'A_train.txt',
                                     lr=0.1, loss='hs', wordNgrams=2, epoch=50)
    model = classifier.save_model(
        model_save_path + 'A_train_classifier2.model')  # persist the model
    classifier.get_labels()  # enumerate the labels
    result = classifier.test(test_path + 'A_test.txt')
    print(result)
def make_model(data_path, data_file_name, save_path, save_file_name, **kwargs):
    """Train an unsupervised fastText model and save its vectors.

    Args:
        data_path, data_file_name: directory and name of the training corpus.
        save_path, save_file_name: directory and name for the saved vectors.
        **kwargs: extra keyword arguments forwarded to train_unsupervised.

    Returns:
        The path of the saved .vec file.
    """
    total_data_path = os.path.join(data_path, data_file_name)
    total_save_path = os.path.join(save_path, save_file_name)
    assert os.path.isfile(total_data_path)
    start_time = strftime("%y%m%d-%H%M%S")
    print('모델생산 시작시간 : ', start_time)
    try:
        # BUG FIX: the old guard `kwargs.keys() == None` was always False
        # (dict_keys never equals None), so the no-kwargs branch was dead.
        # An empty **kwargs expands to nothing, so one call handles both.
        model = FastText.train_unsupervised(total_data_path, **kwargs)
        model.save_vectors(total_save_path)
    except Exception as e:
        print(e)
    finally:
        end_time = strftime("%y%m%d-%H%M%S")
        print('모델생산 종료시간 : ', end_time)
    return total_save_path + '.vec'
def fasttext_model_train():
    """Grid-train fastText classifiers over epochs 5..50 and ngrams 1..2.

    Each trained model is saved as Model/model_w{ngram}_e{epoch}, and the
    per-run wall-clock time is printed.
    """
    for epoch in range(5, 51):
        for ngram in range(1, 3):
            start_time = time.time()
            classifier = ff.train_supervised("fasttext.train", epoch=epoch,
                                             lr=0.5, wordNgrams=ngram)
            print("ngram=%d,训练第%d轮,用时%s" % (ngram, epoch, time.time() - start_time))
            classifier.save_model("Model/model_w" + str(ngram) + "_e" + str(epoch))
def mode_evaluate_fasttext(config, input_path):
    """Evaluate the trained fastText model on data_test.txt and print N/P@1/R@1."""
    input_path = os.path.join(input_path, 'data_test.txt')
    ftmodel_path = os.path.join(config.log_root, config.model_name, "train", "ftmodel.bin")
    if config.is_python_package:
        # Python-package path: load the model and call its test() method.
        # NOTE(review): assumes the result exposes .nexamples/.precision/
        # .recall attributes (old `fasttext` pip package) — confirm package.
        ftmodel = fasttext.load_model(ftmodel_path)
        test_log = ftmodel.test(input_path)
        print('测试集的测试结果:\nN\t{}\nP@1\t{}\nR@1\t{}\n'.format(
            test_log.nexamples, test_log.precision, test_log.recall))
    else:
        # Binary path: shell out to `fasttext test` and relay its output.
        test_command = '{}fasttext test {} {}'.format(
            fasttext_path, ftmodel_path, input_path)
        test_log = os.popen(test_command)
        print('测试集的测试结果:\n{}\n'.format(test_log.read()))
def load_fastText_vectors_char(model, vocab):
    """Build a {subword: vector} map from a fastText model for a vocabulary.

    Args:
        model: path to the fastText model file.
        vocab: iterable of subword strings (duplicates are looked up once).

    Returns:
        dict mapping each subword to its fastText word vector.
    """
    ft_model = FastText.load_model(model)
    word_vecs = {}
    for subword in vocab:
        if subword in word_vecs:
            continue  # already resolved
        word_vecs[subword] = ft_model.get_word_vector(subword)
    return word_vecs
def fastText_classifier(train_data, model_save_path, result_save_path):
    """Train the level-3 fastText classifier, save it, and record its metrics.

    Note: the model is evaluated on its own training file, so the recorded
    metrics are training-set metrics.
    """
    train_file = train_data + 'level_3_train.txt'
    classifier = ff.train_supervised(train_file, lr=0.1, loss='hs',
                                     wordNgrams=2, epoch=150)
    model = classifier.save_model(model_save_path + 'level_3_classifier.model')
    classifier.get_labels()  # enumerate the labels
    result = classifier.test(train_file)
    print(result)
    with open(result_save_path + 'train3_results.txt', 'w') as fp:
        fp.write(str(result))
        fp.write('\n')
def pretrained_vectors(config, input_path):
    """Pre-train skip-gram word vectors from data_train.txt (run once)."""
    input_path = os.path.join(input_path, 'data_train.txt')
    output_path = os.path.join(config.log_root, config.model_name, "train", "word_vector")
    print('开始预训练词向量(仅需要训练一次。一般来说需要一些时间..) ')
    if config.is_python_package:
        # Python package path.
        model = fasttext.skipgram(input_file=input_path, output=output_path,
                                  epoch=config.train_epoch, dim=config.dim)
    else:
        # Shell out to the compiled fasttext binary.
        train_command = (
            f'{fasttext_path}fasttext skipgram '
            f'-input {input_path} -output {output_path} '
            f'-epoch {config.train_epoch} -dim {config.dim} '
        )
        os.system(train_command)
    print('预训练词向量完毕')
def fastText_classifier(train_data, test_data, model_save_path, result_save_path):
    """Train one fastText classifier per level-1 category and record results.

    Walks paired entries of the train/test directories. A direct .txt pair
    is trained/tested as-is; otherwise the entry is treated as a subdirectory
    and its first file is used. Every (file, result) pair is printed and
    written to train_results.txt under result_save_path.
    """
    files = []
    results = []
    # Ensure all working directories exist.
    for directory in (train_data, model_save_path, result_save_path):
        if not os.path.exists(directory):
            os.makedirs(directory)
    for level_one, test_l1 in zip(os.listdir(train_data), os.listdir(test_data)):
        print(level_one + '-->' + test_l1)
        if '.txt' in level_one and '.txt' in test_l1:
            # BUG FIX: removed a stray full-width comma (、) inside this call
            # that was a syntax error.
            classifier = ff.train_supervised(train_data + level_one,
                                             lr=0.1, loss='hs',
                                             wordNgrams=2, epoch=100)
            classifier.save_model(model_save_path + level_one + '_classifier.model')
            classifier.get_labels()
            result = classifier.test(test_data + test_l1)
            files.append(level_one)
            results.append(result)
            print(result)
        else:
            data_list = os.listdir(train_data + level_one + '/')
            test_list = os.listdir(test_data + test_l1 + '/')
            # BUG FIX: previously checked len(test_data) — the directory path
            # string, always truthy — instead of the test file listing.
            if not data_list or not test_list:
                continue
            classifier = ff.train_supervised(train_data + level_one + '/' + data_list[0],
                                             lr=0.1, loss='hs',
                                             wordNgrams=2, epoch=50)
            classifier.save_model(model_save_path + level_one + '_classifier.model')
            classifier.get_labels()
            result = classifier.test(test_data + test_l1 + '/' + test_list[0])
            files.append(data_list[0])
            results.append(result)
            print(result)
    print(files)
    print(results)
    with open(result_save_path + 'train_results.txt', 'w') as fp:
        for name, res in zip(files, results):
            fp.write(str(name) + '-->' + str(res))
            fp.write('\n')
def sentence_input(sentence):
    """Classify a sentence's sentiment with the trained fastText model.

    Replaces emoticons with 积极/消极 markers, strips punctuation, segments
    with jieba, drops stop words, tags known sentiment words with #..# and
    *..* markers, and returns the top-3 predictions.

    Returns:
        List of (label, probability) tuples, highest probability first.
    """
    classifier = ff.load_model("fastText/model/train")
    # Load all lexicons once. (FIX: stop words were loaded twice and an
    # unused jieba.cut result was computed before preprocessing.)
    stop_words = load_stop_words()
    positive_emoticons = load_positive_emoticons()
    negative_emoticons = load_negative_emoticons()
    punctuation_words = load_punctuation_words()
    positive_words = load_positive_words()
    negative_words = load_negative_words()
    # Replace emoticons with their sentiment marker tokens.
    for p_word in positive_emoticons:
        if p_word in sentence:
            sentence = sentence.replace(p_word, " 积极 ")
    for p_word in negative_emoticons:
        if p_word in sentence:
            sentence = sentence.replace(p_word, " 消极 ")
    # Strip punctuation.
    for p_word in punctuation_words:
        if p_word in sentence:
            sentence = sentence.replace(p_word, " ")
    # Segment, drop stop words, and tag sentiment words.
    line = sentence.replace("蒙牛", "")
    seg_line = jieba.cut(line)
    add_str = ""
    for word in seg_line:
        if word not in stop_words and word != "" and " " not in word:
            if word in positive_words:
                add_str += "# " + word + " # "
            elif word in negative_words:
                add_str += "* " + word + " * "
            else:
                add_str += word + " "
    label = classifier.predict([add_str.strip()], k=3)
    result = [(str(i), j) for i, j in zip(label[0], label[1])]
    print(result)
    return result
def get_predict(attrs, model_path, string):
    """Predict one label per attribute for a jieba-segmented input string.

    Args:
        attrs: attribute names; each has a model at model_path + attr + "_model".
        model_path: directory prefix of the per-attribute models.
        string: raw input text, e.g. '我要一款内存大运行快的苹果电脑'.

    Returns:
        List of predicted label suffixes (text after '__label__'), one per attr.
    """
    # Segment the sentence into space-separated tokens.
    segmented = ' '.join(jieba.lcut(string)).strip()
    str_list = [segmented]
    print(str_list)
    attrs_predict = []
    for attr in attrs:
        print(attr, '预测:')
        classifier = ff.load_model(model_path + attr + "_model")
        # predict returns e.g. (['__label__2'], array([0.95])) for k=1.
        lab = classifier.predict(str_list, k=1)
        attrs_predict.append(lab[0][0].split('__')[2])
    print(attrs_predict)
    return attrs_predict
def train_words_model(self, corpus_filename, model_filename, model='skipgram', min_count=5):
    """Train an unsupervised fastText word model and save it.

    Args:
        corpus_filename: corpus file name, relative to self.data_dir.
        model_filename: output model file name, relative to self.data_dir.
        model: fastText architecture, 'skipgram' or 'cbow'.
               (BUG FIX: the default was misspelled 'skpigram', which
               fastText rejects as an unknown model name.)
        min_count: minimal number of word occurrences to keep a word.
    """
    corpus_filename = self.data_dir + corpus_filename
    model_filename = self.data_dir + model_filename
    print('Training for [%s] Model=%s Dim=%d MinCount=%d...' %
          (corpus_filename, model, self.word_dim, min_count))
    self.model = fasttext.train_unsupervised(input=corpus_filename, model=model,
                                             dim=self.word_dim, minCount=min_count)
    self.model.save_model(model_filename)
    self.words_list = list(self.model.get_words())
    print('Finished. Dictionary size:%s' % '{:,}'.format(len(self.model.get_words())))