def train_model(ipt=None, opt=None, model='', dim=128, epoch=5, lr=0.1, loss='softmax'):
    np.set_printoptions(suppress=True)
    if os.path.isfile(model):
        classifier = fasttext.load_model(model)
    else:
        classifier = fasttext.train_supervised(ipt, label='__label__', dim=dim,
                                               epoch=epoch, lr=lr, wordNgrams=2,
                                               loss=loss)
        classifier.save_model(opt)
    return classifier
def get_embeddings_with_timedeltas_per_block(data: defaultdict, model: fasttext.FastText) -> List:
    embeddings = []
    for logs in data.values():
        # column 0 holds the time delta, columns 1..dim hold the sentence vector
        numpy_block = np.zeros(shape=(len(logs), model.get_dimension() + 1), dtype=np.float32)
        for i, log in enumerate(logs):
            numpy_block[i, 1:] = model.get_sentence_vector(log.rstrip())
        numpy_block[:, 0] = get_timedeltas(logs)
        embeddings.append(numpy_block)
    return embeddings
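# `get_timedeltas` is called above but not defined in this section. A minimal
# sketch under an assumed log format: each log line is taken to begin with an
# ISO-8601 timestamp (that format is an assumption, not from the source).
from datetime import datetime

def get_timedeltas(logs):
    # Hypothetical helper: parse the leading timestamp of each log line and
    # return the seconds elapsed since the previous line (0.0 for the first).
    timestamps = [datetime.fromisoformat(log.split(' ', 1)[0]) for log in logs]
    deltas = [0.0]
    for prev, cur in zip(timestamps, timestamps[1:]):
        deltas.append((cur - prev).total_seconds())
    return deltas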
def get_model(self):
    """Get the model: load the fastText vectorizer if it exists, otherwise train and save it."""
    model_path = config.fasttext_vectorizer_path
    if os.path.exists(model_path):
        model = FastText.load_model(model_path)
    else:
        recall_corpus = config.recall_corpus
        model = FastText.train_unsupervised(input=recall_corpus, minCount=1,
                                            wordNgrams=2, epoch=20)
        model.save_model(model_path)
    return model
def load_pretrained(name, word_to_ix, embed_size=128):
    import fasttext.FastText as fasttext
    model = fasttext.load_model(os.path.join(MODELPATH, '{}.bin'.format(name)))
    embedding_matrix = np.zeros((len(word_to_ix), embed_size))
    for w, ix in word_to_ix.items():
        embedding_matrix[ix] = model[w]
    return embedding_matrix
def train():
    try:
        if not os.path.isfile(prepared_data):
            prepera_data(training_data_path, prepared_data)
        if not os.path.isfile(prepared_test):
            prepera_data(test_data_path, prepared_test)

        hyper_params = {"lr": 0.01, "epoch": 20, "wordNgrams": 2, "dim": 20}
        # Train the model.
        model = fastText.train_supervised(input=prepared_data, **hyper_params)
        print("Model trained with the hyperparameters\n {}".format(hyper_params))

        # Check performance on the held-out test set.
        result = model.test(prepared_test)

        # Display accuracy (precision@1) of the trained model.
        text_line = str(hyper_params) + ", accuracy: " + str(result[1]) + '\n'
        print(text_line)
        model.save_model(os.path.join(model_path, model_name + ".ftz"))
    except Exception as ex:
        print('Exception: ' + str(ex))
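# `prepera_data` (sic) is called above but not defined here. A minimal sketch,
# assuming the raw data is a tab-separated "label<TAB>text" file and the output
# is fastText's "__label__X text" format; both assumptions, not from the source.
def prepera_data(src_path, dst_path):
    # Hypothetical helper: convert "label<TAB>text" lines into fastText's
    # supervised-training format.
    with open(src_path, 'r', encoding='utf-8') as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        for line in src:
            label, text = line.rstrip('\n').split('\t', 1)
            dst.write('__label__{} {}\n'.format(label, text))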
def train(self):
    classifier = ff.train_supervised(
        input=self.train_data_path,
        label="__label__",
        epoch=self.epoch,
        pretrainedVectors=self.pretrainedVectors,
        lr=self.lr,
        dim=self.dim)
    classifier.save_model(self.save_model_path)

    train_result = classifier.test(self.train_data_path)
    print("### TRAIN RESULT ###")
    print("Train Samples: {}".format(train_result[0]))
    print("Train Precision: {}".format(train_result[1]))
    print("Train Recall: {}\n\n".format(train_result[2]))

    if self.test_data_path:
        test_result = classifier.test(self.test_data_path)
        print("### TEST RESULT ###")
        print("Test Samples: {}".format(test_result[0]))
        print("Test Precision: {}".format(test_result[1]))
        print("Test Recall: {}\n\n".format(test_result[2]))
    print("model saved to {}".format(self.save_model_path))
def __init__(self, save_model_path=None, train_data_path=None, test_data_path=None, type="train", k=1, threshold=0.0): """ fasttext 文本分类 :param train_data_path: 训练文本路径 :param save_model_path: 模型保存路径 :param test_data_path: 测试文本路径, 默认为 None :param type: 运行模式,“train/prediict” """ self.train_data_path = train_data_path self.test_data_path = test_data_path self.save_model_path = save_model_path self.type = type self.k = k self.threshold = threshold if self.type == "predict": if not os.path.exists(self.save_model_path): print("MODEL: {} is not EXIST ....") sys.exit() print("LOAD MODEL FROM: {}".format(self.save_model_path)) self.classifier = ff.load_model(self.save_model_path) else: self.classifier = None
def get_embeddings_per_log(data: defaultdict, model: fasttext.FastText) -> np.ndarray:
    # create one embedding per log, first stripping the trailing '\n' (newline character)
    embeddings = [
        model.get_sentence_vector(log.rstrip())
        for logs in data.values()
        for log in logs
    ]
    return np.asarray(embeddings)
def train_model(ipt=None, opt=None, model='', dim=100, epoch=5, lr=0.1, loss='softmax'):
    """
    Train a supervised model and return the model object.
    @param input: path to the training data file
    @param lr: learning rate
    @param dim: vector dimension
    @param ws: window size, used with the cbow model
    @param epoch: number of epochs
    @param minCount: word-frequency threshold; words below it are filtered out at initialization
    @param minCountLabel: label threshold; labels below it are filtered out at initialization
    @param minn: minimum number of chars when building subwords
    @param maxn: maximum number of chars when building subwords
    @param neg: number of negative samples
    @param wordNgrams: n-gram size
    @param loss: loss function type: softmax; ns: negative sampling; hs: hierarchical softmax
    @param bucket: bucket size for word expansion, [A, B]: A for word vectors in the corpus, B for those not in the corpus
    @param thread: number of threads; each thread processes a chunk of the input, and thread 0 reports the loss
    @param lrUpdateRate: learning-rate update rate
    @param t: negative-sampling threshold
    @param label: label prefix
    @param verbose: verbosity level
    @param pretrainedVectors: path to a pretrained word-vector file; words found in it are not randomly initialized
    @return: model object
    """
    np.set_printoptions(suppress=True)
    if os.path.isfile(model):
        classifier = fasttext.load_model(model)
    else:
        classifier = fasttext.train_supervised(ipt, label='__label__', dim=dim,
                                               epoch=epoch, lr=lr, wordNgrams=2,
                                               loss=loss)
        classifier.save_model(opt)
    return classifier
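# Usage sketch for train_model above (the file names and the sample text are
# placeholders, not from the source):
clf = train_model(ipt='train.txt', opt='classifier.bin', model='classifier.bin')
# predict returns (labels, probabilities); k controls how many labels come back
labels, probs = clf.predict('some preprocessed text', k=2)
print(labels, probs)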
def load_model(self):
    """
    Load the fastText model, training one first if none exists.
    :return:
    """
    if os.path.exists(self.model_path):
        self.classifier = FastText.load_model(self.model_path)
    else:
        print("no such model, training now")
        self.train_model()
def __init__(self):
    jieba.enable_paddle()
    with open("data/stop_words_utf-8.txt", mode='r', encoding='utf-8') as f:
        self.stop_words = [line.strip() for line in f]
    print("Stopwords loaded successfully")
    self.classifier = fasttext.load_model(
        'model/data_dim100_lr00.5_iter5.model')
    print("Model loaded successfully")
def get_embeddings_per_block(data: defaultdict, model: fasttext.FastText,
                             with_timedelta: bool) -> List:
    # create embeddings per block, first stripping the trailing '\n' (newline character)
    if with_timedelta:
        embeddings = get_embeddings_with_timedeltas_per_block(data, model)
    else:
        embeddings = [
            np.asarray(
                [model.get_sentence_vector(log.rstrip()) for log in logs])
            for logs in data.values()
        ]
    return embeddings
def fasttext_word_norm(token: str, model: fasttext.FastText) -> Optional[np.array]:
    # divide each word vector by its L2 norm;
    # we will only sum over vectors with positive L2 norms
    raw_word_vec = model.get_word_vector(token)
    norm = np.linalg.norm(raw_word_vec)
    if norm > 0:
        return raw_word_vec / norm
    else:
        # these Nones are dropped in get_fasttext_sentence_vector
        # so that we don't sum over vectors with zero norm
        return None
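# The comment above refers to get_fasttext_sentence_vector, which is not shown
# in this section. A minimal sketch of what it plausibly does, given that
# comment (the averaging scheme is an assumption, not from the source; assumes
# numpy is imported as np, as in the snippets above):
def get_fasttext_sentence_vector(tokens, model):
    # Hypothetical companion to fasttext_word_norm: average the normalized
    # word vectors, dropping the Nones returned for zero-norm words.
    vecs = [v for v in (fasttext_word_norm(t, model) for t in tokens) if v is not None]
    if not vecs:
        return np.zeros(model.get_dimension(), dtype=np.float32)
    return np.mean(vecs, axis=0)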
def train_model(self):
    if not Path(self.train_data_path).exists():
        self.preprocessor.preprocess()
    classifier = FastText.train_supervised(input=self.train_data_path, lr=0.25, ws=4)
    classifier.save_model(self.model_path)
    self.classifier = classifier
    print("test result on training data:")
    result = classifier.test(self.train_data_path)
    print(result)
    print("test result on testing data:")
    result = classifier.test(self.test_data_path)
    print(result)
def __init__(self, save_model_path=None, train_data_path=None, test_data_path=None, type="train", k=1, threshold=0.0, epoch=5, pretrainedVectors="", label="__label__", lr=0.1, dim=100): """ fasttext :param save_model_path: 模型保存路径 :param train_data_path: 训练样本路径 :param test_data_path: 测试样本路径 :param type: 模式:“train/predict” :param k: 返回结果个数 :param threshold: 阈值 :param epoch: 训练轮数 :param pretrainedVectors: 预训练词向量路径 :param label: 标签前缀 :param lr: 学习率 :param dim: 词向量维度 """ self.train_data_path = train_data_path self.test_data_path = test_data_path self.save_model_path = save_model_path self.type = type self.k = k self.threshold = threshold self.epoch = epoch self.pretrainedVectors = pretrainedVectors self.label = label self.lr = lr self.dim = dim if self.type == "predict": if not os.path.exists(self.save_model_path): print("MODEL: {} is not EXIST ....") sys.exit() print("LOAD MODEL FROM: {}".format(self.save_model_path)) self.classifier = ff.load_model(self.save_model_path) else: self.classifier = None
def load_pretrained_embedding_from_fasttext_cc_model(self, path, freeze):
    self.model = FastText.load_model(path)
    self.embedding_dim = self.model.get_dimension()
    self.words = [*self.model.words, NUM, PUNCTUATION]
    self.word2index = {word: i for i, word in enumerate(self.words)}
    self.unk_index = len(self.words)
    self.padding_index = len(self.words) + 1
    self.embeddings = nn.Embedding.from_pretrained(
        torch.cat([
            torch.tensor(self.model.get_input_matrix()),
            # three extra uniformly initialized rows
            torch.rand(3, self.embedding_dim, dtype=torch.float).uniform_(
                -math.sqrt(3 / self.embedding_dim),
                math.sqrt(3 / self.embedding_dim)),
            # one zero row, used for padding
            torch.zeros(1, self.embedding_dim, dtype=torch.float)
        ]),
        padding_idx=self.padding_index,
        freeze=freeze)
def __init__(self, fasttextmodel: fasttext.FastText):
    super(FastTextTorch, self).__init__()
    self.fasttextmodel = fasttextmodel
    weights = torch.from_numpy(fasttextmodel.get_input_matrix())
    # Note: `vocab_size` is the size of the actual fastText vocabulary. In practice, the
    # embeddings here have two more tokens in their vocabulary: one for padding (embedding fixed
    # at 0, since the padding embedding never receives a gradient in `nn.Embedding`) and one for
    # the special (root) tokens, with values sampled across the vocabulary
    self.vocab_size, self.embedding_size = weights.shape
    # build the root embedding by sampling each dimension from a random row of the weight matrix
    root_embedding = weights[
        torch.randint(high=self.vocab_size, size=(self.embedding_size,)),
        torch.arange(self.embedding_size),
    ].unsqueeze(0)
    weights = torch.cat(
        (weights, torch.zeros((1, self.embedding_size)), root_embedding),
        dim=0).to(torch.float)
    weights.requires_grad = True
    self.embeddings = nn.Embedding.from_pretrained(
        weights, padding_idx=self.vocab_size + 1)
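# A hypothetical usage sketch for the module above (not from the source): look
# up the subword rows for a token via fastText's get_subwords and mean-pool
# them through the embedding table, mirroring how fastText composes word
# vectors from its input matrix.
def embed_word(module, word):
    _, subword_indices = module.fasttextmodel.get_subwords(word)
    idx = torch.from_numpy(subword_indices)
    return module.embeddings(idx).mean(dim=0)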
def train(self):
    classifier = ff.train_supervised(self.train_data_path, label="__label__")
    classifier.save_model(self.save_model_path)

    train_result = classifier.test(self.train_data_path)
    print("### TRAIN RESULT ###")
    print("Train Samples: {}".format(train_result[0]))
    print("Train Precision: {}".format(train_result[1]))
    print("Train Recall: {}\n\n".format(train_result[2]))

    if self.test_data_path:
        test_result = classifier.test(self.test_data_path)
        print("### TEST RESULT ###")
        print("Test Samples: {}".format(test_result[0]))
        print("Test Precision: {}".format(test_result[1]))
        print("Test Recall: {}\n\n".format(test_result[2]))
    print("model saved to {}".format(self.save_model_path))
def question_classifier_test():
    """
    Question-classification test.
    :return:
    """
    # load the stopword list
    stop_words = load_stop_word_list("stopwords.txt")
    label_to_name = load_label_name_map()[0]
    classifier = ff.load_model("Model/model_w" + str(wordNgrams) + "_e" + str(epoch))
    while True:
        input_ = input("question:")
        seg_line = jieba.cut(input_)
        add_str = ""
        for word in seg_line:
            if word not in stop_words:
                add_str += word + " "
        predict = classifier.predict(add_str.strip(), 3)
        print(predict)
        for label in predict[0]:
            print(label_to_name[label])
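# Neither load_stop_word_list nor load_label_name_map is defined in this
# section. Minimal sketches under assumed file formats (one stopword per line;
# a "label<TAB>name" map file); both formats are assumptions, not from the
# source. load_label_name_map returns a pair so that indexing with [0], as
# above, yields the label-to-name dict.
def load_stop_word_list(path):
    with open(path, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f)

def load_label_name_map(path="label_name_map.txt"):
    label_to_name, name_to_label = {}, {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            label, name = line.strip().split('\t')
            label_to_name[label] = name
            name_to_label[name] = label
    return label_to_name, name_to_label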
def train_model(self):
    """
    Train the fastText model.
    :return:
    """
    classifier = FastText.train_supervised(input=self.train_data_path, lr=1,
                                           ws=4, loss="hs", epoch=25)
    # classifier = FastText.train_supervised(input=self.train_data_path, lr=1, wordNgrams=2,
    #                                        verbose=2, minCount=1, epoch=25, loss="hs")
    classifier.save_model(self.model_path)
    self.classifier = classifier
    print("test result on training data:")
    result = classifier.test(self.train_data_path)
    print(result)
    print("test result on testing data:")
    result = classifier.test(self.test_data_path)
    print(result)
def fasttext_model_train():
    """
    fastText model training.
    :return:
    """
    # for i in range(20, 30):
    #     for w in range(1, 3):
    #         start_time = time.time()
    #         classifier = ff.train_supervised("fasttext.train", epoch=i, lr=0.5, wordNgrams=w)
    #         print("ngram=%d, epoch %d, elapsed %s" % (w, i, time.time() - start_time))
    #         classifier.save_model("Model/model_w" + str(w) + "_e" + str(i))
    start_time = time.time()
    classifier = ff.train_supervised("fasttext.train", epoch=epoch, lr=0.5,
                                     wordNgrams=wordNgrams)
    print("ngram=%d, trained %d epochs, elapsed %s" % (wordNgrams, epoch, time.time() - start_time))
    classifier.save_model("Model/model_w" + str(wordNgrams) + "_e" + str(epoch))
import tensorflow as tf

dir_path = os.getcwd()
cla_model = os.path.join('model', 'data_dim100_lr05_iter5.model')
cws_model_path = os.path.join(dir_path, 'model', 'cws.model')  # word-segmentation model path; model name is `cws.model`
pos_model_path = os.path.join(dir_path, 'model', 'pos.model')  # POS-tagging model path; model name is `pos.model`
ner_model_path = os.path.join(dir_path, 'model', 'ckpt')  # named-entity-recognition model path; model name is `ner.model`
Ner = evaluate_line(ner_model_path)  # load the NER model
classifier = fasttext.load_model(cla_model)  # load the text-classification model

from pyltp import Segmentor, Postagger


class PyltpTool:
    def __init__(self, sentence):
        self.sentence = sentence
        self.segmentor = Segmentor()  # initialize an instance
        self.segmentor.load(cws_model_path)  # load the model
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        self.seg_sentences = []
        self.pos_tags = []
        self.ner_sentence = []
from fasttext import FastText
from text_preprocess import TextProcess
import json

model_path = "model/fasttext.bin"
model = FastText.load_model(model_path)
with open("label_dict.json", 'r') as load_f:
    label_dict = json.load(load_f)


def predict(text):
    pre_label = model.predict(TextProcess().word_preprocess(text))
    # remove the '__label__' prefix (str.strip('__label__') would strip
    # characters, not the prefix)
    label_index = pre_label[0][0].strip().replace('__label__', '', 1)
    label_name = list(label_dict.keys())[list(label_dict.values()).index(
        int(label_index))]
    return label_name


if __name__ == '__main__':
    text = input("Please input sentence: ")
    print("result:", predict(text))
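# text_preprocess.TextProcess is imported above but not shown. A minimal
# sketch of a compatible implementation (the jieba-based tokenization is an
# assumption, not from the source):
import jieba

class TextProcess:
    def word_preprocess(self, text):
        # segment the sentence and space-join the tokens, the input format
        # that fastText's predict() expects
        return ' '.join(jieba.cut(text.strip()))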
# Augment x_train and x_test with n-gram features
x_train = add_ngram(x_train, token_indice, ngram_range)
x_test = add_ngram(x_test, token_indice, ngram_range)
max_len = max(len(x) for x in x_train)
print(max_len)

print('Pad sequences...')
x_train = sequence.pad_sequences(x_train, maxlen=max_len, value=0)
x_test = sequence.pad_sequences(x_test, maxlen=max_len, value=0)

print('Build model...')
model = FastText(max_len, embedding_dim, batch_size=batch_size, class_num=2,
                 max_features=max_features, epochs=epochs)

print('Train...')
model.fit(x_train, x_test, y_train, y_test)

print('Test...')
result = model.predict(x_test)
result = np.argmax(np.array(result), axis=1)
y_test = np.argmax(np.array(y_test), axis=1)
print('f1:', f1_score(y_test, result, average='macro'))
print('accuracy:', accuracy_score(y_test, result))
print('classification report:\n', classification_report(y_test, result))
print('confusion matrix:\n', confusion_matrix(y_test, result))
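# add_ngram is used above but not defined in this section. The sketch below
# follows the widely circulated Keras fastText example; whether the source
# used exactly this version is an assumption.
def add_ngram(sequences, token_indice, ngram_range=2):
    # For each sequence of token indices, append the index of every n-gram
    # (2..ngram_range) that appears in token_indice.
    new_sequences = []
    for input_list in sequences:
        new_list = list(input_list)
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(input_list) - ngram_value + 1):
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences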
if Path.cwd().name == 'app':
    base_path = Path('.')
else:
    base_path = Path('app')
files_path = base_path / 'files/'

gnaf_addresses = pd.read_csv(files_path / 'gnaf_addresses.csv', low_memory=False)
concat_address = pd.read_csv(files_path / 'address_clean.txt', header=None)[0]

mdl = FastText.train_unsupervised(input=str(files_path / 'address_clean.txt'),
                                  minCount=0, minn=0, maxn=3, dim=300,
                                  epoch=10, bucket=200000)
mdl.save_model(str(files_path / 'address_resolver.mdl'))

address_vecs = [
    mdl.get_sentence_vector(addr) for addr in tqdm(concat_address.values)
]
np.save(files_path / 'address_vecs.npy', address_vecs)

raw_address = '8mayfrd avehopevally'
raw_address = '55 curry st adelaid'  # note: overwrites the previous test address
raw_address = raw_address.upper()
raw_address_vec = mdl.get_sentence_vector(raw_address)
distances = pairwise_distances([raw_address_vec], address_vecs)
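# Follow-up sketch: resolve the raw address by picking the known address with
# the smallest pairwise distance (reuses the variables defined above).
best = int(np.argmin(distances[0]))
print('closest match:', concat_address.iloc[best])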
def train_embedding(fn):
    model = FastText.train_unsupervised(fn, model='skipgram', dim=300, maxn=0)
    model.save_model(
        'data/pretrained_embedding/fasttext_pretrained_embeddings_300.bin')
# NOTE: this fragment begins mid-function; the earlier lines of
# cal_precision_and_recall (the loop over the test file that fills the
# `precision`, `total`, and `recall` counters) are missing from the source.
            if label.strip() == pre_label.strip():
                precision[label.strip().strip('__label__')] += 1
    for sub in precision.keys():
        pre = precision[sub] / total[sub]
        rec = precision[sub] / recall[sub]
        F1 = (2 * pre * rec) / (pre + rec)
        label_name = list(label_dict.keys())[list(label_dict.values()).index(
            int(sub))]
        print("{}, precision:{}, recall:{}, F1:{}".format(
            str(label_name.strip('/')), str(pre), str(rec), str(F1)))


if __name__ == '__main__':
    model = FastText.train_supervised(input=train_data,
                                      label="__label__",
                                      epoch=15,
                                      dim=32,
                                      lr=0.5,
                                      loss='softmax',
                                      verbose=2,
                                      minCount=3,
                                      wordNgrams=2,
                                      bucket=1000000)
    model.save_model('model/fasttext.bin')
    score = model.test(test_data)
    cal_precision_and_recall(test_data)
def train(self, file_path):
    """
    Train the model.
    :return:
    """
    self.model = FastText.train_supervised(input=file_path, **CONFIG)
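# CONFIG is not shown in this section. A plausible example of the kind of
# hyperparameter dict it unpacks into train_supervised (the values are
# illustrative assumptions, not from the source):
CONFIG = {
    "lr": 0.5,
    "epoch": 10,
    "wordNgrams": 2,
    "dim": 100,
    "loss": "softmax",
}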
import const

"""
with open(os.path.join(const.DATAPATH, 'corpus.txt'), 'r') as f:
    lines = []
    for line in f:
        lines.append(' '.join(line.strip().split('_')) + '\n')
with open(os.path.join(const.DATAPATH, 'corp.txt'), 'w') as f:
    f.writelines(lines)
"""

# fasttext.train_unsupervised('data.txt', model='cbow')
model = fasttext.train_unsupervised(os.path.join(const.DATAPATH, 'corp.txt'),
                                    model='skipgram', dim=128, epoch=10, ws=5,
                                    minCount=1, loss='hs', wordNgrams=2)
model.save_model(os.path.join(const.MODELPATH, 'skipgram.bin'))

model = fasttext.train_unsupervised(os.path.join(const.DATAPATH, 'corp.txt'),
                                    model='cbow', dim=128, epoch=10, ws=5,
                                    minCount=1, loss='hs', wordNgrams=2)
model.save_model(os.path.join(const.MODELPATH, 'cbow.bin'))
# with open('../data/train_data.txt', 'r', encoding='utf-8') as file, \
#         open('train.txt', 'w', encoding='utf-8') as trainfile, \
#         open('text.txt', 'w', encoding='utf-8') as testfile:
#     for line in file.readlines():
#         label, content = line.strip().split('\t')[1], line.strip().split('\t')[0]
#         content = jieba.cut(content)
#         content = ' '.join(content)
#         label = "__label__" + label
#         if random.random() > 0.2:
#             trainfile.write(content.strip() + "\t" + label + '\n')
#         else:
#             testfile.write(content.strip() + "\t" + label + '\n')

# Train a supervised classifier on train.txt; passing thread=3 would train
# with 3 threads (defaults to a single thread if omitted).
classifier = ff.train_supervised('train.txt', label='__label__', epoch=10)

# Evaluate on the test set.
result = classifier.test('test.txt')
# Print precision and recall.
print(result[1], result[2])

# Predict the class of a text (a string of space-separated tokens); k=3
# returns the three most likely labels (defaults to one if omitted).
result = classifier.predict("我 明天 去 北京 出差 支持 客户", k=3)
print(result)

# with open("test.txt", "r", encoding="utf-8") as testfile:
#     for line in testfile.readlines():
#         content, label = line.strip().split("\t")
#         predit_label = classifier.predict(content)
#         print("predicted:", predit_label, "actual:", label)