def predict(sentence):
    """Predict intent labels for *sentence*; return them '|'-joined.

    Multi-label classification: every class whose score exceeds 0.5 is kept.
    """
    # Strip line breaks and tabs before encoding.
    cleaned = sentence.replace("\n", "").replace("\r", "").replace("\t", "")

    # Encode the sentence with ALBERT (token-level vectors, max length 200).
    # NOTE(review): constructing BertVector per call looks expensive — confirm
    # whether it should be hoisted to module scope.
    bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)
    vec = bert_model.encode([cleaned])["encodes"][0]
    features = np.array([vec])

    # Model prediction: indices of all classes scoring above the 0.5 threshold.
    scores = load_model.predict(features)[0]
    hit_indices = [idx for idx, score in enumerate(scores) if score > 0.5]

    # Class-index -> label-name mapping.
    with open(
            "/Users/xuzhang/Documents/STUDY/Github/IntentRec/utils/event_type.json",
            "r", encoding="utf-8") as g:
        movie_genres = json.loads(g.read())

    return "|".join([movie_genres[idx] for idx in hit_indices])
from keras.models import load_model
from collections import defaultdict
from pprint import pprint

from utils import MAX_SEQ_LEN, event_type
from albert_zh.extract_feature import BertVector

# Load the label<->id mappings produced at training time.
with open("%s_label2id.json" % event_type, "r", encoding="utf-8") as h:
    label_id_dict = json.loads(h.read())
id_label_dict = {v: k for k, v in label_id_dict.items()}

# ALBERT text-feature extractor: token-level vectors, padded to MAX_SEQ_LEN.
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=MAX_SEQ_LEN)
f = lambda text: bert_model.encode([text])["encodes"][0]

# Restore the trained NER model. The CRF layer plus its loss and metric must
# be registered as custom objects for Keras to deserialize the .h5 file.
custom_objects = {
    'CRF': CRF,
    'crf_loss': crf_loss,
    'crf_viterbi_accuracy': crf_viterbi_accuracy
}
ner_model = load_model("%s_ner.h5" % event_type, custom_objects=custom_objects)


# Extract entities from a sentence given its predicted tag sequence.
# NOTE(review): this definition is truncated at the chunk boundary.
def get_entity(sent, tags_list):
    entity_dict = defaultdict(list)
    i = 0
'研究表明,每个网页的平均预期寿命为44至100天。当用户通过浏览器访问已消失的网页时,就会看到「Page Not Found」的错误信息。对于这种情况,相信大多数人也只能不了了之。不过有责任心的组织——互联网档案馆为了提供更可靠的Web服务,它联手Brave浏览器专门针对此类网页提供了一键加载存档页面的功能。', '据外媒报道,土耳其总统府于当地时间2日表示,土耳其总统埃尔多安计划于5日对俄罗斯进行为期一天的访问。', '3日,根据三星电子的消息,李在镕副会长这天访问了位于韩国庆尚北道龟尾市的三星电子工厂。' ] * 10 labels = [] bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=100) init_time = time.time() # 对上述句子进行预测 for text in texts: # 将句子转换成向量 vec = bert_model.encode([text])["encodes"][0] x_train = np.array([vec]) # 模型预测 predicted = load_model.predict(x_train) y = np.argmax(predicted[0]) label = 'Y' if y else 'N' labels.append(label) cost_time = time.time() - init_time print("Average cost time: %s." % (cost_time / len(texts))) for text, label in zip(texts, labels): print('%s\t%s' % (label, text)) df = pd.DataFrame({'句子': texts, "是否属于出访类事件": labels})
# Build the multi-hot y matrices: the first tab-separated field of each line
# holds the '|'-joined genre labels, binarized row-by-row with the fitted mlb.
for line in train_content:
    genres = line.split('\t', maxsplit=1)[0].split('|')
    y_train.append(mlb.transform([genres])[0])
for line in test_content:
    genres = line.split('\t', maxsplit=1)[0].split('|')
    y_test.append(mlb.transform([genres])[0])

y_train = np.array(y_train)
y_test = np.array(y_test)

# Encode the x values (the text after the first tab) with ALBERT.
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)
print('begin encoding')
f = lambda text: bert_model.encode([text])['encodes'][0]

x_train = []
x_test = []

# Fix: iterate tqdm directly. The original zipped a tqdm wrapper with the
# same underlying list, which left an unused `ch` variable that duplicated
# `line`; iterating the wrapper yields identical items and the same progress bar.
for line in tqdm(train_content):
    movie_intro = line.split('\t', maxsplit=1)[1]
    x_train.append(f(movie_intro))
for line in tqdm(test_content):
    movie_intro = line.split('\t', maxsplit=1)[1]
    x_test.append(f(movie_intro))