import json

import numpy as np


def predict(sentence):
    # sentence to predict; `load_model` is the trained Keras model loaded
    # at module level (not shown in this excerpt)
    text = sentence
    text = text.replace("\n", "").replace("\r", "").replace("\t", "")
    # convert the sentence into a vector with ALBERT
    bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])
    # model prediction: keep every label whose score exceeds 0.5
    predicted = load_model.predict(x_train)[0]
    indices = [i for i in range(len(predicted)) if predicted[i] > 0.5]
    with open("/Users/xuzhang/Documents/STUDY/Github/IntentRec/utils/event_type.json",
              "r", encoding="utf-8") as g:
        movie_genres = json.loads(g.read())
    # print("sentence to predict: %s" % text)
    # print("intent analysis: %s" % "|".join([movie_genres[index] for index in indices]))
    return "|".join([movie_genres[index] for index in indices])
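# A minimal usage sketch; the input sentence is hypothetical, and the model
# is assumed to have been loaded into `load_model` at module level as in the
# other snippets of this project.
if __name__ == "__main__":
    print(predict("请帮我推荐几部好看的科幻电影。"))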
import json
from collections import defaultdict
from pprint import pprint

from keras.models import load_model
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy, crf_viterbi_accuracy

from utils import MAX_SEQ_LEN, event_type
from albert_zh.extract_feature import BertVector

# load the label2id dictionary
with open("%s_label2id.json" % event_type, "r", encoding="utf-8") as h:
    label_id_dict = json.loads(h.read())

id_label_dict = {v: k for k, v in label_id_dict.items()}

# extract text features with ALBERT
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=MAX_SEQ_LEN)
f = lambda text: bert_model.encode([text])["encodes"][0]

# load the trained NER model; the CRF layer and its loss/metric must be
# passed in as custom objects
custom_objects = {
    'CRF': CRF,
    'crf_loss': crf_loss,
    'crf_viterbi_accuracy': crf_viterbi_accuracy
}
ner_model = load_model("%s_ner.h5" % event_type, custom_objects=custom_objects)


# extract entities from the predicted tag list
def get_entity(sent, tags_list):
    entity_dict = defaultdict(list)
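# The body of get_entity() is truncated above. Below is a hedged sketch of
# the standard BIO-tag decoding such a function typically performs; this is
# an assumption, not the author's original code.
def get_entity_sketch(sent, tags_list):
    entity_dict = defaultdict(list)
    entity, entity_type = "", None
    for char, tag in zip(sent, tags_list):
        if tag.startswith("B-"):
            # a new entity starts; flush any entity in progress
            if entity:
                entity_dict[entity_type].append(entity)
            entity, entity_type = char, tag[2:]
        elif tag.startswith("I-") and entity_type == tag[2:]:
            # continuation of the current entity
            entity += char
        else:
            # O tag (or an inconsistent I- tag): flush the current entity
            if entity:
                entity_dict[entity_type].append(entity)
            entity, entity_type = "", None
    if entity:
        entity_dict[entity_type].append(entity)
    return entity_dict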
    '联合国秘书长潘基文8日访问了日本福岛县,与当地灾民交流并访问了一所高中。',
    '国务院总理温家宝当地时间23日下午乘专机抵达布宜诺斯艾利斯,开始对阿根廷进行正式访问。',
    '正在中国访问的巴巴多斯总理斯图尔特15日在陕西西安参观访问。',
    '据外媒报道,当地时间10日,美国白宫发声明称,美国总统特朗普将于2月底访问印度,与印度总理莫迪进行战略对话。',
    '2月28日,唐山曹妃甸蓝色海洋科技有限公司董事长赵力军等一行5人到黄海水产研究所交流访问。黄海水产研究所副所长辛福言及相关部门负责人、专家等参加了会议。',
    '2018年7月2日,莫斯科孔子文化促进会会长姜彦彬,常务副会长陈国建,在中国著名留俄油画大师牟克教授的陪同下,访问了莫斯科国立苏里科夫美术学院,受到第一副校长伊戈尔·戈尔巴秋克先生接待。',
    '据外媒报道,当地时间26日晚,阿尔及利亚总统特本抵达沙特阿拉伯,进行为期三天的访问。两国领导人预计将就国家间合作和地区发展进行磋商。',
    '与标准Mozy一样,Stash文件夹为用户提供了对其备份文件的基于云的访问,但是它们还使他们可以随时,跨多个设备(包括所有计算机,智能手机和平板电脑)访问它们。换句话说,使用浏览器的任何人都可以同时查看文件(如果需要)。操作系统和设备品牌无关。',
    '研究表明,每个网页的平均预期寿命为44至100天。当用户通过浏览器访问已消失的网页时,就会看到「Page Not Found」的错误信息。对于这种情况,相信大多数人也只能不了了之。不过有责任心的组织——互联网档案馆为了提供更可靠的Web服务,它联手Brave浏览器专门针对此类网页提供了一键加载存档页面的功能。',
    '据外媒报道,土耳其总统府于当地时间2日表示,土耳其总统埃尔多安计划于5日对俄罗斯进行为期一天的访问。',
    '3日,根据三星电子的消息,李在镕副会长这天访问了位于韩国庆尚北道龟尾市的三星电子工厂。'
] * 10

labels = []
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=100)
init_time = time.time()

# run prediction on each of the sentences above
for text in texts:
    # convert the sentence into a vector
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])
    # model prediction (binary: 'Y' / 'N')
    predicted = load_model.predict(x_train)
    y = np.argmax(predicted[0])
    label = 'Y' if y else 'N'
    labels.append(label)
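# A sketch of the timing summary this benchmark presumably ends with; the
# print format is illustrative, everything else follows the snippet above.
cost_time = time.time() - init_time
print("Predicted %d sentences in %.2f s, %.4f s per sentence on average."
      % (len(texts), cost_time, cost_time / len(texts)))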
# @File  : model_evaluate.py
# @Place : Yangpu, Shanghai
# Model evaluation script: hamming_loss is used as the multi-label
# classification metric; the smaller the value, the better the model.
import json

import numpy as np
import pandas as pd
from keras.models import load_model
from sklearn.metrics import hamming_loss, classification_report

from att import Attention
from albert_zh.extract_feature import BertVector

# load the trained model
model = load_model("event_type.h5", custom_objects={"Attention": Attention})

with open("event_type.json", "r", encoding="utf-8") as f:
    event_type_list = json.loads(f.read())

bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)


# predict on a single sentence
def predict_single_text(text):
    # convert the sentence into a vector
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])
    # model prediction: keep every label whose score exceeds 0.5
    predicted = model.predict(x_train)[0]
    indices = [i for i in range(len(predicted)) if predicted[i] > 0.5]
    one_hot = [0] * len(event_type_list)
    for index in indices:
        one_hot[index] = 1
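# To make the metric concrete: hamming_loss is the fraction of label slots
# predicted wrongly across all samples. A tiny hypothetical example:
y_true = np.array([[1, 0, 0], [0, 1, 1]])   # hypothetical ground truth
y_pred = np.array([[1, 0, 0], [0, 1, 0]])   # hypothetical predictions
print(hamming_loss(y_true, y_pred))          # 1 wrong slot out of 6 -> 0.1667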
# author: Jclian91
# place: Pudong Shanghai
# time: 2020/5/15 3:44 PM
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23+
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from load_data import train_df, test_df
from albert_zh.extract_feature import BertVector

# read the data and encode the texts with ALBERT
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=200)
print('begin encoding')
f = lambda text: bert_model.encode([text])["encodes"][0]
train_df['x'] = train_df['text'].apply(f)
test_df['x'] = test_df['text'].apply(f)
print('end encoding')

x_train = np.array([vec for vec in train_df['x']])
x_test = np.array([vec for vec in test_df['x']])
y_train = np.array([label for label in train_df['label']])
y_test = np.array([label for label in test_df['label']])
print('x_train: ', x_train.shape)

# Logistic Regression
lr = LR(random_state=123)
lr.fit(x_train, y_train)
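# A hedged evaluation sketch for the fitted model, using only the metrics
# imported above; the printed numbers of course depend on the data.
y_pred = lr.predict(x_test)
print("accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))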
import json

import numpy as np
from keras.models import load_model

from att import Attention
from albert_zh.extract_feature import BertVector

# load the trained model (the name `load_model` now refers to the model object)
load_model = load_model("event_type.h5", custom_objects={"Attention": Attention})

# sentence to predict
text = "昨天18:30,陕西宁强县胡家坝镇向家沟村三组发生山体坍塌,5人被埋。当晚,3人被救出,其中1人在医院抢救无效死亡,2人在送医途中死亡。今天凌晨,另外2人被发现,已无生命迹象。"
text = text.replace("\n", "").replace("\r", "").replace("\t", "")

# convert the sentence into a vector with ALBERT
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)
vec = bert_model.encode([text])["encodes"][0]
x_train = np.array([vec])

# model prediction: keep every label whose score exceeds 0.5
predicted = load_model.predict(x_train)[0]
indices = [i for i in range(len(predicted)) if predicted[i] > 0.5]

with open("event_type.json", "r", encoding="utf-8") as g:
    movie_genres = json.loads(g.read())

print("预测语句: %s" % text)
print("预测事件类型: %s" % "|".join([movie_genres[index] for index in indices]))
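# Illustration of the 0.5 threshold above: with hypothetical sigmoid scores,
# every index whose score exceeds 0.5 becomes a predicted event type.
demo_scores = [0.91, 0.07, 0.64]  # hypothetical model outputs
print([i for i, s in enumerate(demo_scores) if s > 0.5])  # -> [0, 2]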
y_train = []
y_test = []

for line in train_content:
    genres = line.split('\t', maxsplit=1)[0].split('|')
    y_train.append(mlb.transform([genres])[0])

for line in test_content:
    genres = line.split('\t', maxsplit=1)[0].split('|')
    y_test.append(mlb.transform([genres])[0])

y_train = np.array(y_train)
y_test = np.array(y_test)

# encode the x values (movie synopses) with ALBERT
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)
print('begin encoding')
f = lambda text: bert_model.encode([text])['encodes'][0]

x_train = []
x_test = []

process_bar = tqdm(train_content)
for line in process_bar:
    movie_intro = line.split('\t', maxsplit=1)[1]
    x_train.append(f(movie_intro))

process_bar = tqdm(test_content)
for line in process_bar:
    movie_intro = line.split('\t', maxsplit=1)[1]
    x_test.append(f(movie_intro))
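# Presumable next step (the original snippet is truncated here): stack the
# encoded vectors into numpy arrays for training. An assumption, mirroring
# the y_train / y_test conversion above.
x_train = np.array(x_train)
x_test = np.array(x_test)
print('x_train: ', x_train.shape)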
import numpy as np
from tqdm import tqdm

from albert_zh.extract_feature import BertVector

MAX_SEQ_LEN = 200
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=MAX_SEQ_LEN)


def cos_sim(vector_a, vector_b):
    """
    Compute the cosine similarity between two vectors.
    :param vector_a: vector a
    :param vector_b: vector b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    # rescale from [-1, 1] to [0, 1]
    sim = 0.5 + 0.5 * cos
    return sim


def sim_text(s1, s2):
    # average the token embeddings over the sequence axis (axis=0) to obtain
    # one fixed-size sentence vector per input
    sent_vec1 = bert_model.encode([s1])["encodes"][0].mean(axis=0)
    sent_vec2 = bert_model.encode([s2])["encodes"][0].mean(axis=0)
    return cos_sim(sent_vec1, sent_vec2)
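# Hypothetical usage; the two sentences are made up for illustration.
if __name__ == "__main__":
    print(sim_text("我今天去北京出差。", "我明天要去北京开会。"))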
import json

import numpy as np
import tornado.ioloop
import tornado.options
import tornado.web
from tornado.options import define, options
from keras.models import load_model

from albert_zh.extract_feature import BertVector

# define the service port as 10008
define("port", default=10008, help="run on the given port", type=int)

# load ALBERT
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=100)

# load the trained model (the name `load_model` now refers to the model object)
load_model = load_model("visit_classify.h5")


# handle prediction requests for a sentence
class PredictHandler(tornado.web.RequestHandler):
    def post(self):
        text = self.get_argument("text")
        # convert the sentence into a vector
        vec = bert_model.encode([text])["encodes"][0]
        x_train = np.array([vec])
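        # Assumed continuation (the original snippet is truncated here): a
        # binary prediction and a JSON response, mirroring the batch
        # prediction snippet earlier; the response format is a guess.
        predicted = load_model.predict(x_train)
        y = np.argmax(predicted[0])
        self.write(json.dumps({"text": text, "label": "Y" if y else "N"},
                              ensure_ascii=False))


# Hypothetical application wiring; the /predict route name is an assumption.
if __name__ == "__main__":
    tornado.options.parse_command_line()
    app = tornado.web.Application(handlers=[(r"/predict", PredictHandler)])
    app.listen(options.port)
    tornado.ioloop.IOLoop.current().start()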