예제 #1
0
def predict(sentence):
    """
    Predict the intent label(s) of a single sentence.

    :param sentence: raw input text; newline/tab control characters are
                     stripped before encoding
    :return: the names of every intent whose score exceeds 0.5, joined
             by "|" (empty string when nothing passes the threshold)
    """
    # Normalize control characters out of the input.
    text = sentence.replace("\n", "").replace("\r", "").replace("\t", "")

    # NOTE(review): constructing a BertVector on every call is expensive;
    # consider hoisting it to module level if this is called repeatedly.
    bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)

    # Encode the sentence into a single-row feature matrix.
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])

    # Multi-label prediction: keep every class scoring above 0.5.
    predicted = load_model.predict(x_train)[0]
    indices = [i for i, score in enumerate(predicted) if score > 0.5]

    with open(
            "/Users/xuzhang/Documents/STUDY/Github/IntentRec/utils/event_type.json",
            "r",
            encoding="utf-8") as g:
        movie_genres = json.load(g)

    return "|".join([movie_genres[index] for index in indices])
예제 #2
0
from keras_contrib.metrics import crf_accuracy, crf_viterbi_accuracy
from keras.models import load_model
from collections import defaultdict
from pprint import pprint

from utils import MAX_SEQ_LEN, event_type
from albert_zh.extract_feature import BertVector

# Load the label2id mapping saved at training time
with open("%s_label2id.json" % event_type, "r", encoding="utf-8") as h:
    label_id_dict = json.loads(h.read())

# Invert to id -> label for decoding model predictions back to tag names
id_label_dict = {v: k for k, v in label_id_dict.items()}

# Extract text features with ALBERT (per-token vectors, padded to MAX_SEQ_LEN)
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=MAX_SEQ_LEN)
f = lambda text: bert_model.encode([text])["encodes"][0]

# Load the trained NER model with its CRF layer.
# NOTE(review): CRF, crf_loss and json are referenced here but do not appear
# in the visible imports above — confirm they are imported elsewhere.
custom_objects = {
    'CRF': CRF,
    'crf_loss': crf_loss,
    'crf_viterbi_accuracy': crf_viterbi_accuracy
}
ner_model = load_model("%s_ner.h5" % event_type, custom_objects=custom_objects)


# 从预测的标签列表中获取实体
def get_entity(sent, tags_list):

    entity_dict = defaultdict(list)
예제 #3
0
    '联合国秘书长潘基文8日访问了日本福岛县,与当地灾民交流并访问了一所高中。',
    '国务院总理温家宝当地时间23日下午乘专机抵达布宜诺斯艾利斯,开始对阿根廷进行正式访问。',
    '正在中国访问的巴巴多斯总理斯图尔特15日在陕西西安参观访问。',
    '据外媒报道,当地时间10日,美国白宫发声明称,美国总统特朗普将于2月底访问印度,与印度总理莫迪进行战略对话。',
    '2月28日,唐山曹妃甸蓝色海洋科技有限公司董事长赵力军等一行5人到黄海水产研究所交流访问。黄海水产研究所副所长辛福言及相关部门负责人、专家等参加了会议。',
    '2018年7月2日,莫斯科孔子文化促进会会长姜彦彬,常务副会长陈国建,在中国著名留俄油画大师牟克教授的陪同下,访问了莫斯科国立苏里科夫美术学院,受到第一副校长伊戈尔·戈尔巴秋克先生接待。'
    '据外媒报道,当地时间26日晚,阿尔及利亚总统特本抵达沙特阿拉伯,进行为期三天的访问。两国领导人预计将就国家间合作和地区发展进行磋商。',
    '与标准Mozy一样,Stash文件夹为用户提供了对其备份文件的基于云的访问,但是它们还使他们可以随时,跨多个设备(包括所有计算机,智能手机和平板电脑)访问它们。换句话说,使用浏览器的任何人都可以同时查看文件(如果需要)。操作系统和设备品牌无关。',
    '研究表明,每个网页的平均预期寿命为44至100天。当用户通过浏览器访问已消失的网页时,就会看到「Page Not Found」的错误信息。对于这种情况,相信大多数人也只能不了了之。不过有责任心的组织——互联网档案馆为了提供更可靠的Web服务,它联手Brave浏览器专门针对此类网页提供了一键加载存档页面的功能。',
    '据外媒报道,土耳其总统府于当地时间2日表示,土耳其总统埃尔多安计划于5日对俄罗斯进行为期一天的访问。',
    '3日,根据三星电子的消息,李在镕副会长这天访问了位于韩国庆尚北道龟尾市的三星电子工厂。'
] * 10

labels = []

bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=100)

init_time = time.time()

# Classify every sentence in turn
for text in texts:

    # Encode the sentence as a single-row feature matrix
    encoding = bert_model.encode([text])["encodes"][0]
    x_train = np.array([encoding])

    # Argmax over the two classes: index 1 -> 'Y', index 0 -> 'N'
    scores = load_model.predict(x_train)
    labels.append('Y' if np.argmax(scores[0]) else 'N')
# @File : model_evaluate.py
# @Place : Yangpu, Shanghai
# 模型评估脚本,利用hamming_loss作为多标签分类的评估指标,该值越小模型效果越好
import json
import numpy as np
import pandas as pd
from keras.models import load_model
from sklearn.metrics import hamming_loss, classification_report
from att import Attention
from albert_zh.extract_feature import BertVector

# Load the trained multi-label classifier (with its custom attention layer),
# the event-type label list, and the ALBERT encoder used for features.
model = load_model("event_type.h5", custom_objects={"Attention": Attention})
with open("event_type.json", "r", encoding="utf-8") as f:
    event_type_list = json.load(f)
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)


# 对单句话进行预测
def predict_single_text(text):
    # 将句子转换成向量
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])

    # 模型预测
    predicted = model.predict(x_train)[0]
    indices = [i for i in range(len(predicted)) if predicted[i] > 0.5]
    one_hot = [0] * len(event_type_list)
    for index in indices:
        one_hot[index] = 1
예제 #5
0
# author: Jclian91
# place: Pudong Shanghai
# time: 2020/5/15 3:44 下午

import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.externals import joblib
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from load_data import train_df, test_df
from albert_zh.extract_feature import BertVector

# Encode every text as an ALBERT sentence vector (mean-pooled)
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=200)
print('begin encoding')
encode = lambda text: bert_model.encode([text])["encodes"][0]
train_df['x'] = train_df['text'].apply(encode)
test_df['x'] = test_df['text'].apply(encode)
print('end encoding')

# Stack the per-row vectors/labels into plain numpy arrays
x_train = np.array(train_df['x'].tolist())
x_test = np.array(test_df['x'].tolist())
y_train = np.array(train_df['label'].tolist())
y_test = np.array(test_df['label'].tolist())
print('x_train: ', x_train.shape)

# Logistic Regression
lr = LR(random_state=123)
lr.fit(x_train, y_train)
import json
import numpy as np
from keras.models import load_model

from att import Attention
from albert_zh.extract_feature import BertVector

# Load the trained multi-label event-type classifier.
# Bound to `model` so it does not shadow keras' load_model function.
model = load_model("event_type.h5", custom_objects={"Attention": Attention})

# Sentence to classify
text = "昨天18:30,陕西宁强县胡家坝镇向家沟村三组发生山体坍塌,5人被埋。当晚,3人被救出,其中1人在医院抢救无效死亡,2人在送医途中死亡。今天凌晨,另外2人被发现,已无生命迹象。"
text = text.replace("\n", "").replace("\r", "").replace("\t", "")

bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)

# Encode the sentence into a single-row feature matrix
vec = bert_model.encode([text])["encodes"][0]
x_train = np.array([vec])

# Multi-label prediction: keep every class scoring above 0.5
predicted = model.predict(x_train)[0]
indices = [i for i, score in enumerate(predicted) if score > 0.5]

with open("event_type.json", "r", encoding="utf-8") as g:
    movie_genres = json.load(g)

print("预测语句: %s" % text)
print("预测事件类型: %s" % "|".join([movie_genres[index] for index in indices]))
예제 #7
0
def _line_labels(line):
    # One multi-hot label row: genres are the '|'-separated prefix before the tab.
    genres = line.split('\t', maxsplit=1)[0].split('|')
    return mlb.transform([genres])[0]


# Build the multi-hot y matrices for train and test sets.
y_train = np.array([_line_labels(line) for line in train_content])
y_test = np.array([_line_labels(line) for line in test_content])

# Encode the x values (movie intros) with ALBERT

bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)
print('begin encoding')
f = lambda text: bert_model.encode([text])['encodes'][0]

x_train = []
x_test = []
process_bar = tqdm(train_content)

# The intro is everything after the first tab; the progress bar advances in
# lockstep with the content via zip.
for _, line in zip(process_bar, train_content):
    x_train.append(f(line.split('\t', maxsplit=1)[1]))

process_bar = tqdm(test_content)

for ch, line in zip(process_bar, test_content):
    movie_intro = line.split('\t', maxsplit=1)[1]
from albert_zh.extract_feature import BertVector
from tqdm import tqdm

MAX_SEQ_LEN = 200
import numpy as np

# Shared ALBERT encoder; pooling_strategy="NONE" keeps per-token vectors
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=MAX_SEQ_LEN)


def cos_sim(vector_a, vector_b):
    """
    计算两个向量之间的余弦相似度
    :param vector_a: 向量 a
    :param vector_b: 向量 b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim


def sim_text(s1, s2):
    """Return the similarity of two sentences via mean-pooled ALBERT vectors."""
    # NOTE(review): mean(axis=1) pools over the hidden dimension of the
    # per-token encoding — confirm axis=0 (the token axis) wasn't intended.
    vec1, vec2 = (bert_model.encode([s])["encodes"][0].mean(axis=1)
                  for s in (s1, s2))
    return cos_sim(vec1, vec2)

예제 #9
0
import tornado.ioloop
import tornado.options
import tornado.web
from tornado.options import define, options

import json
import numpy as np
from albert_zh.extract_feature import BertVector
from keras.models import load_model


# 定义端口为10008
define("port", default=10008, help="run on the given port", type=int)

# 加载ALBERT
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=100)
# 加载已经训练好的模型
load_model = load_model("visit_classify.h5")


# 对句子进行预测
class PredictHandler(tornado.web.RequestHandler):

    def post(self):

        text = self.get_argument("text")

        # 将句子转换成向量
        vec = bert_model.encode([text])["encodes"][0]
        x_train = np.array([vec])