예제 #1
0
def predict(
        sentence,
        threshold=0.5,
        genre_path="/Users/xuzhang/Documents/STUDY/Github/IntentRec/utils/event_type.json"):
    """Predict the intent label(s) of a sentence with a multi-label model.

    Args:
        sentence: raw input text; newlines, carriage returns and tabs are
            stripped before encoding.
        threshold: probability cutoff above which a class is selected
            (defaults to the original hard-coded 0.5).
        genre_path: JSON file mapping class index -> label name (defaults
            to the original hard-coded path; override on other machines).

    Returns:
        The selected label names joined with '|'; an empty string when no
        class score exceeds ``threshold``.
    """
    # Flatten the text: the encoder expects a single line.
    text = sentence.replace("\n", "").replace("\r", "").replace("\t", "")

    # NOTE(review): constructing BertVector on every call is expensive —
    # the sibling scripts build it once at module level; consider hoisting.
    bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)

    # Encode the sentence and wrap it as a single-row batch.
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])

    # Multi-label prediction: keep every class whose score passes the cutoff.
    predicted = load_model.predict(x_train)[0]
    indices = [i for i, score in enumerate(predicted) if score > threshold]

    # Map class indices back to human-readable label names.
    with open(genre_path, "r", encoding="utf-8") as g:
        movie_genres = json.loads(g.read())

    return "|".join([movie_genres[index] for index in indices])
예제 #2
0
from keras.models import load_model
from collections import defaultdict
from pprint import pprint

from utils import MAX_SEQ_LEN, event_type
from albert_zh.extract_feature import BertVector

# Load the label -> id mapping produced at training time.
with open("%s_label2id.json" % event_type, "r", encoding="utf-8") as h:
    label_id_dict = json.load(h)

# Invert it so predicted ids can be mapped back to label names.
id_label_dict = {idx: label for label, idx in label_id_dict.items()}

# ALBERT feature extractor: f(text) -> per-token embedding matrix.
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=MAX_SEQ_LEN)
f = lambda text: bert_model.encode([text])["encodes"][0]

# Custom layers/metrics the saved model needs for deserialization.
custom_objects = {
    'crf_viterbi_accuracy': crf_viterbi_accuracy,
    'crf_loss': crf_loss,
    'CRF': CRF,
}
ner_model = load_model("%s_ner.h5" % event_type, custom_objects=custom_objects)


# 从预测的标签列表中获取实体
def get_entity(sent, tags_list):

    entity_dict = defaultdict(list)
    i = 0
예제 #3
0
    '研究表明,每个网页的平均预期寿命为44至100天。当用户通过浏览器访问已消失的网页时,就会看到「Page Not Found」的错误信息。对于这种情况,相信大多数人也只能不了了之。不过有责任心的组织——互联网档案馆为了提供更可靠的Web服务,它联手Brave浏览器专门针对此类网页提供了一键加载存档页面的功能。',
    '据外媒报道,土耳其总统府于当地时间2日表示,土耳其总统埃尔多安计划于5日对俄罗斯进行为期一天的访问。',
    '3日,根据三星电子的消息,李在镕副会长这天访问了位于韩国庆尚北道龟尾市的三星电子工厂。'
] * 10

# Collected Y/N predictions, one entry per sentence in `texts`.
labels = []

# Sentence-level encoder: mean-pooled ALBERT vectors, capped at 100 tokens.
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=100)

init_time = time.time()

# Classify every sentence in turn.
for sentence in texts:
    # Encode the sentence and wrap it as a single-row batch.
    features = np.array([bert_model.encode([sentence])["encodes"][0]])

    # Binary decision: argmax index 1 -> positive ('Y'), 0 -> negative ('N').
    scores = load_model.predict(features)
    labels.append('Y' if np.argmax(scores[0]) else 'N')

elapsed = time.time() - init_time
print("Average cost time: %s." % (elapsed / len(texts)))

for text, label in zip(texts, labels):
    print('%s\t%s' % (label, text))

# Tabulate sentence vs. predicted flag.
df = pd.DataFrame({'句子': texts, "是否属于出访类事件": labels})
예제 #4
0
# Build multi-hot target vectors from the genre field: each line is
# "<genre1>|<genre2>...\t<text>", so split on the first tab and then on '|'.
for line in train_content:
    genres = line.split('\t', maxsplit=1)[0].split('|')
    y_train.append(mlb.transform([genres])[0])

for line in test_content:
    genres = line.split('\t', maxsplit=1)[0].split('|')
    y_test.append(mlb.transform([genres])[0])

y_train = np.array(y_train)
y_test = np.array(y_test)

# Encode the x values (the text after the first tab) with ALBERT.

bert_model = BertVector(pooling_strategy="NONE", max_seq_len=200)
print('begin encoding')
f = lambda text: bert_model.encode([text])['encodes'][0]

x_train = []
x_test = []

# Iterate the tqdm wrapper directly: the original zip(tqdm(xs), xs)
# paired each line with itself and left the duplicate unused.
process_bar = tqdm(train_content)
for line in process_bar:
    movie_intro = line.split('\t', maxsplit=1)[1]
    x_train.append(f(movie_intro))

process_bar = tqdm(test_content)
for line in process_bar:
    movie_intro = line.split('\t', maxsplit=1)[1]
    x_test.append(f(movie_intro))