Example #1
import codecs
import json

import numpy as np
from bert4keras.utils import Tokenizer  # same Tokenizer as in the examples below


class Processor(object):
    def __init__(self, train_path, token_dict):
        self.train_path = train_path
        self.tokenizer = Tokenizer(token_dict)

    def get_tags(self):
        # collect the tag set from the training data and build tag<->id mappings
        tags = set()
        train_data = self.get_data(self.train_path)
        for item in train_data:
            for tag in item[1].split(" "):
                tags.add(tag)
        # use "X" as the PAD tag
        self.tag2id = {tag: idx for idx, tag in enumerate(tags)}
        self.tag2id["X"] = len(self.tag2id)
        self.id2tag = {idx: tag for tag, idx in self.tag2id.items()}
        return self.tag2id, self.id2tag

    def get_data(self, path):
        # each entry is expected to be a [text, tag_string] pair
        with codecs.open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data

    def get_bert_inputs(self, path, max_len):
        # convert (text, tag_string) pairs into token ids, segment ids and tag ids
        srcs = self.get_data(path)
        src_data, src_tags = [], []
        for item in srcs:
            src_data.append(item[0])
            src_tags.append(item[1])
        tokens, segs, tags = [], [], []
        for item in src_data:
            res = self.tokenizer.encode(item, first_length=max_len)
            tokens.append(np.array(res[0]))
            segs.append(np.array(res[1]))
        # reserve two positions for [CLS] and [SEP]
        max_len -= 2
        for item in src_tags:
            len_item = len(item.split(" "))
            if len_item >= max_len:
                tags.append(["X"] + item.split(" ")[:max_len] + ["X"])
            else:
                tags.append(["X"] + item.split(" ") + ["X"] *
                            (max_len - len_item + 1))
        tags = [[self.tag2id[item] for item in term[1:]] for term in tags]
        tags = np.expand_dims(tags, axis=-1)
        return tokens, segs, tags
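
A minimal usage sketch for the class above, assuming the training JSON holds [text, tag_string] pairs as get_data expects; "train.json" and "vocab.txt" are placeholder paths, and load_vocab is the helper imported in the MLM example below.

from bert4keras.utils import load_vocab

token_dict = load_vocab("vocab.txt")             # placeholder vocab path
processor = Processor("train.json", token_dict)  # placeholder training file
tag2id, id2tag = processor.get_tags()            # must run before get_bert_inputs
tokens, segs, tags = processor.get_bert_inputs("train.json", max_len=128)
print(len(tag2id), tokens[0].shape, tags.shape)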
Example #2

# Test that the code works: feature extraction

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer
from keras.models import load_model
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model, load the pretrained weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
输出:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636
    0.39056838]
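
The prediction above yields one 768-dimensional vector per input token ([CLS], 语, 言, 模, 型, [SEP]). A small follow-up using only the objects already defined:

outputs = model.predict([np.array([token_ids]), np.array([segment_ids])])
print(outputs.shape)        # (1, 6, 768): batch, tokens incl. [CLS]/[SEP], hidden size
cls_vector = outputs[0, 0]  # vector at the [CLS] position, a crude sentence representation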
Example #3

#! -*- coding: utf-8 -*-
# Test that the code works: MLM

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer, load_vocab
import numpy as np

config_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../../kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = Tokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path,
                              with_mlm=True)  # build the model with the MLM head, load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out the two characters of "技术" (positions 3 and 4 after [CLS])
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
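
To see more than the single best guess, the predicted distributions at the two masked positions can be ranked directly; this sketch uses only token_dict and probas from above (the top-5 cutoff is arbitrary).

id2token = {v: k for k, v in token_dict.items()}  # invert the vocab: id -> token
for pos in (3, 4):                                # the two masked positions
    top5 = np.argsort(probas[pos])[::-1][:5]      # highest-probability token ids
    print(pos, [id2token[int(i)] for i in top5])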
Example #4
# config_path and checkpoint_path (the ALBERT config and checkpoint) and the
# similarity_count helper are defined elsewhere in the original project.
import numpy as np
from keras.models import Model
from bert4keras.bert import build_bert_model
from bert4keras.utils import Tokenizer  # module path differs across bert4keras versions

dict_path = 'albert_tiny_zh_google/vocab.txt'

tokenizer = Tokenizer(dict_path)

# load the pretrained ALBERT model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)

model = Model(bert.model.input, bert.model.output)

token_ids1, segment_ids1 = tokenizer.encode(u'我想去北京')
token_ids2, segment_ids2 = tokenizer.encode(u'我想去香港')
token_ids3, segment_ids3 = tokenizer.encode(u'目前的局势,止暴制乱,刻不容缓')

sentence_vec1 = model.predict(
    [np.array([token_ids1]), np.array([segment_ids1])])[0]
sentence_vec2 = model.predict(
    [np.array([token_ids2]), np.array([segment_ids2])])[0]
sentence_vec3 = model.predict(
    [np.array([token_ids3]), np.array([segment_ids3])])[0]

print("《我想去北京》和《我想去香港》的余弦距离为%f" %
      similarity_count(sentence_vec1, sentence_vec2))
print("《我想去北京》和《我想去香港》的欧式距离为%f" %
      similarity_count(sentence_vec1, sentence_vec2, mode='eu'))
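
similarity_count is not defined in these snippets; the following is a hypothetical stand-in that matches how it is called above (one pair of vectors, optional mode='eu'), returning cosine similarity by default and Euclidean distance otherwise.

def similarity_count(vec1, vec2, mode='cos'):
    # hypothetical helper: cosine similarity by default, Euclidean distance for mode='eu'
    vec1, vec2 = np.asarray(vec1), np.asarray(vec2)
    if mode == 'eu':
        return float(np.linalg.norm(vec1 - vec2))
    return float(np.dot(vec1, vec2) /
                 (np.linalg.norm(vec1) * np.linalg.norm(vec2)))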
Example #5
# same setup as Example #4: the imports, config_path, checkpoint_path and the
# similarity_count helper are as described there.
dict_path = 'albert_tiny_zh_google/vocab.txt'

tokenizer = Tokenizer(dict_path)

# load the pretrained ALBERT model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)

model = Model(bert.model.input, bert.model.output)

token_ids1, segment_ids1 = tokenizer.encode(u'我想去北京逛一逛天安门')
token_ids2, segment_ids2 = tokenizer.encode(u'我想去香港')
token_ids3, segment_ids3 = tokenizer.encode(u'我想到天安门广场走一走')

sentence_vec1 = model.predict(
    [np.array([token_ids1]), np.array([segment_ids1])])[0]
sentence_vec2 = model.predict(
    [np.array([token_ids2]), np.array([segment_ids2])])[0]
sentence_vec3 = model.predict(
    [np.array([token_ids3]), np.array([segment_ids3])])[0]

print("《我想去北京》和《我想去香港》的余弦距离为%f" %
      similarity_count(sentence_vec1, sentence_vec2))
print("《我想去北京》和《我想去香港》的欧式距离为%f" %
      similarity_count(sentence_vec1, sentence_vec2, mode='eu'))
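
sentence_vec3 is computed but never compared in the original snippet; with the hypothetical similarity_count from Example #4, a natural follow-up is to contrast the two Tiananmen-related sentences with the unrelated one.

# continuing from the example above
print("vec1 vs vec3 (both mention 天安门):", similarity_count(sentence_vec1, sentence_vec3))
print("vec1 vs vec2 (different topics): ", similarity_count(sentence_vec1, sentence_vec2))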