Example #1
    def test_vector(self):
        text = '语言模型'
        tokens1, segs1 = self.sjl_tokenizer.encode(text)
        tokens2, segs2 = self.my_tokenizer.transform(text)
        self.assertEqual(tokens1, tokens2)
        self.assertEqual(segs1, segs2)
        tokens1, segs1 = to_array([tokens1], [segs1])
        tokens2, segs2 = to_array([tokens2], [segs2])

        from bert4keras.models import build_transformer_model
        print(build_transformer_model.__module__)
        model = build_transformer_model(config_path, checkpoint_path)
        res1 = model.predict([tokens1, segs1])
        del model
        gc.collect()

        from garnet.models.build import build_transformer_model
        print(build_transformer_model.__module__)
        model = build_transformer_model(config_path, checkpoint_path)
        res2 = model.predict([tokens2, segs2])
        del model
        gc.collect()

        shape = res1.shape

        # compare aggregate sums with a tolerance; exact float equality is too strict here
        self.assertAlmostEqual(np.sum(res1), np.sum(res2), places=2)

        for k in range(shape[0]):
            for i in range(shape[1]):
                for j in range(shape[2]):
                    self.assertAlmostEqual(res1[k, i, j], res2[k, i, j])
Example #2
def get_similarity_bert(strx, stry, bm, tokenizer):
    token_ids, segment_ids = tokenizer.encode(strx)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    a = bm.predict([token_ids, segment_ids])
    token_ids, segment_ids = tokenizer.encode(stry)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    b = bm.predict([token_ids, segment_ids])
    return cos_sim(a[0][0], b[0][0])
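Note: cos_sim is not defined in this snippet. A minimal sketch of such a helper, assuming both arguments are 1-D NumPy vectors (e.g. the [CLS] vectors taken above), could be:

import numpy as np

def cos_sim(a, b):
    # cosine similarity between two 1-D vectors; the epsilon guards against zero norms
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))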
Example #3
 def generate(self, context):
     token_ids, segment_ids = tokenizer.encode(context)
     context_len = len(token_ids)
     segment_id = segment_ids[-1] + 1
     sentence = ''
     words = []
     gen_tokens = []
     for i in range(self.maxlen):
         token_ids.append(tokenizer._token_dict['[MASK]'])
         segment_ids.append(segment_id)
         tokens, segments = to_array([token_ids], [segment_ids])
         probas = biden_model.predict([tokens, segments])[0]
         # token = probas[context_len + i].argmax()
         ids = np.argsort(probas[context_len + i])[::-1]
         for token in ids:
             if token not in gen_tokens:
                 gen_tokens.append(token)
                 break
         words.append(tokenizer.decode([token]))
         token_ids[context_len + i] = token
         if token in self.end_id:
             sentence = ' '.join(words)
             return sentence
     sentence = ' '.join(words)
     sentence += '.'
     return sentence
Example #4
 def recognize(self, text):
     tokens = tokenizer.tokenize(text)
     # while len(tokens) > 512:
     #     tokens.pop(-2)
     mapping = tokenizer.rematch(text, tokens)
     token_ids = tokenizer.tokens_to_ids(tokens)
     segment_ids = [0] * len(token_ids)
     token_ids, segment_ids = to_array([token_ids], [segment_ids])
     nodes = model.predict([token_ids, segment_ids])[0]
     labels = self.decode(nodes)
     entities, starting = [], False
     for i, label in enumerate(labels):
         if label > 0:
             if label % 2 == 1:
                 starting = True
                 entities.append([[i], id2label[(label - 1) // 2]])
             else:
                 if starting:
                     entities[-1][0].append(i)
             # else:
             #     starting = False
         else:
             starting = False
     ner_answer = []
     for w, l in entities:
         ner_answer.append([mapping[w[0]][0], mapping[w[-1]][-1] + 1, l])
     return ner_answer
Example #5
    def recognize(self, text, tokenizer, models, loader):
        tokens = tokenizer.tokenize(text)  # split the text into tokens
        mapping = tokenizer.rematch(text, tokens)  # map each token back to its character span in the text
        token_ids = tokenizer.tokens_to_ids(tokens)  # map tokens to vocabulary ids
        segment_ids = [0] * len(token_ids)
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        nodes = 0
        if isinstance(models, list):
            for model in models:
                nodes += model.predict([token_ids, segment_ids])[0]  # shape: (len(text), 27)
            nodes /= len(models)
        else:
            nodes = models.predict([token_ids, segment_ids])[0]  # shape: (len(text), 27)
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:  # an odd label id marks the start of an entity
                    starting = True
                    # the label at the start position determines the entity type
                    entities.append([[i], loader.id2label[(label - 1) // 2]])
                elif starting:
                    # append the index of a character inside the current entity
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]
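The decoding loop above assumes a BIO-style tag scheme: label 0 means "outside", an odd id 2k + 1 starts an entity of type k, and the even id 2k + 2 continues it. A self-contained sketch of the same decoding logic, with a hypothetical label sequence and id2label map, makes the scheme concrete:

# hypothetical label map and label sequence, only for illustration
id2label = {0: 'PER', 1: 'LOC'}    # entity-type index -> name
labels = [0, 1, 2, 2, 0, 3, 4, 0]  # 0 = O, 1/2 = B-PER/I-PER, 3/4 = B-LOC/I-LOC

entities, starting = [], False
for i, label in enumerate(labels):
    if label > 0:
        if label % 2 == 1:         # odd id: start of a new entity
            starting = True
            entities.append([[i], id2label[(label - 1) // 2]])
        elif starting:             # even id: continuation of the current entity
            entities[-1][0].append(i)
        else:
            starting = False
    else:
        starting = False

print(entities)  # [[[1, 2, 3], 'PER'], [[5, 6], 'LOC']]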
Example #6
    def recognize(self, text):
        """

        :param text:
        :return:
        """
        tokens = tokenizer.tokenize(text)
        while len(tokens) > 512:
            tokens.pop(-2)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        nodes = model.predict([token_ids, segment_ids])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]
Example #7
def extract_spoes(text, threshold=0):
    """抽取输入text所包含的三元组
    """
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    outputs = model.predict([token_ids, segment_ids])
    outputs = [o[0] for o in outputs]
    # extract subjects and objects
    subjects, objects = set(), set()
    outputs[0][:, [0, -1]] -= np.inf
    outputs[0][:, :, [0, -1]] -= np.inf
    for l, h, t in zip(*np.where(outputs[0] > threshold)):
        if l == 0:
            subjects.add((h, t))
        else:
            objects.add((h, t))
    # identify the corresponding predicates
    spoes = set()
    for sh, st in subjects:
        for oh, ot in objects:
            p1s = np.where(outputs[1][:, sh, oh] > threshold)[0]
            p2s = np.where(outputs[2][:, st, ot] > threshold)[0]
            ps = set(p1s) & set(p2s)
            for p in ps:
                spoes.add(
                    (text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[p],
                     text[mapping[oh][0]:mapping[ot][-1] + 1]))
    return list(spoes)
Example #8
    def recognize(self, text):

        # build the inputs for the model
        tokens = tokenizer.tokenize(text)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)

        # segment_ids marks which sentence each token belongs to; the input here
        # is a single sentence, so every position is 0
        segment_ids = [0] * len(token_ids)

        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        nodes = model.predict([token_ids, segment_ids])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]
Example #9
 def extract_features(self, text: str):
     """
     编码测试
     :return:
     """
     token_ids, segment_ids = self.tokenizer.encode(u'{}'.format(text))
     token_ids, segment_ids = to_array([token_ids], [segment_ids])
     print("\n === features === \n")
     print(self.predict([token_ids, segment_ids]))
Example #10
def vec2(tex):

    # encode the input text
    token_ids, segment_ids = tokenizer.encode(tex)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])

    tmp = model.predict([token_ids, segment_ids])[:, 0, :]
    # print(tmp)
    return tmp
Example #11
def vec2(tex):

    # encode the input text
    token_ids, segment_ids = tokenizer.encode(tex)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    global tmp
    # tmp = model.predict([token_ids, segment_ids])[:, 0, :]  # index 0 takes the [CLS] vector
    tmp3 = tmp(tex)
    # print(tmp)
    return tmp3
Example #12
 def recognize(self, text, threshold=0):
     tokens = tokenizer.tokenize(text, maxlen=512)
     mapping = tokenizer.rematch(text, tokens)
     token_ids = tokenizer.tokens_to_ids(tokens)
     segment_ids = [0] * len(token_ids)
     token_ids, segment_ids = to_array([token_ids], [segment_ids])
     scores = model.predict([token_ids, segment_ids])[0]
     scores[:, [0, -1]] -= np.inf
     scores[:, :, [0, -1]] -= np.inf
     entities = []
     for l, start, end in zip(*np.where(scores > threshold)):
         entities.append(
             (mapping[start][0], mapping[end][-1], categories[l]))
     return entities
Example #13
def evaluate(data, Seq_ner, Tag_ner, model):
    """评测函数
    """
    token_list, seq_list, tag_list = data
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for token, seq, tag in tqdm(zip(token_list, seq_list, tag_list)):
        token_ids = to_array([token])
        P = model.predict([token_ids])
        S, T = list(Seq_ner.decode(P[0][0])), list(Tag_ner.decode(P[1][0]))
        X += 1 if (S == seq) else 0
        Y += 1 if (T == tag) else 0
        Z += 1
    seq_acc, tag_acc = X / Z, Y / Z
    return seq_acc, tag_acc
Example #14
 def test_masked_predict(self):
     text = "科学技术是第一生产力"
     tokens, segs = self.tokenizer.transform(text)
     print(tokens)
     tokens[3] = tokens[4] = self.tokenizer.token2id(
         self.tokenizer.token_mask)
     print(tokens)
     tokens, segs = to_array([tokens], [segs])
     probs = self.model.predict([tokens, segs])[1][0]
     pred_ids = probs.argmax(axis=1)
     print(pred_ids)
     text = self.tokenizer.reverse_transform(list(pred_ids))
     print(text)
     self.assertEqual(text[3:5], '技术')
Example #15
def predict_to_file(in_file, out_file):
    """输出预测结果到文件
    结果文件可以提交到 https://www.cluebenchmarks.com 评测。
    """
    fw = open(out_file, 'w')
    with open(in_file) as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            text = l['sentence']
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            token_ids, segment_ids = to_array([token_ids], [segment_ids])
            label = model.predict([token_ids, segment_ids])[0].argmax()
            l = json.dumps({'id': str(l['id']), 'label': str(label)})
            fw.write(l + '\n')
    fw.close()
Example #16
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            token_ids, segment_ids = to_array([token_ids, segment_ids])

            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
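If this __iter__ belongs to a subclass of bert4keras's DataGenerator (an assumption, since the class header is not shown), the generator is usually consumed through forfit() during training. The names data_generator, train_data and the hyper-parameters below are placeholders, not part of the original snippet:

# hypothetical training call; data_generator and train_data are placeholders
train_generator = data_generator(train_data, batch_size=32)
model.fit(
    train_generator.forfit(),              # endless generator suitable for fit()
    steps_per_epoch=len(train_generator),
    epochs=5,
)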
Example #17
 def tokenize(self, text):
     tokens = tokenizer.tokenize(text)
     while len(tokens) > 512:
         tokens.pop(-2)
     mapping = tokenizer.rematch(text, tokens)
     token_ids = tokenizer.tokens_to_ids(tokens)
     segment_ids = [0] * len(token_ids)
     token_ids, segment_ids = to_array([token_ids], [segment_ids])
     nodes = model.predict([token_ids, segment_ids])[0]
     labels = self.decode(nodes)
     words = []
     for i, label in enumerate(labels[1:-1]):
         if label < 2 or len(words) == 0:
             words.append([i + 1])
         else:
             words[-1].append(i + 1)
     return [text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1] for w in words]
Example #18
def extract_spoes(text):
    """抽取输入text所包含的三元组
    """
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    # extract subjects
    subject_preds = subject_model.predict([token_ids, segment_ids])
    subject_preds[:, [0, -1]] *= 0
    start = np.where(subject_preds[0, :, 0] > 0.6)[0]
    end = np.where(subject_preds[0, :, 1] > 0.5)[0]
    subjects = []
    for i in start:
        j = end[end >= i]
        if len(j) > 0:
            j = j[0]
            subjects.append((i, j))
    if subjects:
        spoes = []
        token_ids = np.repeat(token_ids, len(subjects), 0)
        segment_ids = np.repeat(segment_ids, len(subjects), 0)
        subjects = np.array(subjects)
        # feed in the subjects and extract the corresponding objects and predicates
        object_preds = object_model.predict([token_ids, segment_ids, subjects])
        object_preds[:, [0, -1]] *= 0
        for subject, object_pred in zip(subjects, object_preds):
            start = np.where(object_pred[:, :, 0] > 0.6)
            end = np.where(object_pred[:, :, 1] > 0.5)
            for _start, predicate1 in zip(*start):
                for _end, predicate2 in zip(*end):
                    if _start <= _end and predicate1 == predicate2:
                        spoes.append(
                            ((mapping[subject[0]][0],
                              mapping[subject[1]][-1]), predicate1,
                             (mapping[_start][0], mapping[_end][-1]))
                        )
                        break
        return [(text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1])
                for s, p, o in spoes]
    else:
        return []
Example #19
 def recognize(self, text):
     tokens = tokenizer.tokenize(text, maxlen=512)
     mapping = tokenizer.rematch(text, tokens)
     token_ids = tokenizer.tokens_to_ids(tokens)
     segment_ids = [0] * len(token_ids)
     token_ids, segment_ids = to_array([token_ids], [segment_ids])
     nodes = model.predict([token_ids, segment_ids])[0]
     labels = self.decode(nodes)
     entities, starting = [], False
     for i, label in enumerate(labels):
         if label > 0:
             if label % 2 == 1:
                 starting = True
                 entities.append([[i], categories[(label - 1) // 2]])
             elif starting:
                 entities[-1][0].append(i)
             else:
                 starting = False
         else:
             starting = False
     return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities]
Example #20
    def test_load_and_save(self):
        current_folder = os.path.abspath(
            os.path.dirname(os.path.realpath(__file__)))
        bert_path = os.path.join(current_folder, 'assets', 'bert_sample_model')

        config_path = os.path.join(bert_path, 'bert_config.json')
        checkpoint_path = os.path.join(bert_path, 'bert_model.ckpt')
        dict_path = os.path.join(bert_path, 'vocab.txt')
        bert_model = build_transformer_model(config_path=config_path,
                                             checkpoint_path=checkpoint_path,
                                             model='bert',
                                             application='encoder',
                                             return_keras_model=True)

        tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer

        # encoding test
        token_ids, segment_ids = tokenizer.encode(u'jack play all day')
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        print('\n ===== predicting =====\n')
        print(bert_model.predict([token_ids, segment_ids]))

        # Serialize model
        _ = bert_model.to_json()
Example #21
)  # build the model and load the weights

sentences = []
init_sent = u'科学技术是第一生产力。'  # a seed sentence, or None
minlen, maxlen = 8, 32
steps = 10000
converged_steps = 1000
vocab_size = tokenizer._vocab_size

if init_sent is None:
    length = np.random.randint(minlen, maxlen + 1)
    tokens = ['[CLS]'] + ['[MASK]'] * length + ['[SEP]']
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
else:
    token_ids, segment_ids = tokenizer.encode(init_sent)
    length = len(token_ids) - 2

for _ in tqdm(range(steps), desc='Sampling'):
    # Gibbs sampling step: randomly mask one token, then resample it from the MLM predictions.
    i = np.random.choice(length) + 1
    token_ids[i] = tokenizer._token_mask_id
    probas = model.predict(to_array([token_ids], [segment_ids]))[0, i]
    token = np.random.choice(vocab_size, p=probas)
    token_ids[i] = token
    sentences.append(tokenizer.decode(token_ids))

print(u'Some of the random sampling results:')
for _ in range(10):
    print(np.random.choice(sentences[converged_steps:]))
Example #22
def bert_feature_extract(txt):
    # encoding test
    token_ids, segment_ids = tokenizer.encode(txt)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    # print()
    return model.predict([token_ids, segment_ids])[0][0]
Example #23
def toids(s):
    token_ids, segment_ids = tokenizer.encode(s)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    return [token_ids, segment_ids]
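A possible usage of toids, assuming model is a BERT encoder built with build_transformer_model as in the surrounding examples:

# hypothetical call; 'model' is assumed to be a loaded BERT encoder
features = model.predict(toids(u'语言模型'))  # shape: (1, seq_len, hidden_size)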
Example #24
import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
token_ids, segment_ids = to_array([token_ids], [segment_ids])

print('\n ===== predicting =====\n')
print(model.predict([token_ids, segment_ids]))
"""
输出:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636
    0.39056838]
  ...
"""
Example #25
def sentiment(text):
    token_ids, segment_ids = tokenizer.encode(text,
                                              maxlen=maxlen)  # word encoding
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    y_pred = model.predict([token_ids, segment_ids]).argmax(axis=1)
    return y_pred