class MaskedLM(object):
    def __init__(self, topK):
        self.topK = topK
        self.tokenizer = Tokenizer(BERT_VOCAB_PATH, do_lower_case=True)
        self.model = build_transformer_model(BERT_CONFIG_PATH,
                                             BERT_CHECKPOINT_PATH,
                                             with_mlm=True)

    def tokenizer_text(self, text):
        # ['[CLS]', '我', '喜', '欢', '吃', '程', '度', '的', '火', '锅', '[SEP]']
        self.tokens = self.tokenizer.tokenize(text)
        # [101, 2769, 1599, 3614, 1391, 4923, 2428, 4638, 4125, 7222, 102] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.token_ids, self.segment_ids = self.tokenizer.encode(text)

    def find_top_candidates(self, error_index):
        for i in error_index:
            # replace the id of the wrong token with the [MASK] id
            self.token_ids[i] = self.tokenizer._token_dict['[MASK]']
        # positions 5 and 6 are now the [MASK] id 103: [101, 2769, 1599, 3614, 1391, 103, 103, 4638, 4125, 7222, 102]
        # predict a probability distribution for every token; probs.shape = [len(token_ids), vocab_size]
        probs = self.model.predict(
            [np.array([self.token_ids]),
             np.array([self.segment_ids])])[0]

        for i in range(len(error_index)):
            # position of the error to correct
            error_id = error_index[i]
            # take the topK most probable vocabulary ids; argsort is ascending, so negate for descending order
            top_k_probs = np.argsort(-probs[error_id])[:self.topK]
            candidates, find_prob = self.tokenizer.decode(
                top_k_probs), probs[error_id][top_k_probs]
            print(dict(zip(candidates, find_prob)))
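
# A minimal usage sketch (assumption: BERT_VOCAB_PATH / BERT_CONFIG_PATH /
# BERT_CHECKPOINT_PATH point at a Chinese BERT, and Tokenizer,
# build_transformer_model and np are imported from bert4keras / numpy as in the
# other snippets here; the error positions are supplied by hand, not detected):
if __name__ == '__main__':
    mlm = MaskedLM(topK=5)
    mlm.tokenizer_text(u'我喜欢吃程度的火锅')  # "程度" is the typo to correct
    mlm.find_top_candidates([5, 6])           # token positions of the two wrong characters
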
class MaskedLM():
    def __init__(self, topK):
        self.topK = topK
        self.tokenizer = Tokenizer(Config.BERT_VOCAB_PATH, do_lower_case=True)
        self.model = build_transformer_model(Config.BERT_CONFIG_PATH,
                                             Config.BERT_CHECKPOINT_PATH,
                                             with_mlm=True)
        self.token_ids, self.segment_ids = self.tokenizer.encode(' ')

    def tokenizer_text(self, text):
        self.token_ids, self.segment_ids = self.tokenizer.encode(text)

    def find_topn_candidates(self, error_index):
        for i in error_index:
            # replace each token to be corrected with [MASK]
            self.token_ids[i] = self.tokenizer._token_dict['[MASK]']

        probs = self.model.predict(
            [np.array([self.token_ids]),
             np.array([self.segment_ids])])[0]
        for i in range(len(error_index)):
            error_id = error_index[i]
            # topK most probable vocabulary ids for this masked position
            top_k_probs = np.argsort(-probs[error_id])[:self.topK]
            candidates, fin_prob = self.tokenizer.decode(
                top_k_probs), probs[error_id][top_k_probs]
            print(dict(zip(candidates, fin_prob)))
Example #3
        return_keras_model=False,
        name='T5',
    )

    encoder = t5.encoder
    decoder = t5.decoder
    model = t5.model

    class AutoTitle(AutoRegressiveDecoder):
        """seq2seq解码器
        """
        @AutoRegressiveDecoder.wraps(default_rtype='probas')
        def predict(self, inputs, output_ids, states):
            c_encoded = inputs[0]
            return self.last_token(decoder).predict([c_encoded, output_ids])

        def generate(self, text, topk=1):
            c_token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
            c_encoded = encoder.predict(np.array([c_token_ids]))[0]
            output_ids = self.beam_search([c_encoded],
                                          topk=topk)  # beam search decoding
            return tokenizer.decode(output_ids)

    autotitle = AutoTitle(start_id=tokenizer._token_start_id,
                          end_id=tokenizer._token_end_id,
                          maxlen=max_t_len)

    print('original text', text)
    print('bert4keras prediction' + '\t' + autotitle.generate(text))
    print('torch prediction     ' + '\t' + ''.join(tokenizer.decode(output[1:])))
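
# For reference only: a typical bert4keras T5/mT5 construction that yields the
# t5.encoder / t5.decoder / t5.model objects used above. The paths and the model
# string ('mt5.1.1' here) are assumptions for illustration, not taken from this
# example:
#
# t5 = build_transformer_model(
#     config_path='mt5/mt5_base_config.json',
#     checkpoint_path='mt5/model.ckpt-1000000',
#     model='mt5.1.1',
#     return_keras_model=False,
#     name='T5',
# )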
Example #4
class MRCTrainer():
    def __init__(self, train_param, model_save_path):
        self.lr = train_param['learning_rate']
        self.max_p_len = train_param['max_p_len']
        self.max_q_len = train_param['max_q_len']
        self.max_a_len = train_param['max_a_len']
        self.epochs = train_param['epochs']
        self.pretrain_type = train_param['pretrain_type']
        self.batch_size = train_param['batch_size']

        self.config_path = train_param['config_path']
        self.checkpoint_path = train_param['checkpoint_path']
        self.dict_path = train_param['dict_path']
        self.model_config = train_param
        self.model_config['model_save_path'] = model_save_path
        self.model_save_path = model_save_path

        self.buildmodel()

    def masked_cross_entropy(self, y_true, y_pred):
        y_true = K.reshape(y_true, [K.shape(y_true)[0], -1])
        y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
        cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
        cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

        return cross_entropy

    def buildmodel(self):
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)

        if self.pretrain_type == 'albert':
            model = build_transformer_model(
                self.config_path,
                self.checkpoint_path,
                model='albert',
                with_mlm=True,
                keep_tokens=self.keep_tokens,
            )
        elif self.pretrain_type == 'bert':
            model = build_transformer_model(
                self.config_path,
                self.checkpoint_path,
                model='bert',
                with_mlm=True,
                keep_tokens=self.keep_tokens,
            )
        output = Lambda(lambda x: x[:, 1:self.max_a_len + 1])(model.output)
        #print(output.shape)
        self.model = Model(model.input, output)
        self.model.compile(loss=self.masked_cross_entropy,
                           optimizer=Adam(self.lr))
        self.model.summary()

    def fit(self, train_data):

        params_file = os.path.join(self.model_save_path, 'config.json')
        with open(params_file, 'w', encoding='utf-8') as json_file:
            json.dump(self.model_config,
                      json_file,
                      indent=4,
                      ensure_ascii=False)

        evaluator = Evaluator(self.model, self.model_save_path)
        train_generator = data_generator(train_data, self.tokenizer,
                                         self.batch_size, self.max_a_len,
                                         self.max_q_len, self.max_p_len)

        self.model.fit_generator(train_generator.forfit(),
                                 steps_per_epoch=len(train_generator),
                                 epochs=self.epochs,
                                 callbacks=[evaluator])

    def get_ngram_set(self, x, n):
        """Collect the n-grams of x and return them as a dict in the form
        {(n-1)-gram: set of tokens that can follow it},
        e.g. get_ngram_set([1, 2, 3, 2, 4], 2) -> {(1,): {2}, (2,): {3, 4}, (3,): {2}}
        """
        result = {}
        for i in range(len(x) - n + 1):
            k = tuple(x[i:i + n])
            if k[:-1] not in result:
                result[k[:-1]] = set()
            result[k[:-1]].add(k[-1])
        return result

    def gen_answer(self, question, passage):

        passage = re.sub(u' |、|;|,', ',', passage)
        p_token_ids, _ = self.tokenizer.encode(passage,
                                               max_length=self.max_p_len + 1)
        q_token_ids, _ = self.tokenizer.encode(question,
                                               max_length=self.max_q_len + 1)
        # input layout: [CLS] [MASK] * max_a_len [SEP] question passage
        token_ids = [self.tokenizer._token_start_id]
        token_ids += [self.tokenizer._token_mask_id] * self.max_a_len
        token_ids += [self.tokenizer._token_end_id]
        token_ids += q_token_ids[1:] + p_token_ids[1:]
        segment_ids = [0] * len(token_ids)
        batch_token_ids = sequence_padding([token_ids])
        batch_segment_ids = sequence_padding([segment_ids])
        # probas.shape = [max_a_len, vocab_size]: one distribution per answer slot
        probas = self.model.predict([batch_token_ids, batch_segment_ids])[0]
        results = {}
        a, score = tuple(), 0.
        for i in range(self.max_a_len):
            # restrict candidates to tokens that continue an n-gram of the input,
            # so the generated answer stays close to a span of the passage
            idxs = list(self.get_ngram_set(token_ids, i + 1)[a])
            if self.tokenizer._token_end_id not in idxs:
                idxs.append(self.tokenizer._token_end_id)
            pi = np.zeros_like(probas[i])
            pi[idxs] = probas[i, idxs]
            a = a + (pi.argmax(), )
            score += pi.max()
            if a[-1] == self.tokenizer._token_end_id:
                break
        score = score / (i + 1)
        a = self.tokenizer.decode(a)
        if a:
            results[a] = results.get(a, []) + [score]
        results = {
            k: (np.array(v)**2).sum() / (sum(v) + 1)
            for k, v in results.items()
        }
        return results

    def evalue(self):
        result = []
        return result
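
# A minimal usage sketch for MRCTrainer; every path and hyper-parameter below is a
# placeholder, and the bert4keras / keras imports plus the data_generator and
# Evaluator helpers are assumed to be defined elsewhere in this file:
if __name__ == '__main__':
    train_param = {
        'learning_rate': 2e-5, 'max_p_len': 256, 'max_q_len': 32, 'max_a_len': 16,
        'epochs': 5, 'pretrain_type': 'bert', 'batch_size': 16,
        'config_path': 'bert_config.json', 'checkpoint_path': 'bert_model.ckpt',
        'dict_path': 'vocab.txt',
    }
    trainer = MRCTrainer(train_param, model_save_path='./mrc_model')
    # trainer.fit(train_data)  # train_data in the format expected by data_generator
    print(trainer.gen_answer(u'谁演唱了《离开》?', u'《离开》是由张宇谱曲,演唱'))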
Example #5
class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq解码器
    """
    def __init__(self, model_path, max_len=32, seed=1):
        # super().__init__()
        setup_seed(seed)
        self.config_path = os.path.join(model_path, "bert_config.json")
        self.checkpoint_path = os.path.join(model_path, "bert_model.ckpt")
        self.dict_path = os.path.join(model_path, "vocab.txt")
        self.max_len = max_len
        self.tokenizer = Tokenizer(self.dict_path, do_lower_case=True)
        self.bert = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            with_pool='linear',
            application='unilm',
            return_keras_model=False,
        )
        self.encoder = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[0])
        self.seq2seq = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[1])
        super().__init__(start_id=None,
                         end_id=self.tokenizer._token_end_id,
                         maxlen=self.max_len)

    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
        return self.seq2seq.predict([token_ids, segment_ids])[:, -1]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = self.tokenizer.encode(text,
                                                       max_length=self.max_len)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)
        return [self.tokenizer.decode(ids) for ids in output_ids]

    def gen_synonyms(self, text, n=100, k=20, threhold=0.75):
        """"含义: 产生sent的n个相似句,然后返回最相似的k个。
        做法:用seq2seq生成,并用encoder算相似度并排序。
        """
        r = self.generate(text, n)
        r = [i for i in set(r) if i != text]
        r = [text] + r
        X, S = [], []
        for t in r:
            x, s = self.tokenizer.encode(t)
            X.append(x)
            S.append(s)
        X = sequence_padding(X)
        S = sequence_padding(S)
        Z = self.encoder.predict([X, S])
        Z /= (Z**2).sum(axis=1, keepdims=True)**0.5
        scores = np.dot(Z[1:], Z[0])
        argsort = scores.argsort()
        scores = scores.tolist()
        # print(scores.shape)
        # return [(r[i + 1], scores[i]) for i in argsort[::-1][:k] if scores[i] > threhold]
        return [(r[i + 1], scores[i]) for i in argsort[::-1][:k]]
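
# A minimal usage sketch; model_path is a placeholder directory assumed to contain
# bert_config.json / bert_model.ckpt / vocab.txt, and setup_seed / keras /
# sequence_padding are assumed to be imported elsewhere in this file:
if __name__ == '__main__':
    gen = SynonymsGenerator(model_path='./chinese_L-12_H-768_A-12', max_len=32)
    print(gen.generate(u'科学技术是第一生产力', n=3, topk=5))
    print(gen.gen_synonyms(u'科学技术是第一生产力', n=20, k=5))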
)  # build the model and load the weights

sentences = []
init_sent = u'科学技术是第一生产力。'  # a seed sentence, or None to start from all-[MASK]
minlen, maxlen = 8, 32
steps = 10000
converged_steps = 1000
vocab_size = tokenizer._vocab_size

if init_sent is None:
    length = np.random.randint(minlen, maxlen + 1)
    tokens = ['[CLS]'] + ['[MASK]'] * length + ['[SEP]']
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
else:
    token_ids, segment_ids = tokenizer.encode(init_sent)
    length = len(token_ids) - 2

for _ in tqdm(range(steps), desc='Sampling'):
    # Gibbs sampling step: randomly mask one token, then resample it from the MLM distribution.
    i = np.random.choice(length) + 1
    token_ids[i] = tokenizer._token_mask_id
    probas = model.predict(to_array([token_ids], [segment_ids]))[0, i]
    token = np.random.choice(vocab_size, p=probas)
    token_ids[i] = token
    sentences.append(tokenizer.decode(token_ids))

print(u'Some random sampling results:')
for _ in range(10):
    print(np.random.choice(sentences[converged_steps:]))
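
# The sampling loop above assumes a tokenizer and an MLM-headed BERT built with
# bert4keras (plus to_array from bert4keras.snippets and tqdm). A minimal setup
# sketch under those assumptions, with placeholder paths:
#
# import numpy as np
# from tqdm import tqdm
# from bert4keras.models import build_transformer_model
# from bert4keras.tokenizers import Tokenizer
# from bert4keras.snippets import to_array
#
# tokenizer = Tokenizer('vocab.txt', do_lower_case=True)
# model = build_transformer_model('bert_config.json', 'bert_model.ckpt', with_mlm=True)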
class ReextractBertTrainHandler():
    def __init__(self, params, Train=False):
        self.bert_config_path = model_root_path + "chinese_L-12_H-768_A-12/bert_config.json"
        self.bert_checkpoint_path = model_root_path + "chinese_L-12_H-768_A-12/bert_model.ckpt"
        self.bert_vocab_path = model_root_path + "chinese_L-12_H-768_A-12/vocab.txt"
        self.tokenizer = Tokenizer(self.bert_vocab_path, do_lower_case=True)
        self.model_path = model_root_path + "best_model.weights"
        self.params_path = model_root_path + 'params.json'
        gpu_id = params.get("gpu_id", None)
        self._set_gpu_id(gpu_id)  # select the GPU to use for training
        self.memory_fraction = params.get('memory_fraction')
        if Train:
            self.train_data_file_path = params.get('train_data_path')
            self.valid_data_file_path = params.get('valid_data_path')
            self.maxlen = params.get('maxlen', 128)
            self.batch_size = params.get('batch_size', 32)
            self.epoch = params.get('epoch')
            self.data_process()
        else:
            load_params = json.load(open(self.params_path, encoding='utf-8'))
            self.maxlen = load_params.get('maxlen')
            self.num_classes = load_params.get('num_classes')
            self.p2s_dict = load_params.get('p2s_dict')
            self.i2p_dict = load_params.get('i2p_dict')
            self.p2o_dict = load_params.get('p2o_dict')
        self.build_model()
        if not Train:
            self.load_model()

    def _set_gpu_id(self, gpu_id):
        if gpu_id:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    def data_process(self):
        self.train_data, self.valid_data, self.p2s_dict, self.p2o_dict, self.i2p_dict, self.p2i_dict = data_process(
            self.train_data_file_path, self.valid_data_file_path, self.maxlen, self.params_path)
        self.num_classes = len(self.i2p_dict)
        self.train_generator = Data_Generator(self.train_data, self.batch_size, self.tokenizer, self.p2i_dict,
                                              self.maxlen)

    def extrac_subject(self, inputs):
        """根据subject_ids从output中取出subject的向量表征
        """
        output, subject_ids = inputs
        subject_ids = K.cast(subject_ids, 'int32')
        start = batch_gather(output, subject_ids[:, :1])
        end = batch_gather(output, subject_ids[:, 1:])
        subject = K.concatenate([start, end], 2)
        return subject[:, 0]

    def build_model(self):
        import tensorflow as tf
        from keras.backend.tensorflow_backend import set_session
        config = tf.ConfigProto()
        config.gpu_options.allocator_type = 'BFC'  # A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc.
        if self.memory_fraction:
            config.gpu_options.per_process_gpu_memory_fraction = self.memory_fraction
            config.gpu_options.allow_growth = False
        else:
            config.gpu_options.allow_growth = True
        set_session(tf.Session(config=config))

        # additional inputs
        subject_labels = Input(shape=(None, 2), name='Subject-Labels')
        subject_ids = Input(shape=(2,), name='Subject-Ids')
        object_labels = Input(shape=(None, self.num_classes, 2), name='Object-Labels')
        # load the pre-trained model
        bert = build_transformer_model(
            config_path=self.bert_config_path,
            checkpoint_path=self.bert_checkpoint_path,
            return_keras_model=False,
        )
        # predict subject spans
        output = Dense(units=2,
                       activation='sigmoid',
                       kernel_initializer=bert.initializer)(bert.model.output)
        subject_preds = Lambda(lambda x: x ** 2)(output)
        self.subject_model = Model(bert.model.inputs, subject_preds)
        # feed the subject in and predict objects:
        # Conditional Layer Normalization injects the subject into the object prediction
        output = bert.model.layers[-2].get_output_at(-1)
        subject = Lambda(self.extrac_subject)([output, subject_ids])
        output = LayerNormalization(conditional=True)([output, subject])
        output = Dense(units=self.num_classes * 2,
                       activation='sigmoid',
                       kernel_initializer=bert.initializer)(output)
        output = Lambda(lambda x: x ** 4)(output)
        object_preds = Reshape((-1, self.num_classes, 2))(output)
        self.object_model = Model(bert.model.inputs + [subject_ids], object_preds)
        # model used for training
        self.train_model = Model(bert.model.inputs + [subject_labels, subject_ids, object_labels],
                                 [subject_preds, object_preds])
        mask = bert.model.get_layer('Embedding-Token').output_mask
        mask = K.cast(mask, K.floatx())
        subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
        subject_loss = K.mean(subject_loss, 2)
        subject_loss = K.sum(subject_loss * mask) / K.sum(mask)
        object_loss = K.binary_crossentropy(object_labels, object_preds)
        object_loss = K.sum(K.mean(object_loss, 3), 2)
        object_loss = K.sum(object_loss * mask) / K.sum(mask)
        self.train_model.add_loss(subject_loss + object_loss)
        AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
        self.optimizer = AdamEMA(lr=1e-4)
        self.train_model.compile(optimizer=self.optimizer)

    def load_model(self):
        self.train_model.load_weights(self.model_path)

    def predict(self, text):
        """
        抽取输入text所包含的三元组
        text:str(<离开>是由张宇谱曲,演唱)
        """
        tokens = self.tokenizer.tokenize(text, max_length=self.maxlen)
        token_ids, segment_ids = self.tokenizer.encode(text, max_length=self.maxlen)
        # extract subjects
        subject_preds = self.subject_model.predict([[token_ids], [segment_ids]])
        start = np.where(subject_preds[0, :, 0] > 0.6)[0]
        end = np.where(subject_preds[0, :, 1] > 0.5)[0]
        subjects = []
        for i in start:
            j = end[end >= i]
            if len(j) > 0:
                j = j[0]
                subjects.append((i, j))
        if subjects:
            spoes = []
            token_ids = np.repeat([token_ids], len(subjects), 0)
            segment_ids = np.repeat([segment_ids], len(subjects), 0)
            subjects = np.array(subjects)
            # feed the subjects in and extract objects and predicates
            object_preds = self.object_model.predict([token_ids, segment_ids, subjects])
            for subject, object_pred in zip(subjects, object_preds):
                start = np.where(object_pred[:, :, 0] > 0.6)
                end = np.where(object_pred[:, :, 1] > 0.5)
                for _start, predicate1 in zip(*start):
                    for _end, predicate2 in zip(*end):
                        if _start <= _end and predicate1 == predicate2:
                            spoes.append((subject, predicate1, (_start, _end)))
                            break
            return [
                (
                    [self.tokenizer.decode(token_ids[0, s[0]:s[1] + 1], tokens[s[0]:s[1] + 1]),
                     self.p2s_dict[self.i2p_dict[p]]],
                    self.i2p_dict[p],
                    [self.tokenizer.decode(token_ids[0, o[0]:o[1] + 1], tokens[o[0]:o[1] + 1]),
                     self.p2o_dict[self.i2p_dict[p]]],
                    (s[0], s[1] + 1),
                    (o[0], o[1] + 1)
                ) for s, p, o in spoes
            ]
        else:
            return []

    def train(self):
        evaluator = Evaluator(self.train_model, self.model_path, self.tokenizer, self.predict, self.optimizer,
                              self.valid_data)

        self.train_model.fit_generator(self.train_generator.forfit(),
                                       steps_per_epoch=len(self.train_generator),
                                       epochs=self.epoch,
                                       callbacks=[evaluator])
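
# A minimal usage sketch; model_root_path, Evaluator, Data_Generator and the
# data_process helper are assumed to be defined elsewhere in this project, and the
# paths in params are placeholders:
if __name__ == '__main__':
    # inference: loads params.json and best_model.weights from model_root_path
    handler = ReextractBertTrainHandler({'gpu_id': 0}, Train=False)
    print(handler.predict(u'《离开》是由张宇谱曲,演唱'))
    # training:
    # trainer = ReextractBertTrainHandler({'train_data_path': 'train.json',
    #                                      'valid_data_path': 'dev.json',
    #                                      'epoch': 10}, Train=True)
    # trainer.train()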
#! -*- coding: utf-8 -*-
# Quick check that the code works: MLM

from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path,
                                with_mlm=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
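
# A small follow-up sketch (not part of the original check): list the top-5
# candidates for each masked position instead of only the argmax, reusing the
# probas and tokenizer defined above.
for pos in (3, 4):
    top5 = np.argsort(-probas[pos])[:5]
    print(pos, [(tokenizer.decode([idx]), round(float(probas[pos][idx]), 4)) for idx in top5])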
Example #9
def dist(x, y):
    """Euclidean distance between two vectors.
    Inner product or cosine distance would also work; the results are similar.
    """
    return np.sqrt(((x - y)**2).sum())
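
# The segmentation snippet below needs a tokenizer, an encoder model, and a target
# sentence (token_ids / length), none of which are set up here. A minimal sketch
# under the assumption (suggested by the result comment at the end) that the
# sentence is the 大肠杆菌 one, reusing the dict_path / config_path /
# checkpoint_path placeholders from the MLM check above:
tokenizer = Tokenizer(dict_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path)  # returns one vector per token
text = u'大肠杆菌是人和许多动物肠道中最主要且数量最多的一种细菌'
token_ids, segment_ids = tokenizer.encode(text)
length = len(token_ids) - 2  # number of characters, excluding [CLS] and [SEP]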


# build 2 * length - 1 perturbed copies of the sentence: for each character one copy
# with only that character masked, plus one copy with it and its left neighbour masked
batch_token_ids = np.array([token_ids] * (2 * length - 1))
batch_segment_ids = np.zeros_like(batch_token_ids)

for i in range(length):
    if i > 0:
        # row 2i-1: characters i-1 and i masked together
        batch_token_ids[2 * i - 1, i] = tokenizer._token_mask_id
        batch_token_ids[2 * i - 1, i + 1] = tokenizer._token_mask_id
    # row 2i: only character i masked
    batch_token_ids[2 * i, i + 1] = tokenizer._token_mask_id

vectors = model.predict([batch_token_ids, batch_segment_ids])

threshold = 8
word_token_ids = [[token_ids[1]]]
for i in range(1, length):
    # d1/d2 measure how much each character's masked-position vector changes when
    # its neighbour is additionally masked; a large mutual influence means the two
    # characters belong to the same word, so they are merged.
    d1 = dist(vectors[2 * i, i + 1], vectors[2 * i - 1, i + 1])
    d2 = dist(vectors[2 * i - 2, i], vectors[2 * i - 1, i])
    d = (d1 + d2) / 2
    if d >= threshold:
        word_token_ids[-1].append(token_ids[i + 1])
    else:
        word_token_ids.append([token_ids[i + 1]])

words = [tokenizer.decode(ids) for ids in word_token_ids]
print(words)
# Result: [u'大肠杆菌', u'是', u'人和', u'许多', u'动物', u'肠道', u'中最', u'主要', u'且数量', u'最多', u'的', u'一种', u'细菌']