def evaluate_report(df_data):
    model = tf.keras.models.load_model('{}-model.h5'.format(model_name))
    true_y_list = df_data["new_label"].tolist()
    pred_y_list = []
    tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer once, outside the loop
    for text in df_data["text"].tolist():
        token_ids, segment_ids = tokenizer.encode(first_text=text,
                                                  maxlen=maxlen)
        token_list = sequence_padding([token_ids])
        segment_list = sequence_padding([segment_ids])
        label = model.predict([np.array(token_list),
                               np.array(segment_list)]).argmax(axis=1)
        pred_y_list.append(label[0])

    with open("label.json", "r", encoding="utf-8") as f:
        labels = json.loads(f.read())
    target_name_list = list(labels.values())
    report = classification_report(true_y_list,
                                   pred_y_list,
                                   target_names=target_name_list,
                                   digits=4,
                                   output_dict=True)
    print(report)
    df = pd.DataFrame(report).transpose()
    df.to_csv("{}-report.csv".format(model_type),
              encoding='utf_8_sig',
              index=True)
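
# Usage sketch (hedged): assumes the globals model_name, model_type, dict_path
# and maxlen are defined elsewhere in the script, and that the CSV file
# (name hypothetical) has "text" and "new_label" columns.
#
#   df_eval = pd.read_csv("eval.csv")
#   evaluate_report(df_eval)  # prints the report and writes "<model_type>-report.csv"
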
class MaskedLM(object):
    def __init__(self, topK):
        self.topK = topK
        self.tokenizer = Tokenizer(BERT_VOCAB_PATH, do_lower_case=True)
        self.model = build_transformer_model(BERT_CONFIG_PATH,
                                             BERT_CHECKPOINT_PATH,
                                             with_mlm=True)

    def tokenizer_text(self, text):
        # e.g. ['[CLS]', '我', '喜', '欢', '吃', '程', '度', '的', '火', '锅', '[SEP]']
        self.token = self.tokenizer.tokenize(text)
        # e.g. [101, 2769, 1599, 3614, 1391, 4923, 2428, 4638, 4125, 7222, 102] and [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.token_ids, self.segment_ids = self.tokenizer.encode(text)

    def find_top_candidates(self, error_index):
        for i in error_index:
            # replace the id of the erroneous token with the [MASK] id
            self.token_ids[i] = self.tokenizer._token_dict['[MASK]']
        # e.g. positions 5 and 6 replaced with the [MASK] id 103: [101, 2769, 1599, 3614, 1391, 103, 103, 4638, 4125, 7222, 102]
        # predict the probability distribution of every token; probs.shape = [len(token_ids), vocab_size]
        probs = self.model.predict(
            [np.array([self.token_ids]),
             np.array([self.segment_ids])])[0]

        for i in range(len(error_index)):
            # position index of this error token
            error_id = error_index[i]
            # take the topK ids with the largest probability; argsort is ascending, so negate to sort in descending order
            top_k_probs = np.argsort(-probs[error_id])[:self.topK]
            candidates, find_prob = self.tokenizer.decode(
                top_k_probs), probs[error_id][top_k_probs]
            print(dict(zip(candidates, find_prob)))
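
# Usage sketch (hedged): assumes BERT_VOCAB_PATH / BERT_CONFIG_PATH /
# BERT_CHECKPOINT_PATH point to a downloaded Chinese BERT. Positions 5 and 6
# correspond to "程度" in the example sentence shown in tokenizer_text's comment.
#
#   mlm = MaskedLM(topK=5)
#   mlm.tokenizer_text('我喜欢吃程度的火锅')
#   mlm.find_top_candidates([5, 6])  # prints the top-5 candidate characters per masked position
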
Example #3
class ExtractBertFeatures:
    def __init__(self, model_name='uncased_L-4_H-256_A-4'):
        bert_model_dir = os.path.join(CUR_PASTH, 'data', model_name)
        self.load_model(model_dir=bert_model_dir)
        pass

    def load_model(self, model_dir):
        config_name = os.path.join(model_dir, 'bert_config.json')
        checkpoint_name = os.path.join(model_dir, 'bert_model.ckpt')
        vocab_name = os.path.join(model_dir, 'vocab.txt')
        self.tokenizer = Tokenizer(vocab_name, do_lower_case=True)  # build the tokenizer
        self.model = build_transformer_model(config_name,
                                             checkpoint_name)  # build the model and load the weights

    def predict(self,
                x,
                second_text=None,
                max_length=None,
                first_length=None,
                second_length=None,
                use_multiprocessing=False):
        token_ids, segment_ids = self.tokenizer.encode(x, second_text,
                                                       max_length,
                                                       first_length,
                                                       second_length)
        features = self.model.predict(
            [np.array([token_ids]),
             np.array([segment_ids])])
        return features
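
# Usage sketch (hedged): assumes the default 'uncased_L-4_H-256_A-4' checkpoint
# has been unpacked under <CUR_PASTH>/data/.
#
#   extractor = ExtractBertFeatures()
#   features = extractor.predict('jack plays all day')  # shape (1, seq_len, hidden_size)
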
Example #4
def build_model():
    config_path = GUWEN_CONFIG_PATH if use_guwenbert else ROBERTA_CONFIG_PATH
    checkpoint_path = GUWEN_CHECKPOINT_PATH if use_guwenbert else ROBERTA_CHECKPOINT_PATH
    dict_path = GUWEN_DICT_PATH if use_guwenbert else ROBERTA_DICT_PATH

    token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)

    model = build_transformer_model(
        config_path,
        checkpoint_path,
        application='unilm',
        # keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, i.e. a simplified vocabulary
    )

    # load the trained weights
    model.load_weights(BEST_MODEL_PATH)

    autotitle = AutoTitle(start_id=None,
                          end_id=tokenizer._token_end_id,
                          maxlen=50)

    text = '却话巴山夜雨时'
    token_ids, segment_ids = tokenizer.encode(text)
    inputs = np.array([token_ids, segment_ids])
    inputs = [np.array([i]) for i in inputs]
    print(autotitle.predict(inputs, np.empty((1, 0), dtype=int), states=None))
    print(autotitle.generate("却话巴山夜雨时"))
    return autotitle
Example #5
def pre_model(text):
    tokenizer = Tokenizer(dict_path, do_lower_case=True)
    token_ids, segment_ids = tokenizer.encode(first_text=text, maxlen=maxlen)
    token_list = sequence_padding([token_ids])
    segment_list = sequence_padding([segment_ids])
    label = model.predict([np.array(token_list),
                           np.array(segment_list)]).argmax(axis=1)
    return int(label[0])
class MaskedLM():
    def __init__(self, topK):
        self.topK = topK
        self.tokenizer = Tokenizer(Config.BERT_VOCAB_PATH, do_lower_case=True)
        self.model = build_transformer_model(Config.BERT_CONFIG_PATH,
                                             Config.BERT_CHECKPOINT_PATH,
                                             with_mlm=True)
        self.token_ids, self.segment_ids = self.tokenizer.encode(' ')

    def tokenizer_text(self, text):
        self.token_ids, self.segment_ids = self.tokenizer.encode(text)

    def find_topn_candidates(self, error_index):
        for i in error_index:
            # replace the token to be corrected with [MASK]
            self.token_ids[i] = self.tokenizer._token_dict['[MASK]']

        probs = self.model.predict([np.array([self.token_ids]),
                                    np.array([self.segment_ids])])[0]
        for i in range(len(error_index)):
            error_id = error_index[i]
            top_k_probs = np.argsort(-probs[error_id])[:self.topK]
            candidates, fin_prob = (self.tokenizer.decode(top_k_probs),
                                    probs[error_id][top_k_probs])
            print(dict(zip(candidates, fin_prob)))
def vec4(tex):
    tokenizer = Tokenizer(dict_path, do_lower_case=True)
    token_ids, segment_ids = tokenizer.encode(tex)
    print("Token ID: " + str(token_ids))
    print("Segment ID:" + str(segment_ids))

    print('\n ===== predicting =====\n')
    vec1 = model.predict([np.array([token_ids]), np.array([segment_ids])])
    print(vec1.shape)

    return vec1
class Simparams:
    def __init__(self):
        self.max_seq_length = 128
        self.corpus_text = 'data/corpus3.json'
        self.config_path = 'data/chinese_simbert_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = 'data/chinese_simbert_L-12_H-768_A-12/bert_model.ckpt'
        self.vocab_file = 'data/chinese_simbert_L-12_H-768_A-12/vocab.txt'
        self.tokenizer = Tokenizer(self.vocab_file, do_lower_case=True)  # build the tokenizer

        # load the model
        self.bert = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            with_pool='linear',
            application='unilm',
            return_keras_model=False,
        )

        self.encoder = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[0])
        # load the corpus database
        with open(self.corpus_text, 'r', encoding='utf-8') as load_f:
            self.classes = json.load(load_f)

        self.corpus = eval(self.classes)
        self.list_vec = []
        self.list_corpus = []
        for c, v in self.corpus.items():
            self.list_vec.append(v)
            self.list_corpus.append(c)
        # newly added corpus entries
        df = pd.read_excel('data/新增数据.xlsx')
        for vn in range(len(df)):
            self.list_corpus.append(df['新增语料'][vn])
            self.list_vec.append(self.vec(df['新增语料'][vn]))

        self.list_vec = np.concatenate(self.list_vec).reshape(-1, 768)

    def vec(self, query):
        token_ids, segment_ids = self.tokenizer.encode(query, max_length=self.max_seq_length)
        vec = self.encoder.predict([[token_ids], [segment_ids]])[0]
        # normalise to a unit vector
        vec /= (vec ** 2).sum() ** 0.5
        return vec
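
# Retrieval sketch (hedged): assumes the corpus vectors stored in list_vec are
# unit-normalised like the output of vec(), so a dot product equals cosine
# similarity; the query string is a placeholder.
#
#   params = Simparams()
#   q = params.vec('这个产品怎么退货')
#   best = int(np.dot(params.list_vec, q).argmax())
#   print(params.list_corpus[best])
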
Example #9
    def test_load_and_save(self):
        current_folder = os.path.abspath(
            os.path.dirname(os.path.realpath(__file__)))
        bert_path = os.path.join(current_folder, 'assets', 'bert_sample_model')

        config_path = os.path.join(bert_path, 'bert_config.json')
        checkpoint_path = os.path.join(bert_path, 'bert_model.ckpt')
        dict_path = os.path.join(bert_path, 'vocab.txt')
        bert_model = build_transformer_model(config_path=config_path,
                                             checkpoint_path=checkpoint_path,
                                             model='bert',
                                             application='encoder',
                                             return_keras_model=True)

        tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer

        # encoding test
        token_ids, segment_ids = tokenizer.encode(u'jack play all day')
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        print('\n ===== predicting =====\n')
        print(bert_model.predict([token_ids, segment_ids]))

        # Serialize model
        _ = bert_model.to_json()
# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model and load the weights

# encode the text
text = u'计算机的鼠标有什么比较特殊的用途呢'
words = jieba.lcut(text)
spans = []
token_ids = [tokenizer._token_start_id]
for w in words:
    w_ids = tokenizer.encode(w)[0][1:-1]
    token_ids.extend(w_ids)
    spans.append((len(token_ids) - len(w_ids), len(token_ids)))

token_ids.append(tokenizer._token_end_id)
length = len(spans)


def dist(x, y):
    """距离函数(默认用欧氏距离)
    可以尝试换用内积或者cos距离,结果差不多。
    """
    return np.sqrt(((x - y)**2).sum())


batch_token_ids = np.array([token_ids] * (length * (length + 1) // 2))
# ========== data preparation: ==========
labels = sorted(list(set(df.label)))
assert len(labels) == num_classes, 'wrong num of classes!'
label2idx = {name: i for name, i in zip(labels, range(num_classes))}
#%%
print('start tokenizing...')
t = time.time()
X_token = []
X_seg = []
y = []
i = 0
for content, label in zip(list(df.content), list(df.label)):
    i += 1
    if i % 1000 == 0:
        print(i)
    token_ids, seg_ids = tokenizer.encode(content, maxlen=maxlen)
    X_token.append(token_ids)
    X_seg.append(seg_ids)
    y.append(label2idx[label])

# the sequences we obtained from above may have different length, so use Padding:
X_token = sequence_padding(X_token)
X_seg = sequence_padding(X_seg)
y = np.array(y)
print('tokenizing time cost:', time.time() - t, 's.')

#%%
# ========== model training: ==========
old_list = []
ls_list = []
lcm_list = []
Example #12
class CmtClassifier:

    def __init__(self, model_type, model_para_paths, label_filepath, origin):
        # assign attributes
        self.model_type = model_type
        self.model_para_paths = model_para_paths
        self.label_filepath = label_filepath

        # load the index-to-label dictionary
        with open(label_filepath, "r") as fin:
            reader = csv.reader(fin)
            self.label_dict = {int(row[0]): row[1] for row in reader}

        # create the tokenizer
        self.tokenizer = Tokenizer(model_para_paths[2], do_lower_case=True)

        if origin:  # build a model that has not been fine-tuned yet
            # upstream part of the model (the BERT encoder)
            bert = build_transformer_model(
                config_path=model_para_paths[0],
                checkpoint_path=model_para_paths[1],
                model=model_type,
                return_keras_model=False,
            )
            # use the [CLS] token's output vector as the input to the downstream task
            output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
            # downstream part of the model (classification head)
            output = Dense(
                units=len(self.label_dict),
                activation='softmax',
                kernel_initializer=bert.initializer
            )(output)
            # connect the model's inputs and outputs
            self.model = keras.models.Model(bert.model.input, output)
        else:  # the model has already been fine-tuned
            self.model = build_transformer_model(
                config_path=model_para_paths[0],
                checkpoint_path=model_para_paths[1],
                model=model_type,
                return_keras_model=False,
            )
        self.model.summary()  # print the model structure

    def fit(self, train_filepath, valid_filepath, temp_save_path,
            maxlen=128, learning_rate=1e-4, epochs=5, batch_size=32):
        train_data = load_data(train_filepath)
        train_generator = CmtDataGenerator(train_data, batch_size, self.tokenizer)

        callbacks = None
        if valid_filepath != "" and valid_filepath is not None \
                and temp_save_path != "" and temp_save_path is not None:
            valid_data = load_data(valid_filepath)
            valid_generator = CmtDataGenerator(valid_data, batch_size, self.tokenizer)
            evaluator = Evaluator(self.model, valid_generator, temp_save_path)
            callbacks = [evaluator]

        AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')

        self.model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=AdamLR(learning_rate=learning_rate, lr_schedule={
                1000: 1,
                2000: 0.1
            }),
            metrics=['accuracy'],
        )

        self.model.fit(
            train_generator.forfit(),
            steps_per_epoch=len(train_generator),
            epochs=epochs,
            callbacks=callbacks
        )

        if callbacks is not None:
            self.model.load_weights(temp_save_path)

    def clean_data(self, input):
        at_pattern = re.compile("//@.*?:")
        url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

        # remove @-mention prefixes (//@xxx:) and URLs
        input = re.sub(at_pattern, "", input)
        input = re.sub(url_pattern, "", input)
        # strip leading and trailing whitespace
        return input.strip()

    def predict(self, input):
        tids, sids = self.encode(self.clean_data(input), 128)
        y_pred = self.model.predict([np.array([tids]),
                                     np.array([sids])]).argmax(axis=1)[0]
        return y_pred, self.label_dict[y_pred]

    def encode(self, input, maxlen=None):
        return self.tokenizer.encode(input, maxlen=maxlen)

    def load_weights(self, param_path):
        self.model.load_weights(param_path)

    def save_model(self, savepath):
        self.model.save(savepath)
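
# End-to-end usage sketch (hedged): the file names are placeholders;
# model_para_paths is assumed to be [config_path, checkpoint_path, vocab_path]
# as indexed in __init__.
#
#   clf = CmtClassifier('bert',
#                       ['bert_config.json', 'bert_model.ckpt', 'vocab.txt'],
#                       'labels.csv', origin=True)
#   clf.fit('train.csv', 'valid.csv', 'best.weights')
#   print(clf.predict('这家店的服务态度真的很好!'))
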
Example #13
class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq解码器
    """
    def __init__(self, model_path, max_len=32, seed=1):
        # super().__init__()
        setup_seed(seed)
        self.config_path = os.path.join(model_path, "bert_config.json")
        self.checkpoint_path = os.path.join(model_path, "bert_model.ckpt")
        self.dict_path = os.path.join(model_path, "vocab.txt")
        self.max_len = max_len
        self.tokenizer = Tokenizer(self.dict_path, do_lower_case=True)
        self.bert = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            with_pool='linear',
            application='unilm',
            return_keras_model=False,
        )
        self.encoder = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[0])
        self.seq2seq = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[1])
        super().__init__(start_id=None,
                         end_id=self.tokenizer._token_end_id,
                         maxlen=self.max_len)

    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
        return self.seq2seq.predict([token_ids, segment_ids])[:, -1]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = self.tokenizer.encode(text,
                                                       max_length=self.max_len)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)
        return [self.tokenizer.decode(ids) for ids in output_ids]

    def gen_synonyms(self, text, n=100, k=20, threshold=0.75):
        """Generate n sentences similar to text, then return the k most similar ones.
        Approach: generate candidates with seq2seq, then compute similarity with the encoder and sort.
        """
        r = self.generate(text, n)
        r = [i for i in set(r) if i != text]
        r = [text] + r
        X, S = [], []
        for t in r:
            x, s = self.tokenizer.encode(t)
            X.append(x)
            S.append(s)
        X = sequence_padding(X)
        S = sequence_padding(S)
        Z = self.encoder.predict([X, S])
        Z /= (Z**2).sum(axis=1, keepdims=True)**0.5
        scores = np.dot(Z[1:], Z[0])
        argsort = scores.argsort()
        scores = scores.tolist()
        # print(scores.shape)
        # return [(r[i + 1], scores[i]) for i in argsort[::-1][:k] if scores[i] > threshold]
        return [(r[i + 1], scores[i]) for i in argsort[::-1][:k]]
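
# Usage sketch (hedged): model_path is assumed to contain a SimBERT-style
# checkpoint (bert_config.json / bert_model.ckpt / vocab.txt); the query string
# is a placeholder.
#
#   gen = SynonymsGenerator('./chinese_simbert_L-12_H-768_A-12')
#   print(gen.gen_synonyms('怎么开通花呗', n=50, k=10))
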
Example #14
# build and load the model
bert = build_transformer_model(
    config_path,
    checkpoint_path,
    with_pool='linear',
    application='unilm',
    return_keras_model=False,
)

encoder = keras.models.Model(bert.model.inputs, bert.model.outputs[0])
seq2seq = keras.models.Model(bert.model.inputs, bert.model.outputs[1])

ques = ['姚明的女儿', '姚明父亲']
X, S = [], []
for que in ques:
    x, s = tokenizer.encode(que)
    X.append(x)
    S.append(s)
X = sequence_padding(X)
S = sequence_padding(S)
with graph.as_default():
    Z = encoder.predict([X, S])


class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq解码器
    """
    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, step):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
Example #15
    token_dict, keep_tokens, compound_tokens = json.load(
        open(seq2seq_config_json))
else:
    # load and simplify the vocabulary
    token_dict, keep_tokens = load_vocab(
        dict_path=nezha_dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    pure_tokenizer = Tokenizer(token_dict.copy(), do_lower_case=True)
    user_dict = []
    for w in load_user_dict(user_dict_path) + load_user_dict(user_dict_path_2):
        if w not in token_dict:
            token_dict[w] = len(token_dict)
            user_dict.append(w)
    compound_tokens = [pure_tokenizer.encode(w)[0][1:-1] for w in user_dict]
    json.dump([token_dict, keep_tokens, compound_tokens],
              open(seq2seq_config_json, 'w'))

tokenizer = Tokenizer(token_dict,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))


def generate_copy_labels(source, target):
    """构建copy机制对应的label
    """
    mapping = longest_common_subsequence(source, target)[1]
    source_labels = [0] * len(source)
    target_labels = [0] * len(target)
    i0, j0 = -2, -2
Example #16
    # layer_name = 'Transformer-9-FeedForward-Norm'  # name of the layer to extract
    # intermediate_layer_model = Model(inputs=model.input,
    #                              outputs=model.get_layer(layer_name).output)  # new model exposing that layer
    for layers in model.layers:
        print(layers.name)
    maxlen = 70

    # read and preprocess the data
    f1 = 'D:/cluster/data/train.json'
    res = load_data(f1)
    output = []
    print('start extracting')
    # obtain sentence vectors according to the chosen feature-extraction method
    for r in res:
        token_ids, segment_ids = tokenizer.encode(r, max_length=maxlen)

        if vector_name == 'cls':
            cls_vector = model.predict(
                [np.array([token_ids]),
                 np.array([segment_ids])])[0][0]
            output.append(cls_vector)
        elif vector_name == 'mean':
            new = []
            vector = model.predict(
                [np.array([token_ids]),
                 np.array([segment_ids])])[0]
            for i in range(768):
                temp = 0
                for j in range(len(vector)):
                    temp += vector[j][i]
#! -*- coding: utf-8 -*-
# Sanity-check the code: MLM

from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path,
                                with_mlm=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术" ("technology")
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# use the MLM head to predict the masked tokens
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
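
# Optional extension (hedged sketch): inspect the top-5 candidates for each of
# the two masked positions instead of only the argmax.
#
#   top5 = probas[3:5].argsort(axis=1)[:, ::-1][:, :5]
#   for row in top5:
#       print(tokenizer.decode(row))
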
class ReextractBertTrainHandler():
    def __init__(self, params, Train=False):
        self.bert_config_path = model_root_path + "chinese_L-12_H-768_A-12/bert_config.json"
        self.bert_checkpoint_path = model_root_path + "chinese_L-12_H-768_A-12/bert_model.ckpt"
        self.bert_vocab_path = model_root_path + "chinese_L-12_H-768_A-12/vocab.txt"
        self.tokenizer = Tokenizer(self.bert_vocab_path, do_lower_case=True)
        self.model_path = model_root_path + "best_model.weights"
        self.params_path = model_root_path + 'params.json'
        gpu_id = params.get("gpu_id", None)
        self._set_gpu_id(gpu_id)  # set the GPU id used for training
        self.memory_fraction = params.get('memory_fraction')
        if Train:
            self.train_data_file_path = params.get('train_data_path')
            self.valid_data_file_path = params.get('valid_data_path')
            self.maxlen = params.get('maxlen', 128)
            self.batch_size = params.get('batch_size', 32)
            self.epoch = params.get('epoch')
            self.data_process()
        else:
            load_params = json.load(open(self.params_path, encoding='utf-8'))
            self.maxlen = load_params.get('maxlen')
            self.num_classes = load_params.get('num_classes')
            self.p2s_dict = load_params.get('p2s_dict')
            self.i2p_dict = load_params.get('i2p_dict')
            self.p2o_dict = load_params.get('p2o_dict')
        self.build_model()
        if not Train:
            self.load_model()

    def _set_gpu_id(self, gpu_id):
        if gpu_id:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    def data_process(self):
        self.train_data, self.valid_data, self.p2s_dict, self.p2o_dict, self.i2p_dict, self.p2i_dict = data_process(
            self.train_data_file_path, self.valid_data_file_path, self.maxlen, self.params_path)
        self.num_classes = len(self.i2p_dict)
        self.train_generator = Data_Generator(self.train_data, self.batch_size, self.tokenizer, self.p2i_dict,
                                              self.maxlen)

    def extrac_subject(self, inputs):
        """根据subject_ids从output中取出subject的向量表征
        """
        output, subject_ids = inputs
        subject_ids = K.cast(subject_ids, 'int32')
        start = batch_gather(output, subject_ids[:, :1])
        end = batch_gather(output, subject_ids[:, 1:])
        subject = K.concatenate([start, end], 2)
        return subject[:, 0]

    def build_model(self):
        import tensorflow as tf
        from keras.backend.tensorflow_backend import set_session
        config = tf.ConfigProto()
        config.gpu_options.allocator_type = 'BFC'  # A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc.
        if self.memory_fraction:
            config.gpu_options.per_process_gpu_memory_fraction = self.memory_fraction
            config.gpu_options.allow_growth = False
        else:
            config.gpu_options.allow_growth = True
        set_session(tf.Session(config=config))

        # additional inputs
        subject_labels = Input(shape=(None, 2), name='Subject-Labels')
        subject_ids = Input(shape=(2,), name='Subject-Ids')
        object_labels = Input(shape=(None, self.num_classes, 2), name='Object-Labels')
        # load the pretrained model
        bert = build_transformer_model(
            config_path=self.bert_config_path,
            checkpoint_path=self.bert_checkpoint_path,
            return_keras_model=False,
        )
        # predict the subject
        output = Dense(units=2,
                       activation='sigmoid',
                       kernel_initializer=bert.initializer)(bert.model.output)
        subject_preds = Lambda(lambda x: x ** 2)(output)
        self.subject_model = Model(bert.model.inputs, subject_preds)
        # feed in the subject and predict the object
        # fuse the subject into the object prediction via Conditional Layer Normalization
        output = bert.model.layers[-2].get_output_at(-1)
        subject = Lambda(self.extrac_subject)([output, subject_ids])
        output = LayerNormalization(conditional=True)([output, subject])
        output = Dense(units=self.num_classes * 2,
                       activation='sigmoid',
                       kernel_initializer=bert.initializer)(output)
        output = Lambda(lambda x: x ** 4)(output)
        object_preds = Reshape((-1, self.num_classes, 2))(output)
        self.object_model = Model(bert.model.inputs + [subject_ids], object_preds)
        # full training model
        self.train_model = Model(bert.model.inputs + [subject_labels, subject_ids, object_labels],
                                 [subject_preds, object_preds])
        mask = bert.model.get_layer('Embedding-Token').output_mask
        mask = K.cast(mask, K.floatx())
        subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
        subject_loss = K.mean(subject_loss, 2)
        subject_loss = K.sum(subject_loss * mask) / K.sum(mask)
        object_loss = K.binary_crossentropy(object_labels, object_preds)
        object_loss = K.sum(K.mean(object_loss, 3), 2)
        object_loss = K.sum(object_loss * mask) / K.sum(mask)
        self.train_model.add_loss(subject_loss + object_loss)
        AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
        self.optimizer = AdamEMA(lr=1e-4)
        self.train_model.compile(optimizer=self.optimizer)

    def load_model(self):
        self.train_model.load_weights(self.model_path)

    def predict(self, text):
        """
        Extract the (subject, predicate, object) triples contained in the input text.
        text: str, e.g. "<离开>是由张宇谱曲,演唱"
        """
        tokens = self.tokenizer.tokenize(text, max_length=self.maxlen)
        token_ids, segment_ids = self.tokenizer.encode(text, max_length=self.maxlen)
        # extract subjects
        subject_preds = self.subject_model.predict([[token_ids], [segment_ids]])
        start = np.where(subject_preds[0, :, 0] > 0.6)[0]
        end = np.where(subject_preds[0, :, 1] > 0.5)[0]
        subjects = []
        for i in start:
            j = end[end >= i]
            if len(j) > 0:
                j = j[0]
                subjects.append((i, j))
        if subjects:
            spoes = []
            token_ids = np.repeat([token_ids], len(subjects), 0)
            segment_ids = np.repeat([segment_ids], len(subjects), 0)
            subjects = np.array(subjects)
            # feed in the subjects and extract objects and predicates
            object_preds = self.object_model.predict([token_ids, segment_ids, subjects])
            for subject, object_pred in zip(subjects, object_preds):
                start = np.where(object_pred[:, :, 0] > 0.6)
                end = np.where(object_pred[:, :, 1] > 0.5)
                for _start, predicate1 in zip(*start):
                    for _end, predicate2 in zip(*end):
                        if _start <= _end and predicate1 == predicate2:
                            spoes.append((subject, predicate1, (_start, _end)))
                            break
            return [
                (
                    [self.tokenizer.decode(token_ids[0, s[0]:s[1] + 1], tokens[s[0]:s[1] + 1]),
                     self.p2s_dict[self.i2p_dict[p]]],
                    self.i2p_dict[p],
                    [self.tokenizer.decode(token_ids[0, o[0]:o[1] + 1], tokens[o[0]:o[1] + 1]),
                     self.p2o_dict[self.i2p_dict[p]]],
                    (s[0], s[1] + 1),
                    (o[0], o[1] + 1)
                ) for s, p, o in spoes
            ]
        else:
            return []

    def train(self):
        evaluator = Evaluator(self.train_model, self.model_path, self.tokenizer, self.predict, self.optimizer,
                              self.valid_data)

        self.train_model.fit_generator(self.train_generator.forfit(),
                                       steps_per_epoch=len(self.train_generator),
                                       epochs=self.epoch,
                                       callbacks=[evaluator])
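
# Usage sketch (hedged): model_root_path, Evaluator and Data_Generator are
# assumed to be defined elsewhere in the module; the parameter keys below are
# the ones read in __init__, and the values are placeholders.
#
#   trainer = ReextractBertTrainHandler({'gpu_id': 0,
#                                        'train_data_path': 'train.json',
#                                        'valid_data_path': 'dev.json',
#                                        'epoch': 10}, Train=True)
#   trainer.train()
#
#   predictor = ReextractBertTrainHandler({})
#   print(predictor.predict('<离开>是由张宇谱曲,演唱'))
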
Example #19
class XlnetEmbedding(BaseEmbedding):
    def __init__(self, hyper_parameters):
        self.layer_indexes = hyper_parameters['embedding'].get('layer_indexes', [24])
        self.xlnet_embed = hyper_parameters['embedding'].get('xlnet_embed', {})
        self.batch_size = hyper_parameters['model'].get('batch_size', 2)
        super().__init__(hyper_parameters)

    def build_config(self, path_config: str=None):
        # reader config of bert
        self.configs = {}
        if path_config is not None:
            self.configs.update(json.load(open(path_config)))

    def build(self):
        from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects
        from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI

        self.embedding_type = 'xlnet'
        self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt')
        self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
        self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')

        self.attention_type = self.xlnet_embed.get('attention_type', 'bi')  # or 'uni'
        self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len = self.xlnet_embed.get('memory_len', 0)
        self.target_len = self.xlnet_embed.get('target_len', 5)
        print('load xlnet model start!')
        # load the model
        model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path,
                                                   attention_type=self.attention_type,
                                                   in_train_phase=self.trainable,
                                                   config_path=self.config_path,
                                                   memory_len=self.memory_len,
                                                   target_len=self.target_len,
                                                   batch_size=self.batch_size,
                                                   mask_index=0)
        #
        set_custom_objects()
        self.build_config(self.config_path)
        # load the SentencePiece vocabulary
        self.tokenizer = Tokenizer(self.spiece_model)
        # # inspect the layers when debugging
        # self.model_layers = model.layers
        # len_layers = self.model_layers.__len__()
        # print(len_layers)
        num_hidden_layers = self.configs.get("n_layer", 12)

        layer_real = [i for i in range(num_hidden_layers)] + [-i for i in range(num_hidden_layers)]
        # quick sanity check on the requested layer indexes
        self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
        output_layer = "FeedForward-Normal-{0}"
        layer_dict = [model.get_layer(output_layer.format(i + 1)).get_output_at(node_index=0)
                          for i in range(num_hidden_layers)]

        # output the model itself
        if len(self.layer_indexes) == 0:
            encoder_layer = model.output
        # if only one layer is requested, fall back to the second-to-last layer when the index is invalid
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in layer_real:
                encoder_layer = layer_dict[self.layer_indexes[0]]
            else:
                encoder_layer = layer_dict[-1]
        # otherwise iterate over the requested layers and sum their outputs; shape: 768 * number of layers
        else:
            # layer_indexes must be [0, 1, 2,3,......24]
            all_layers = [layer_dict[lay] if lay in layer_real
                          else layer_dict[-1]  # fall back to the last layer if an invalid index is given
                          for lay in self.layer_indexes]
            print(self.layer_indexes)
            print(all_layers)
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
            print(encoder_layer.shape)

            # def xlnet_concat(x):
            #     x_concat = K.concatenate(x, axis=1)
            #     return x_concat
            # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers)

        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(self.input, self.output)
        print("load KerasXlnetEmbedding end")
        model.summary(132)

        self.embedding_size = self.model.output_shape[-1]
        self.vocab_size = len(self.tokenizer.sp)

    def sentence2idx(self, text, second_text=None):
        # text = extract_chinese(str(text).upper())
        text = str(text).upper()
        tokens = self.tokenizer.encode(text)
        tokens = tokens + [0] * (self.target_len - len(tokens)) \
                               if len(tokens) < self.target_len \
                               else tokens[0:self.target_len]
        token_input = np.expand_dims(np.array(tokens), axis=0)
        segment_input = np.zeros_like(token_input)
        memory_length_input = np.zeros((1, 1)) # np.array([[self.memory_len]]) # np.zeros((1, 1))
        masks = [1] * len(tokens) + ([0] * (self.target_len - len(tokens))
                                                   if len(tokens) < self.target_len else [])
        mask_input = np.expand_dims(np.array(masks), axis=0)
        if self.trainable:
            return [token_input, segment_input, memory_length_input, mask_input]
        else:
            return [token_input, segment_input, memory_length_input]
Example #20
class BertEmbedding(BaseEmbedding):
    def __init__(self, hyper_parameters):
        self.layer_indexes = hyper_parameters['embedding'].get('layer_indexes', [12])
        super().__init__(hyper_parameters)

    def build(self):
        import keras_bert

        self.embedding_type = 'bert'
        config_path = os.path.join(self.corpus_path, 'bert_config.json')
        check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        print('load bert model start!')
        model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                              check_point_path,
                                                              seq_len=self.len_max,
                                                              trainable=self.trainable)
        print('load bert model end!')
        # bert model all layers
        layer_dict = [6]
        layer_0 = 7
        for i in range(12):
            layer_0 = layer_0 + 8
            layer_dict.append(layer_0)
        print(layer_dict)
        # output the model itself
        if len(self.layer_indexes) == 0:
            encoder_layer = model.output
        # if only one layer is requested, take just that layer's output; fall back to the last layer when the index is invalid
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in [i + 1 for i in range(13)]:
                encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-1]).output
        # otherwise iterate over the requested layers and combine their outputs (summed by Add below); shape: 768 * number of layers
        else:
            # layer_indexes must be [1,2,3,......12]
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [model.get_layer(index=layer_dict[lay - 1]).output if lay in [i + 1 for i in range(13)]
                          else model.get_layer(index=layer_dict[-1]).output  # fall back to the last layer if an invalid index is given
                          for lay in self.layer_indexes]
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(self.input, self.output)

        self.embedding_size = self.model.output_shape[-1]
        # word2idx = {}
        # with open(dict_path, 'r', encoding='utf-8') as f:
        #     words = f.read().splitlines()
        # for idx, word in enumerate(words):
        #     word2idx[word] = idx
        # for key, value in self.ot_dict.items():
        #     word2idx[key] = value
        #
        # self.token2idx = word2idx

        # reader tokenizer
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.vocab_size = len(self.token_dict)
        self.tokenizer = keras_bert.Tokenizer(self.token_dict)

    def build_keras4bert(self):
        import bert4keras
        from bert4keras.models import build_transformer_model
        from bert4keras.tokenizers import Tokenizer,load_vocab
        import os
        self.embedding_type = 'bert'
        config_path = os.path.join(self.corpus_path, 'bert_config.json')
        checkpoint_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        self.model = bert4keras.models.build_transformer_model(config_path=config_path,
                                                               checkpoint_path=checkpoint_path)

        # load and simplify the vocabulary, then build the tokenizer
        self.token_dict, keep_tokens = load_vocab(
            dict_path=dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.vocab_size = len(self.token_dict)
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)

    def sentence2idx(self, text, second_text=None):
        text = extract_chinese(str(text).upper())
        input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max)
        return [input_id, input_type_id]
class Bert4KearsBase(BaseModel):
    def __init__(self, config):
        # config_path,checkpoint_path,dict_path
        '''
        config = {"config_path":,"checkpoint_path":,"save_dir":,"dict_path":}
        '''
        super().__init__(config)
        init_dir(self.save_dir)
        self.tokenizer = Tokenizer(self.config['dict_path'],
                                   do_lower_case=True)
        self.graph = tf.get_default_graph()
        self.model_name = None
        self.best_weights_path = None
        self.model_path = None

    def optimizer(self):
        AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
        _optimizer = AdamLR(lr=1e-5, lr_schedule={1000: 1, 2000: 0.1})
        return _optimizer

    def _init_model(self):
        # load the pretrained model
        bert = build_transformer_model(
            config_path=self.config['config_path'],
            checkpoint_path=self.config['checkpoint_path'],
            model=self.model_name,
            return_keras_model=False,
        )
        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        output = Dense(units=self.num_labels,
                       activation='softmax',
                       kernel_initializer=bert.initializer)(output)
        model = keras.models.Model(bert.model.input, output)
        return model

    def _load_data(self, path):
        df = load_df(path)
        D = []
        for text, label in zip(df['text'], df['label']):
            D.append((str(text), int(label)))
        return D

    def process_data(self, train_path, dev_path, test_path):
        train_data = self._load_data(train_path)
        dev_data = self._load_data(dev_path)
        test_data = self._load_data(test_path)

        train_generator = data_generator(train_data, self.tokenizer,
                                         self.max_len, self.batch_size)
        dev_generator = data_generator(dev_data, self.tokenizer, self.max_len,
                                       self.batch_size)
        test_generator = data_generator(test_data, self.tokenizer,
                                        self.max_len, self.batch_size)

        return train_generator, dev_generator, test_generator

    def train(self, train_path, dev_path, test_path):
        self.set_seed(self.seed)  # for reproducibility
        train_generator, dev_generator, test_generator = self.process_data(
            train_path, dev_path, test_path)
        # load model
        with self.graph.as_default():
            self.model = self._init_model()
            _optimizer = self.optimizer()
            self.model.compile(
                loss='sparse_categorical_crossentropy',
                optimizer=_optimizer,
                metrics=['accuracy'],
            )
            # start training
            early_stopping_monitor = EarlyStopping(patience=self.patience,
                                                   verbose=1)
            checkpoint = ModelCheckpoint(self.best_weights_path,
                                         monitor='val_loss',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='auto',
                                         period=1)
            callbacks = [early_stopping_monitor, checkpoint]

            self.model.fit_generator(train_generator.forfit(),
                                     steps_per_epoch=len(train_generator),
                                     validation_data=dev_generator.forfit(),
                                     validation_steps=len(dev_generator),
                                     epochs=self.epochs,
                                     callbacks=callbacks)

            self.model.load_weights(self.best_weights_path)
            self.model.save(self.model_path)
        model_report = self.evaluate(test_path)
        return model_report

    def load_model(self, model_path):
        self.model = keras.models.load_model(model_path,
                                             custom_objects=custom_objects)

    def demo(self, text):
        text_list = [text]
        pred_list = self.demo_text_list(text_list)
        pred = pred_list[0]
        return pred

    def demo_text_list(self, text_list):
        batch_token_ids, batch_segment_ids = [], []
        for text in text_list:
            token_ids, segment_ids = self.tokenizer.encode(
                text, max_length=self.max_len)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
        batch_token_ids = sequence_padding(batch_token_ids)
        batch_segment_ids = sequence_padding(batch_segment_ids)
        with self.graph.as_default():
            preds = self.model.predict([batch_token_ids, batch_segment_ids])
        if self.num_labels == 2:
            pred_list = preds[:, 1]
        else:
            pred_list = np.argmax(preds, axis=1).flatten()
        return pred_list

    def release(self):
        # K.clear_session()
        del self.model
        del self.graph
        del self.tokenizer
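
# Subclass sketch (hedged): Bert4KearsBase leaves model_name and the weight
# paths to subclasses, so a concrete classifier might look like the following
# (save_dir is assumed to come from the config via BaseModel).
#
#   class Bert4KearsBert(Bert4KearsBase):
#       def __init__(self, config):
#           super().__init__(config)
#           self.model_name = 'bert'
#           self.best_weights_path = os.path.join(self.save_dir, 'best.weights')
#           self.model_path = os.path.join(self.save_dir, 'model.h5')
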
Example #22
import jieba
import tensorflow as tf
import torch

from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from roformer import RoFormerModel, RoFormerTokenizer

jieba.initialize()
config_path = 'E:/BaiduNetdiskDownload/chinese_roformer_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'E:/BaiduNetdiskDownload/chinese_roformer_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'E:/BaiduNetdiskDownload/chinese_roformer_L-12_H-768_A-12/vocab.txt'
# converted_ckpt_path = "pretrained_models/chinese_roformer_base"
converted_ckpt_path = "junnyu/roformer_chinese_base"  #https://huggingface.co/junnyu/roformer_chinese_base
tokenizer = Tokenizer(dict_path,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))
text = "这里基本保留了唐宋遗留下来的坊巷格局和大量明清古建筑,其中各级文保单位29处,被誉为“里坊制度的活化石”“明清建筑博物馆”!"

#bert4keras
inputs = tokenizer.encode(text)
tf_inputs = [
    tf.convert_to_tensor(inputs[0])[None],
    tf.convert_to_tensor(inputs[1])[None]
]
model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                model='roformer')
bert4keras_outputs = torch.tensor(model(tf_inputs).numpy())
# pt
roformer_tokenizer = RoFormerTokenizer.from_pretrained(converted_ckpt_path)
pt_model = RoFormerModel.from_pretrained(converted_ckpt_path,
                                         add_pooling_layer=False)
pt_inputs = roformer_tokenizer(text, return_tensors="pt")
with torch.no_grad():
    pt_outputs = pt_model(**pt_inputs).last_hidden_state
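
# Consistency check (hedged sketch): with jieba-based pre-tokenization on both
# sides the two token sequences are expected to line up, so the outputs should
# agree up to numerical tolerance.
print("max difference:", (bert4keras_outputs - pt_outputs).abs().max().item())
print("allclose (atol=1e-4):", torch.allclose(bert4keras_outputs, pt_outputs, atol=1e-4))
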
Example #23
class XlnetEmbedding(BaseEmbedding):
    def __init__(self, hyper_parameters):
        self.layer_indexes = hyper_parameters['embedding'].get(
            'layer_indexes', [24])
        self.xlnet_embed = hyper_parameters['embedding'].get('xlnet_embed', {})
        self.batch_size = hyper_parameters['model'].get('batch_size', 2)
        super().__init__(hyper_parameters)

    def build(self):
        from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects
        from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI

        self.embedding_type = 'xlnet'
        self.checkpoint_path = os.path.join(self.corpus_path,
                                            'xlnet_model.ckpt')
        self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
        self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')

        self.attention_type = self.xlnet_embed.get('attention_type',
                                                   'bi')  # or 'uni'
        self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len = self.xlnet_embed.get('memory_len', 0)
        self.target_len = self.xlnet_embed.get('target_len', 5)
        print('load xlnet model start!')
        # load the model
        model = load_trained_model_from_checkpoint(
            checkpoint_path=self.checkpoint_path,
            attention_type=self.attention_type,
            in_train_phase=self.trainable,
            config_path=self.config_path,
            memory_len=self.memory_len,
            target_len=self.target_len,
            batch_size=self.batch_size,
            mask_index=0)
        #
        set_custom_objects()
        # load the SentencePiece vocabulary
        self.tokenizer = Tokenizer(self.spiece_model)
        # inspect the layers when debugging
        self.model_layers = model.layers
        len_layers = self.model_layers.__len__()
        print(len_layers)

        layer_real = [i for i in range(25)] + [-i for i in range(25)]
        # quick sanity check on the requested layer indexes
        self.layer_indexes = [
            i if i in layer_real else -2 for i in self.layer_indexes
        ]

        len_couche = int((len_layers - 6) / 10)
        # 246 layers in total
        # 10 layers per block (MultiHeadAttention, Dropout, Add, LayerNormalization); the first part is the 9 input and embedding layers
        # 24 blocks in total
        layer_dict = []
        layer_0 = 7
        for i in range(len_couche):
            layer_0 = layer_0 + 10
            layer_dict.append(layer_0)
        layer_dict.append(247)
        # test get_output_at
        # def get_number(index):
        #     try:
        #        model_node = model.get_output_at(node_index=index)
        #        gg = 0
        #     except:
        #         print('node index wrong!')
        #         print(index)
        # list_index = [i for i in range(25)] + [-i for i in range(25)]
        # for li in list_index:
        #     get_number(li)

        # output the model itself
        if len(self.layer_indexes) == 0:
            encoder_layer = model.output
        # if only one layer is requested, fall back to the second-to-last layer when the index is invalid
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in layer_real:
                encoder_layer = model.get_layer(
                    index=layer_dict[self.layer_indexes[0]]).get_output_at(
                        node_index=0)
            else:
                encoder_layer = model.get_layer(
                    index=layer_dict[-1]).get_output_at(node_index=0)
        # otherwise iterate over the requested layers and sum their outputs; shape: 768 * number of layers
        else:
            # layer_indexes must be [0, 1, 2,3,......24]
            all_layers = [
                model.get_layer(index=layer_dict[lay]).get_output_at(
                    node_index=0) if lay in layer_real else model.get_layer(
                        index=layer_dict[-1]).get_output_at(
                            node_index=0)  # fall back to the last layer if an invalid index is given
                for lay in self.layer_indexes
            ]
            print(self.layer_indexes)
            print(all_layers)
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
            print(encoder_layer.shape)

            # def xlnet_concat(x):
            #     x_concat = K.concatenate(x, axis=1)
            #     return x_concat
            # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers)

        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(self.input, self.output)
        print("load KerasXlnetEmbedding end")
        model.summary(132)

        self.embedding_size = self.model.output_shape[-1]
        self.vocab_size = len(self.tokenizer.sp)

    def sentence2idx(self, text, second_text=None):
        # text = extract_chinese(str(text).upper())
        text = str(text).upper()
        tokens = self.tokenizer.encode(text)
        tokens = tokens + [0] * (self.target_len - len(tokens)) \
                               if len(tokens) < self.target_len \
                               else tokens[0:self.target_len]
        token_input = np.expand_dims(np.array(tokens), axis=0)
        segment_input = np.zeros_like(token_input)
        memory_length_input = np.zeros(
            (1, 1))  # np.array([[self.memory_len]]) # np.zeros((1, 1))
        masks = [1] * len(tokens) + ([0] * (self.target_len - len(tokens))
                                     if len(tokens) < self.target_len else [])
        mask_input = np.expand_dims(np.array(masks), axis=0)
        if self.trainable:
            return [
                token_input, segment_input, memory_length_input, mask_input
            ]
        else:
            return [token_input, segment_input, memory_length_input]
Example #24
class MRCTrainer():
    def __init__(self, train_param, model_save_path):
        self.lr = train_param['learning_rate']
        self.max_p_len = train_param['max_p_len']
        self.max_q_len = train_param['max_q_len']
        self.max_a_len = train_param['max_a_len']
        self.epochs = train_param['epochs']
        self.pretrain_type = train_param['pretrain_type']
        self.batch_size = train_param['batch_size']

        self.config_path = train_param['config_path']
        self.checkpoint_path = train_param['checkpoint_path']
        self.dict_path = train_param['dict_path']
        self.model_config = train_param
        self.model_config['model_save_path'] = model_save_path
        self.model_save_path = model_save_path

        self.buildmodel()

    def masked_cross_entropy(self, y_true, y_pred):
        y_true = K.reshape(y_true, [K.shape(y_true)[0], -1])
        y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
        cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
        cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

        return cross_entropy

    def buildmodel(self):
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)

        if self.pretrain_type == 'albert':
            model = build_transformer_model(
                self.config_path,
                self.checkpoint_path,
                model='albert',
                with_mlm=True,
                keep_tokens=self.keep_tokens,
            )
        elif self.pretrain_type == 'bert':
            model = build_transformer_model(
                self.config_path,
                self.checkpoint_path,
                model='bert',
                with_mlm=True,
                keep_tokens=self.keep_tokens,
            )
        output = Lambda(lambda x: x[:, 1:self.max_a_len + 1])(model.output)
        #print(output.shape)
        self.model = Model(model.input, output)
        self.model.compile(loss=self.masked_cross_entropy,
                           optimizer=Adam(self.lr))
        self.model.summary()

    def fit(self, train_data):

        params_file = os.path.join(self.model_save_path, 'config.json')
        with open(params_file, 'w', encoding='utf-8') as json_file:
            json.dump(self.model_config,
                      json_file,
                      indent=4,
                      ensure_ascii=False)

        evaluator = Evaluator(self.model, self.model_save_path)
        train_generator = data_generator(train_data, self.tokenizer,
                                         self.batch_size, self.max_a_len,
                                         self.max_q_len, self.max_p_len)

        self.model.fit_generator(train_generator.forfit(),
                                 steps_per_epoch=len(train_generator),
                                 epochs=self.epochs,
                                 callbacks=[evaluator])

    def get_ngram_set(self, x, n):
        """生成ngram合集,返回结果格式是:
        {(n-1)-gram: set([n-gram的第n个字集合])}
        """
        result = {}
        for i in range(len(x) - n + 1):
            k = tuple(x[i:i + n])
            if k[:-1] not in result:
                result[k[:-1]] = set()
            result[k[:-1]].add(k[-1])
        return result

    def gen_answer(self, question, passage):

        token_ids, segment_ids = [], []
        passage = re.sub(u' |、|;|,', ',', passage)
        p_token_ids, _ = self.tokenizer.encode(passage,
                                               max_length=self.max_p_len + 1)
        q_token_ids, _ = self.tokenizer.encode(question,
                                               max_length=self.max_q_len + 1)
        token_ids = [self.tokenizer._token_start_id]
        token_ids += [self.tokenizer._token_mask_id] * self.max_a_len
        token_ids += [self.tokenizer._token_end_id]
        token_ids += q_token_ids[1:] + p_token_ids[1:]
        segment_ids = [0] * len(token_ids)
        token_ids = sequence_padding([token_ids])
        segment_ids = sequence_padding([segment_ids])
        probas = self.model.predict([token_ids, segment_ids])[0]
        results = {}
        a, score = tuple(), 0.
        for i in range(self.max_a_len):
            # candidate tokens are restricted to n-grams occurring in the passage
            idxs = list(self.get_ngram_set(p_token_ids[1:], i + 1)[a])
            if self.tokenizer._token_end_id not in idxs:
                idxs.append(self.tokenizer._token_end_id)
            pi = np.zeros_like(probas[i])
            pi[idxs] = probas[i, idxs]
            a = a + (pi.argmax(), )
            score += pi.max()
            if a[-1] == self.tokenizer._token_end_id:
                break
        score = score / (i + 1)
        a = self.tokenizer.decode(a)
        if a:
            results[a] = results.get(a, []) + [score]
        results = {
            k: (np.array(v)**2).sum() / (sum(v) + 1)
            for k, v in results.items()
        }
        return results
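
    # Hypothetical usage sketch (the instance name and texts are illustrative):
    #   qa.gen_answer(u'什么是第一生产力?', u'科学技术是第一生产力。')
    #   -> {decoded_answer: aggregated_score, ...}
    # Each candidate answer's scores are aggregated as sum(score^2) / (sum(score) + 1),
    # which favours answers decoded with consistently high confidence.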

    def evalue(self):
        result = []
        return result
Example #25
# Sanity-check the code: feature extraction

from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
import numpy as np

config_path = '../models/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../models/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../models/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'阅读理解')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636
    0.39056838]
  ...
"""

# --- Gibbs sampling with the MLM head ---
# The model above was built without the MLM output, so rebuild it with
# with_mlm=True before sampling; tqdm and to_array are also needed below.
from tqdm import tqdm

from bert4keras.snippets import to_array

model = build_transformer_model(config_path, checkpoint_path, with_mlm=True)

sentences = []
init_sent = u'科学技术是第一生产力。'  # a seed sentence, or None
minlen, maxlen = 8, 32
steps = 10000
converged_steps = 1000
vocab_size = tokenizer._vocab_size

if init_sent is None:
    length = np.random.randint(minlen, maxlen + 1)
    tokens = ['[CLS]'] + ['[MASK]'] * length + ['[SEP]']
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
else:
    token_ids, segment_ids = tokenizer.encode(init_sent)
    length = len(token_ids) - 2

for _ in tqdm(range(steps), desc='Sampling'):
    # Gibbs sampling step: randomly mask one token, then re-sample it from the MLM model.
    i = np.random.choice(length) + 1
    token_ids[i] = tokenizer._token_mask_id
    probas = model.predict(to_array([token_ids], [segment_ids]))[0, i]
    token = np.random.choice(vocab_size, p=probas)
    token_ids[i] = token
    sentences.append(tokenizer.decode(token_ids))

print(u'Some of the random sampling results:')
for _ in range(10):
    print(np.random.choice(sentences[converged_steps:]))
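
# Note: the first `converged_steps` samples are treated as burn-in, so only sentences
# drawn after the chain has (roughly) converged are printed above.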
Example #27
import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
token_ids, segment_ids = to_array([token_ids], [segment_ids])

print('\n ===== predicting =====\n')
print(model.predict([token_ids, segment_ids]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636
    0.39056838]
  ...
"""
Example #28
class simBERT(object):
    '''A BERT-based semantic computation engine'''
    def __init__(self,
                 config=bert_config_path,
                 checkpoint=bert_checkpoint_path,
                 dicts=bert_dict_path):

        from bert4keras.backend import keras
        from bert4keras.tokenizers import Tokenizer
        from bert4keras.snippets import sequence_padding
        from bert4keras.models import build_transformer_model

        self.config_path = config
        self.checkpoint_path = checkpoint
        self.dict_path = dicts
        self.tokenizer = Tokenizer(self.dict_path, do_lower_case=True)
        self.sequence_padding = sequence_padding
        self.bert = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            with_pool='linear',
            application='unilm',
            return_keras_model=False,
        )
        self.encoder = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[0])
        # self.seq2seq = keras.models.Model(self.bert.model.inputs,self.bert.model.outputs[1])

    def sent2vec(self, sent):
        # -------------------------------------------------
        #    description: convert a sentence (or list of sentences) to vectors
        #    param sent: str or list of str, input sentence(s)
        #    return: L2-normalized sentence vector(s)
        # -------------------------------------------------
        if isinstance(sent, list):
            X, S = [], []
            for s in sent:
                x, s = self.tokenizer.encode(s)
                X.append(x)
                S.append(s)
            X = self.sequence_padding(X)
            S = self.sequence_padding(S)
            # Z = self.encoder.predict([X,S])
        else:
            x, s = self.tokenizer.encode(sent)
            X = self.sequence_padding([x])
            S = self.sequence_padding([s])
        Z = self.encoder.predict([X, S], verbose=1)
        # Normalize the vectors so that various distances/similarities are easy to compute
        return normalize(Z)
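
    # Note: `normalize` here is assumed to be sklearn.preprocessing.normalize (L2 by
    # default), imported elsewhere in the original module; with unit-length vectors the
    # dot products in keywords() and sentence_similarity() are cosine similarities.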

    def keywords(self, token=None, text='', topn=1, with_sim=True):
        # -------------------------------------------------
        #    description: keyword matching. If token is given, return the words in
        #                 token most similar to the text; otherwise return the
        #                 keywords of the text itself.
        #    param token: list, candidate words, may be None
        #    param text: str, input text
        #    param topn: int, defaults to 1, at most the length of the token list
        #    param with_sim: bool, if True return results with similarity scores
        #    return:
        # -------------------------------------------------
        if token is not None:
            r = token + [text]
            # r = token + [c for c in cut(text) if len(c) > 1]
        else:
            token = [c for c in cut(text) if len(c) > 1]
            r = token + token
        X, S = [], []
        for t in r:
            x, s = self.tokenizer.encode(t)
            X.append(x)
            S.append(s)
        X = self.sequence_padding(X)
        S = self.sequence_padding(S)
        Z = normalize(self.encoder.predict([X, S]))
        score = np.dot(Z[len(token):], Z[:len(token)].T)
        # print(score.shape)
        if with_sim:
            return [(token[i], score[0][i]) for i in topK(score, topn)[1]]
        return np.array(token)[topK(score, topn)[1]]

    def sentence_similarity(self, sent_1, sent_2):
        # -------------------------------------------------
        #    description: sentence similarity
        #    param sent_1: str, input sentence
        #    param sent_2: str, input sentence
        #    return: cosine similarity of the two sentence vectors
        # -------------------------------------------------
        sent_vec_1 = self.sent2vec(sent_1)
        sent_vec_2 = self.sent2vec(sent_2)
        similarity = np.dot(sent_vec_1, sent_vec_2.T)
        return similarity[0][0]
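
# Hypothetical usage sketch (assumes bert_config_path / bert_checkpoint_path /
# bert_dict_path point at a valid BERT or SimBERT checkpoint, and that `cut`,
# `topK` and `normalize` are provided elsewhere in the module):
#   engine = simBERT()
#   engine.sentence_similarity(u'今天天气很好', u'今天天气不错')      # -> float
#   engine.keywords(token=[u'天气', u'心情'], text=u'今天天气很好')   # -> [(word, sim)]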
Example #29
def load_data(filename):
    """Load tab-separated (text1, text2, label) rows, e.g. the LCQMC files below."""
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text1, text2, label = l.strip().split('\t')
            D.append((text1, text2, int(label)))
    return D


# Load the datasets
train_data = load_data('datasets/lcqmc/lcqmc.train.data')
valid_data = load_data('datasets/lcqmc/lcqmc.valid.data')
test_data = load_data('datasets/lcqmc/lcqmc.test.data')

# Evaluate the similarity performance
data = valid_data
a_token_ids, b_token_ids, labels = [], [], []
texts = []

for d in data:
    token_ids = tokenizer.encode(d[0], max_length=maxlen)[0]
    a_token_ids.append(token_ids)
    token_ids = tokenizer.encode(d[1], max_length=maxlen)[0]
    b_token_ids.append(token_ids)
    labels.append(d[2])
    texts.extend(d[:2])

a_token_ids = sequence_padding(a_token_ids)
b_token_ids = sequence_padding(b_token_ids)
a_vecs = encoder.predict([a_token_ids, np.zeros_like(a_token_ids)],
                         verbose=True)
b_vecs = encoder.predict([b_token_ids, np.zeros_like(b_token_ids)],
                         verbose=True)
labels = np.array(labels)

a_vecs = a_vecs / (a_vecs**2).sum(axis=1, keepdims=True)**0.5
b_vecs = b_vecs / (b_vecs**2).sum(axis=1, keepdims=True)**0.5
sims = (a_vecs * b_vecs).sum(axis=1)  # cosine similarity for each sentence pair
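
# A minimal evaluation sketch (assumption: threshold the cosine similarity at 0.5 to
# get a binary prediction; the threshold is illustrative, not from the source):
#   acc = ((sims > 0.5).astype(int) == labels).mean()
#   print('accuracy @ 0.5:', acc)

# --- Below: a separate inference fragment for a fine-tuned binary classifier; it
# assumes `bert`, `output`, `config` and `df_test` are defined earlier in its file ---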
output = Dense(units=2,
               activation='softmax',
               kernel_initializer=bert.initializer)(output)
model = keras.models.Model(bert.model.input, output)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(config['learning_rate']),
              metrics=['sparse_categorical_accuracy'])

model.load_weights('Disaster_Rumor_Detection_best_model_1_0.weights')

# Tokenizer
tokenizer = Tokenizer(config['dict_path'], do_lower_case=True)

# Make predictions
table = []
for idx, row in df_test.iterrows():
    token_ids, seg_ids = tokenizer.encode(row['text'],
                                          maxlen=config['max_len'])
    result = model.predict([[token_ids], [seg_ids]]).argmax(axis=1)
    table.append([row['id'], result[0]])
    print('Data id {} prediction done!'.format(row['id']))
    print('Result: {}'.format(result[0]))
    print('-' * 60)

final_result = pd.DataFrame(table, columns=['id', 'target'])

if __name__ == '__main__':
    print(final_result.head())
    final_result.to_csv('mysubmission.csv', index=False)