Example #1
import numpy as np
import tensorflow as tf
from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
from bert4keras.snippets import sequence_padding  # module path assumed for this bert4keras version
# `const` (model path constants) and `ModelBase` come from the surrounding project.


class ALBertEmbedding(ModelBase):
    '''Compute sentence vectors with ALBERT.
    '''
    def __init__(self,
                 config_path=const.ALBERT_CONFIG_PATH,
                 albert_checkpoint_path=const.ALBERT_CHECKPOINT_PATH,
                 dict_path=const.ALBERT_DICT_PATH,
                 train_mode=False):
        self.session = tf.Session() 
        keras.backend.set_session(self.session)
        if train_mode:
            self.bert = build_bert_model(
                         model='albert', 
                         config_path=config_path,
                         checkpoint_path=albert_checkpoint_path,
                         with_pool=True,
                         return_keras_model=False,)
        else:
            self.bert = build_bert_model(
                         model='albert', 
                         config_path=config_path,
                         # checkpoint_path=albert_checkpoint_path,  # weights are loaded below via load_weights
                         with_pool=True,
                         return_keras_model=False,)
            self.encoder = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[0])
            self.tokenizer = Tokenizer(dict_path, do_lower_case=True) 
            self.encoder.load_weights(albert_checkpoint_path, by_name=True)
    
    def init(self, words_list=None, update=True):
        if words_list is not None:
            token_ids_list, segment_ids_list = [], []
            for words in words_list:
                token_ids, segment_ids = self.tokenizer.encode(words)
                token_ids_list.append(token_ids)
                segment_ids_list.append(segment_ids)
            token_ids_list = sequence_padding(token_ids_list)
            segment_ids_list = sequence_padding(segment_ids_list)
            self.words_list_pre = self.encoder.predict([token_ids_list, segment_ids_list])
            self.words_list_pre = self._normalize(self.words_list_pre)
        return self
    
    def _predict(self, words):
        with self.session.as_default():
            with self.session.graph.as_default():
                token_ids, segment_ids = self.tokenizer.encode(words)
                pre = self.encoder.predict([np.array([token_ids]), np.array([segment_ids])])
                pre = self._normalize(pre)
        return pre
        
    # Sentence vector: returns similarity scores against the pre-encoded sentence list
    def predict(self, words):
        with self.session.as_default():
            with self.session.graph.as_default():
                token_ids, segment_ids = self.tokenizer.encode(words)
                pre = self.encoder.predict([np.array([token_ids]), np.array([segment_ids])])
                pre = self._normalize(pre)
        return np.dot(self.words_list_pre[:], pre[0])
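A minimal usage sketch for the class above (the candidate sentences and the query are made up for illustration; model paths come from `const`, as in the constructor defaults):

# Illustrative usage only: the sentences are placeholders.
albert_emb = ALBertEmbedding()  # inference mode (train_mode=False)
albert_emb.init(words_list=['今天天气不错', '如何办理退货', '订单什么时候发货'])
scores = albert_emb.predict('退货流程是什么')  # similarity against the pre-encoded list
best_match = int(np.argmax(scores))            # index of the most similar candidate

Example #2

The data generator below relies on bert4keras helpers; the imports here are assumed for this excerpt (module paths follow the bert4keras version used elsewhere in these examples):

import numpy as np
from bert4keras.snippets import DataGenerator, sequence_padding  # module path assumed
from bert4keras.tokenizer import Tokenizer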
class SiameseDataGenerator(DataGenerator):
    """
    SiameseBert的数据生成器,生成的数据组成为:
    """
    def __init__(self, data_path: str, batch_size: int, maxlen: int,
                 dict_path: str):
        super().__init__(data=self.__load_data(data_path),
                         batch_size=batch_size)
        self._tokenizer = Tokenizer(dict_path, do_lower_case=True)
        self._maxlen = maxlen

    @staticmethod
    def __load_data(filename: str):
        D = []
        with open(filename, encoding='utf-8') as f:
            for line in f:
                category, text1, text2, label = line.strip().split(',')
                if category != 'category':
                    # skip the CSV header row
                    D.append((text1, text2, int(label)))
        return D

    def __iter__(self, random=False):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        q1_batch_token_ids, q1_batch_segment_ids, q2_batch_token_ids, q2_batch_segment_ids, \
        batch_labels = [], [], [], [], []
        for i in idxs:
            text1, text2, label = self.data[i]
            q1_token_ids, q1_segment_ids = self._tokenizer.encode(
                text1, max_length=self._maxlen)
            q2_token_ids, q2_segment_ids = self._tokenizer.encode(
                text2, max_length=self._maxlen)

            q1_batch_token_ids.append(q1_token_ids)
            q2_batch_token_ids.append(q2_token_ids)
            q1_batch_segment_ids.append(q1_segment_ids)
            q2_batch_segment_ids.append(q2_segment_ids)
            batch_labels.append([label])

            if len(batch_labels) == self.batch_size or i == idxs[-1]:
                q1_batch_token_ids = sequence_padding(q1_batch_token_ids)
                q2_batch_token_ids = sequence_padding(q2_batch_token_ids)

                q1_batch_segment_ids = sequence_padding(q1_batch_segment_ids)
                q2_batch_segment_ids = sequence_padding(q2_batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)

                yield [
                    q1_batch_token_ids, q1_batch_segment_ids,
                    q2_batch_token_ids, q2_batch_segment_ids
                ], batch_labels

                q1_batch_token_ids, q1_batch_segment_ids, q2_batch_token_ids, q2_batch_segment_ids, \
                batch_labels = [], [], [], [], []
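A quick sketch of how the generator above might be driven; the CSV path, vocabulary path and hyper-parameters are placeholders:

# Illustrative only: paths and sizes are placeholders.
train_gen = SiameseDataGenerator(data_path='train.csv', batch_size=32,
                                 maxlen=64, dict_path='vocab.txt')
for (q1_tok, q1_seg, q2_tok, q2_seg), labels in train_gen:
    print(q1_tok.shape, q2_tok.shape, labels.shape)  # padded token-id matrices and labels
    break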
Example #3

config_path = args.config_path
checkpoint_path = args.checkpoint_path
dict_path = args.dict_path

min_count = 0
max_input_len = args.max_input_len
max_output_len = args.max_output_len
batch_size = args.batch_size
epochs = args.epochs
topk = args.topk

train_data_path = args.train_data_path
val_data_path = args.val_data_path

token_dict = load_vocab(dict_path)  # load the vocabulary

tokenizer = Tokenizer(token_dict, do_lower_case=True)  # build the tokenizer

sep_id = tokenizer.encode('')[0][-1]  # encoding '' yields [CLS], [SEP]; the last id is [SEP]

rouge = Rouge()
model = get_model(config_path, checkpoint_path, args.albert, args.lr)

evaluator = Evaluate(val_data_path, topk)

model.fit_generator(DataGenerator(train_data_path, batch_size),
                    epochs=epochs,
                    callbacks=[evaluator])
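The snippet above leaves decoding and evaluation to the project-local `Evaluate` callback. Assuming `get_model` builds a UniLM-style seq2seq model whose output is a per-position distribution over the vocabulary, a greedy decoder could look like the sketch below (illustrative only, not the project's actual evaluation code):

import numpy as np

def greedy_summarize(text):
    # Greedy decoding sketch: the source text is segment 0, the generated
    # prefix is segment 1; pick the most probable next token until [SEP].
    token_ids, segment_ids = tokenizer.encode(text, max_length=max_input_len)
    target_ids = []
    for _ in range(max_output_len):
        probas = model.predict([
            np.array([token_ids + target_ids]),
            np.array([segment_ids + [1] * len(target_ids)])
        ])[0]
        next_id = int(probas[-1].argmax())
        if next_id == sep_id:  # stop once [SEP] is generated
            break
        target_ids.append(next_id)
    return tokenizer.decode(target_ids)

Example #4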
#! -*- coding: utf-8 -*-
# Sanity check that the code works: MLM

from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path,
                         with_mlm=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out the characters "技术" (token positions 3 and 4)
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# predict the masked positions with the MLM head
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the prediction is exactly "技术"
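As an illustrative follow-up (not part of the original test), the top candidates at each masked position can be inspected instead of only the argmax:

# Show the five most probable tokens for the two masked positions.
for pos in (3, 4):
    top5 = probas[pos].argsort()[-5:][::-1]
    print(pos, [tokenizer.decode([int(i)]) for i in top5])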
Example #5
from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np


config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636
   ... (output truncated) ...]]
"""
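The model above returns one vector per input token. A common follow-up, shown here only as a sketch, is to pool these vectors into a single sentence vector, e.g. the [CLS] vector or a mean over all positions:

# Illustrative pooling of the per-token outputs into one sentence vector.
outputs = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
cls_vector = outputs[0]             # vector at the [CLS] position
mean_vector = outputs.mean(axis=0)  # mean pooling over all token positions
print(cls_vector.shape, mean_vector.shape)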
Example #6
# Constants (all_data, all_labels, TRAIN_SPLIT, DEV_SPLIT, DICT_PATH,
# MIN_WORD_FREQUENCY) are defined earlier in the source script.
from collections import Counter
from bert4keras.tokenizer import Tokenizer, load_vocab

# Split the data into train / dev / test partitions.
samples = len(all_data)
train_samples = int(samples * TRAIN_SPLIT)
dev_samples = int(samples * DEV_SPLIT)
train_data, train_labels = all_data[:train_samples], all_labels[:train_samples]
dev_data = all_data[train_samples:train_samples + dev_samples]
dev_labels = all_labels[train_samples:train_samples + dev_samples]
test_data = all_data[train_samples + dev_samples:]
test_labels = all_labels[train_samples + dev_samples:]

# Load the pretrained model's vocabulary
_token_dict = load_vocab(DICT_PATH)
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)
print(all_data[0])
print(_tokenizer.encode(all_data[0]))
print(_tokenizer.tokenize(all_data[0]))
print([_tokenizer.id_to_token(21934)])
print(_tokenizer.token_to_id('[PAD]'))

# Count token frequencies over the dataset
counter = Counter()
for line in all_data:
    _tokens = _tokenizer.tokenize(line)
    # drop the [CLS] and [SEP] tokens when counting frequencies
    counter.update(_tokens[1:-1])
print(len(counter))
# Keep only tokens whose frequency is at least MIN_WORD_FREQUENCY
_tokens = [
    token for token, cnt in counter.items() if cnt >= MIN_WORD_FREQUENCY
]
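A common next step, following the pattern used in the bert4keras classification examples (shown here as a sketch), is to rebuild a reduced vocabulary from `_tokens` and record the original ids so the pretrained embedding rows can be kept:

# Build a reduced vocabulary: special tokens first, then the frequent tokens.
# keep_words stores each kept token's id in the original vocabulary.
token_dict, keep_words = {}, []
for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
for t in _tokens:
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])
tokenizer = Tokenizer(token_dict, do_lower_case=True)  # tokenizer over the reduced vocab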