def albert_process_data(self, mode='part'):
    """Build ``self.token_dict``, ``self.keep_words`` and ``self.tokenizer``.

    Args:
        mode: ``'part'`` keeps the five special tokens plus every character
            that occurs at least 4 times across the train/dev/test CSV files
            and is present in the ALBERT vocabulary. ``'full'`` keeps the
            whole vocabulary unchanged.

    Raises:
        ValueError: if ``mode`` is neither ``'part'`` nor ``'full'``.
    """
    if mode not in ('part', 'full'):
        # An unknown mode used to skip both branches and fail later with an
        # unrelated AttributeError (or silently reuse stale attributes).
        raise ValueError(
            "mode must be 'part' or 'full', got {!r}".format(mode))

    _token_dict = load_vocab(self.albert_dict_path)  # read the vocabulary

    if mode == 'part':
        # Only keep characters that actually appear in the data sets.
        columns = ['seq1', 'seq2', 'label']
        frames = [
            pd.read_csv(path, names=columns)
            for path in (self.train_data_path, self.dev_data_path,
                         self.test_data_path)
        ]
        tmp_df = pd.concat(frames)  # all data

        # Count characters row by row (seq1 first, then seq2) so that the
        # first-seen order -- and therefore the assigned token ids -- is the
        # same as with the former iterrows() loop, without its overhead.
        chars = defaultdict(int)
        for seq1, seq2 in zip(tmp_df['seq1'], tmp_df['seq2']):
            for tmp_char in seq1:
                chars[tmp_char] += 1
            for tmp_char in seq2:
                chars[tmp_char] += 1

        # Drop low-frequency characters.
        frequent_chars = [ch for ch, count in chars.items() if count >= 4]

        # keep_words lists the original vocabulary ids that are retained.
        self.token_dict, self.keep_words = {}, []
        # Special tokens always come first.
        for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
            self.token_dict[c] = len(self.token_dict)
            self.keep_words.append(_token_dict[c])
        # Then the frequent characters that exist in the vocabulary.
        for c in frequent_chars:
            if c in _token_dict:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
    else:  # mode == 'full'
        self.token_dict = _token_dict
        self.keep_words = list(_token_dict.values())

    self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer
Beispiel #2
0
def process_data(train_file, dev_file, test_file):
    """Read the three data files and build a reduced tokenizer vocabulary.

    The vocabulary keeps the five special tokens plus every character seen
    in the train and dev data that also exists in the pretrained vocabulary.
    The tokenizer and ``keep_words`` are pickled into ``model_save_path``.

    Returns:
        ``(train_datas, dev_datas, test_datas, tokenizer, keep_words)``.
    """
    train_datas = read_data(train_file)
    dev_datas = read_data(dev_file)
    test_datas = read_data(test_file)

    # Characters are collected from train + dev only; test is not scanned.
    seen_chars = set()
    for first, second, _label in train_datas + dev_datas:
        seen_chars |= set(first)
        seen_chars |= set(second)

    full_vocab = load_vocab(dict_path)  # read the vocabulary
    token_dict, keep_words = {}, []

    def _register(token):
        # New id = current size; keep_words records the original vocab id.
        token_dict[token] = len(token_dict)
        keep_words.append(full_vocab[token])

    for special in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        _register(special)

    for ch in seen_chars:
        if ch in full_vocab:
            _register(ch)

    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

    for filename, payload in (('tokenizer.pkl', tokenizer),
                              ('keep_words.pkl', keep_words)):
        with open(os.path.join(model_save_path, filename), "wb") as f:
            pickle.dump(payload, f)

    return train_datas, dev_datas, test_datas, tokenizer, keep_words
Beispiel #3
0
    def save_vocab(self, input_data, incremental_train=False):
        """Build the tokenizer, kept-word list and relation-id mapping.

        Args:
            input_data: iterable of ``((text, triple), (entity_lists, rel))``
                items; ``triple`` is iterated as ``(s, p, o)`` tuples.
            incremental_train: when true, nothing is pickled to disk.

        Returns:
            ``(tokenizer, keep_words, rel2id)``; the same objects are also
            stored on ``self``.
        """
        chars = set()
        relationships = set()
        for (text, triple), (_entity_lists, rel) in input_data:
            chars |= set(text)
            relationships.add(rel)
            relationships |= {predicate for _s, predicate, _o in triple}

        token_dict = load_vocab(dict_path)  # read the vocabulary

        # The full vocabulary is kept here (no pruning to seen characters).
        keep_words = list(set(token_dict.values()))

        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        # keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']

        # Relation ids start at 1 in sorted order; 0 is reserved for 'unk'.
        rel2id = {
            rel: idx
            for idx, rel in enumerate(sorted(relationships), start=1)
        }
        rel2id['unk'] = 0

        if not incremental_train:
            artifacts = (
                ('tokenizer.pkl', tokenizer),
                ('keep_words.pkl', keep_words),
                ('rel2id.pkl', rel2id),
            )
            for filename, payload in artifacts:
                with open(os.path.join(model_save_path, filename), "wb") as f:
                    pickle.dump(payload, f)

        self.tokenizer = tokenizer
        self.keep_words = keep_words
        self.rel2id = rel2id
        return tokenizer, keep_words, rel2id
Beispiel #4
0
    def __init__(self,
                 initial_model=True,
                 model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
        """Set up the tokenizer and obtain the ALBERT model.

        Args:
            initial_model: when true, build the model from the pretrained
                checkpoint; otherwise call ``self.load(model_path)``.
            model_path: path handed to ``self.load`` and stored on ``self``.
        """
        self.initial_model = initial_model
        vocab = load_vocab(DICT_PATH)
        self.tokenizer = SimpleTokenizer(vocab)
        self.model_path = model_path

        if not initial_model:
            # self.load is expected to set self.albert_model (used below).
            self.load(model_path)
        else:
            self.albert_model = load_pretrained_model(
                CONFIG_PATH,
                CHECKPOINT_PATH,
                # keep_words=keep_words,
                albert=True)

        # Every layer is made trainable.
        for layer in self.albert_model.layers:
            layer.trainable = True
Beispiel #5
0
    def __init__(self, batch_size=32, train=False):
        """Prepare the tokenizer / kept-word list and build the model.

        Args:
            batch_size: stored on ``self.batch_size``.
            train: when true, the vocabulary is rebuilt from the train and
                dev data and pickled; otherwise the pickled tokenizer and
                ``keep_words`` are loaded from ``model_save_path``.
        """
        self.batch_size = batch_size
        tokenizer_file = os.path.join(model_save_path, 'tokenizer.pkl')
        keep_words_file = os.path.join(model_save_path, 'keep_words.pkl')

        if not train:
            with open(tokenizer_file, "rb") as f:
                self.tokenizer = pickle.load(f)

            with open(keep_words_file, "rb") as f:
                self.keep_words = pickle.load(f)
        else:
            seen_chars = set()
            train_datas = read_datas(TRAIN_DATA_FILE)
            dev_datas = read_datas(DEV_DATA_FILE)
            test_datas = read_datas(TEST_DATA_FILE)
            # Only train + dev contribute characters; test is read but unused.
            for first, second, _label in itertools.chain(train_datas,
                                                         dev_datas):
                seen_chars |= set(first)
                seen_chars |= set(second)

            full_vocab = load_vocab(dict_path)  # read the vocabulary
            self.token_dict, self.keep_words = {}, []

            def _register(token):
                # New id = current size; keep_words holds the original id.
                self.token_dict[token] = len(self.token_dict)
                self.keep_words.append(full_vocab[token])

            for special in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                _register(special)

            for ch in seen_chars:
                if ch in full_vocab:
                    _register(ch)

            self.tokenizer = SimpleTokenizer(self.token_dict)  # tokenizer

            with open(tokenizer_file, "wb") as f:
                pickle.dump(self.tokenizer, f)

            with open(keep_words_file, "wb") as f:
                pickle.dump(self.keep_words, f)

        self.model = self.make_model()
Beispiel #6
0
def get_correct_fn():
    """Load the error-detection model and return a correction function.

    The token dictionary and kept-word list are loaded from
    ``MODEL_PATH/train_pre_for_error_detect`` and the model weights from
    ``MODEL_PATH/detect/weights.hdf5``.

    Returns:
        A callable ``correct(error_text)`` that runs the detector on the text
        and passes the detected spans to ``Statistics.correct``.
    """
    save_path = join(MODEL_PATH, 'detect')

    token_dict = joblib.load(
        join(MODEL_PATH, 'train_pre_for_error_detect', 'token_dict.joblib'))
    tokenizer = SimpleTokenizer(token_dict)
    keep_words = joblib.load(
        join(MODEL_PATH, 'train_pre_for_error_detect', 'keep_words.joblib'))
    model = DetectModel(keep_words=keep_words)
    model.compile()
    model.model.load_weights(join(save_path, 'weights.hdf5'))

    checker = Statistics()

    def correct(error_text):
        """Detect suspicious spans in ``error_text`` and return the fix."""
        # Leave room for the [CLS] and [SEP] markers.
        text_tokens = tokenizer.tokenize(error_text, False,
                                         False)[:ec_cfg.max_seq_len - 2]
        tokens = ["[CLS]"] + list(text_tokens) + ["[SEP]"]
        input_ids = [
            token_dict[c] if c in token_dict else token_dict['[UNK]']
            for c in tokens
        ]
        # Right-pad with id 0 up to the fixed sequence length.
        input_ids.extend([0] * (ec_cfg.max_seq_len - len(input_ids)))

        # np.int was removed from NumPy; the builtin int is the same dtype.
        seg_ids = np.zeros_like(input_ids, dtype=int)

        ids, segs = [input_ids], [seg_ids]
        # Drop the first and last positions of the prediction.
        res = model.model.predict([ids, segs])[0][1:-1]

        # Group consecutive positions whose arg-max label is non-zero into
        # (begin, length) spans.
        begins_pred = []
        lengths_pred = []
        this_len = 0
        for i, r in enumerate(res):
            if np.argmax(r) > 0:
                if this_len == 0:
                    begins_pred.append(i)
                this_len += 1
            elif this_len > 0:
                lengths_pred.append(this_len)
                this_len = 0
        # Close a span that runs to the end of the sequence.
        if this_len > 0:
            lengths_pred.append(this_len)

        res_str = checker.correct(error_text, begins_pred, lengths_pred)

        return res_str

    return correct
def process_data(neg_file='datasets/neg.xls', pos_file='datasets/pos.xls'):
    """Load negative/positive samples and build a reduced vocabulary.

    Samples come from the first column of each Excel file and are labelled
    0 (negative) or 1 (positive). Characters seen fewer than 4 times are
    dropped. The tokenizer and ``keep_words`` are pickled into
    ``model_save_path``. The shuffled sample order is cached in
    ``./random_order.json`` so the train/valid split is stable across runs.

    Returns:
        ``(train_data, valid_data, tokenizer, keep_words)``.
    """
    neg = pd.read_excel(neg_file, header=None)
    pos = pd.read_excel(pos_file, header=None)
    chars = {}

    data = []

    for d in neg[0]:
        data.append((d, 0))
        for c in d:
            chars[c] = chars.get(c, 0) + 1

    for d in pos[0]:
        data.append((d, 1))
        for c in d:
            chars[c] = chars.get(c, 0) + 1

    # Drop low-frequency characters.
    chars = {i: j for i, j in chars.items() if j >= 4}

    _token_dict = load_vocab(dict_path)  # read the vocabulary
    token_dict, keep_words = {}, set()

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.add(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])

    # NOTE(review): keep_words is a set, so the resulting list does not keep
    # the insertion order used for token_dict, and one extra id
    # (max + 1) is appended -- confirm this is what the model expects.
    keep_words.add(max(keep_words) + 1)
    keep_words = list(keep_words)

    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)

    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)

    # Files are opened with context managers so the handles are closed (and
    # the JSON is flushed) deterministically.
    # NOTE(review): a cached order written for a different number of samples
    # is reused as-is -- confirm the cache is cleared when the data changes.
    if not os.path.exists('./random_order.json'):
        random_order = list(range(len(data)))
        random.shuffle(random_order)
        with open('./random_order.json', 'w') as f:
            json.dump(random_order, f, indent=4)
    else:
        with open('./random_order.json') as f:
            random_order = json.load(f)

    # Split into training and validation sets at a 9:1 ratio.
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

    return train_data, valid_data, tokenizer, keep_words
    def save_vocab(self, save_path, process_data):
        """Build and pickle the tokenizer, word-id and relation-id mappings.

        Args:
            save_path: directory receiving ``word2id.pkl`` and ``rel2id.pkl``.
            process_data: iterable of ``(text, relationship)`` pairs.

        Returns:
            ``(tokenizer, keep_words, word2id, rel2id)``.

        Note:
            ``tokenizer.pkl`` and ``keep_words.pkl`` go to the module-level
            ``model_save_path``, not to ``save_path``. Ids in ``word2id`` and
            ``rel2id`` follow set iteration order.
        """
        chars = set()
        relationships = set()
        for text, relationship in process_data:
            chars |= set(split_text(text))
            relationships.add(relationship)

        token_dict = load_vocab(dict_path)  # read the vocabulary

        # Special tokens first, then every word for which the vocabulary
        # lookup is falsy (missing, or mapped to id 0).
        keep_chars = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
        keep_chars.extend(
            char for char in chars if not token_dict.get(char))
        # The vocabulary itself is intentionally left unmodified.

        keep_words = list(set(token_dict.values()))

        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        # Every word gets an id after the keep_chars range; entries that are
        # in keep_chars are then overwritten with their keep_chars position.
        offset = len(keep_chars)
        word2id = {word: offset + idx for idx, word in enumerate(chars)}
        word2id.update({word: idx for idx, word in enumerate(keep_chars)})

        rel2id = dict(zip(relationships, range(len(relationships))))

        artifacts = (
            (model_save_path, 'tokenizer.pkl', tokenizer),
            (model_save_path, 'keep_words.pkl', keep_words),
            (save_path, 'word2id.pkl', word2id),
            (save_path, 'rel2id.pkl', rel2id),
        )
        for directory, filename, payload in artifacts:
            with open(os.path.join(directory, filename), "wb") as f:
                pickle.dump(payload, f)

        self.tokenizer = tokenizer
        self.word2id = word2id
        self.rel2id = rel2id

        return tokenizer, keep_words, word2id, rel2id
    def save_vocab(self, save_path, process_data):
        """Build and pickle the tokenizer, flag-id and relation-id mappings.

        Args:
            save_path: directory receiving ``flag2id.pkl`` and ``rel2id.pkl``.
            process_data: iterable of ``(word_flag_pairs, relationship)``
                items, where ``word_flag_pairs`` yields ``(word, flag)``.

        Returns:
            ``(tokenizer, keep_words, flag2id, rel2id)``.

        Note:
            ``tokenizer.pkl`` and ``keep_words.pkl`` go to the module-level
            ``model_save_path``, not to ``save_path``.
        """
        flags = set()
        relationships = set()
        for word_flag_pairs, relationship in process_data:
            # Flags are taken unchanged (no B-/I- prefix rewriting).
            flags |= {flag for _word, flag in word_flag_pairs}
            relationships.add(relationship)

        token_dict = load_vocab(dict_path)  # read the vocabulary

        keep_words = list(set(token_dict.values()))

        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        # The special markers take ids 0..4; real flags follow, with 'O'
        # sorted first among them.
        keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
        ordered_flags = sorted(flags, key=lambda tag: tag != 'O')
        offset = len(keep_flags)
        flag2id = {
            tag: offset + idx for idx, tag in enumerate(ordered_flags)
        }
        flag2id.update({tag: idx for idx, tag in enumerate(keep_flags)})

        rel2id = dict(zip(relationships, range(len(relationships))))

        artifacts = (
            (model_save_path, 'tokenizer.pkl', tokenizer),
            (model_save_path, 'keep_words.pkl', keep_words),
            (save_path, 'flag2id.pkl', flag2id),
            (save_path, 'rel2id.pkl', rel2id),
        )
        for directory, filename, payload in artifacts:
            with open(os.path.join(directory, filename), "wb") as f:
                pickle.dump(payload, f)

        self.tokenizer = tokenizer
        self.flag2id = flag2id
        self.rel2id = rel2id

        return tokenizer, keep_words, flag2id, rel2id
Beispiel #10
0
    def save_vocab(self, model_save_path, process_data):
        """Build and pickle the tokenizer, kept-word list and label ids.

        Args:
            model_save_path: directory receiving ``tokenizer.pkl``,
                ``keep_words.pkl`` and ``label2id.pkl``.
            process_data: iterable of sequences of ``(char, label)`` pairs.

        Sets ``self.label2id``, ``self.keep_words`` and ``self.tokenizer``.
        """
        chars, labels = set(), set()
        for sequence in process_data:
            for char, label in sequence:
                chars.add(char)
                labels.add(label)

        full_vocab = load_vocab(dict_path)  # read the vocabulary
        token_dict, keep_words = {}, set()

        def _register(token):
            # New id = current size; keep_words collects the original ids.
            token_dict[token] = len(token_dict)
            keep_words.add(full_vocab[token])

        for special in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
            _register(special)

        for ch in chars:
            if ch in full_vocab:
                _register(ch)

        # NOTE(review): keep_words is a set, so the list built from it does
        # not follow token_dict's insertion order, and one extra id
        # (max + 1) is added -- confirm against the model's expectations.
        keep_words.add(max(keep_words) + 1)
        keep_words = list(keep_words)
        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        for filename, payload in (('tokenizer.pkl', tokenizer),
                                  ('keep_words.pkl', keep_words)):
            with open(os.path.join(model_save_path, filename), "wb") as f:
                pickle.dump(payload, f)

        # print('labels={}'.format(labels))
        # Sorting guarantees the non-entity label 'O' receives id 0.
        ordered_labels = sorted(labels, key=lambda tag: tag != 'O')
        self.label2id = dict(zip(ordered_labels, range(len(ordered_labels))))
        print('label2id: {}'.format(self.label2id))
        with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
            pickle.dump(self.label2id, f)

        self.keep_words = keep_words
        self.tokenizer = tokenizer
Beispiel #11
0
    def save_word2id_etc(self, datas, incremental_train=False):
        """Build the tokenizer, kept-word list and label-id mapping.

        Args:
            datas: iterable of ``(chars, label)`` pairs; only the labels are
                used.
            incremental_train: when true, nothing is pickled to disk.

        Returns:
            ``(tokenizer, keep_words, label2id)``.
        """
        # The complete pretrained vocabulary is used; a deep copy keeps the
        # loaded dictionary itself untouched.
        token_dict = copy.deepcopy(load_vocab(dict_path))  # read vocabulary

        label_set = set()
        for _chars, label in datas:
            label_set.add(label)

        keep_words = list(set(token_dict.values()))

        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        # Label ids follow the iteration order of the label set.
        label_list = list(label_set)
        label2id = dict(zip(label_list, range(len(label_list))))

        if not incremental_train:
            artifacts = (
                ('tokenizer.pkl', tokenizer),
                ('keep_words.pkl', keep_words),
                ('label2id.pkl', label2id),
            )
            for filename, payload in artifacts:
                with open(os.path.join(model_save_path, filename),
                          "wb") as f:
                    pickle.dump(payload, f)

        return tokenizer, keep_words, label2id
Beispiel #12
0
import tensorflow as tf

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')

for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

# Cap the memory of the first GPU only when one exists; indexing gpus[0]
# unconditionally raised IndexError on CPU-only machines.
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])

print(tf.__version__)

# Backslashes are escaped explicitly: '\A', '\D' and '\c' are invalid escape
# sequences that Python only tolerates with a warning. The value is unchanged.
base_path = 'D:\\AI\\Data\\chinese_L-12_H-768_A-12\\'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # read the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# Encoding smoke test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
Beispiel #13
0
import tensorflow as tf

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
# Cap the memory of the first GPU only when one exists; indexing gpus[0]
# unconditionally raised IndexError on CPU-only machines.
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])

# Backslashes are escaped explicitly: '\A' and '\D' are invalid escape
# sequences that Python only tolerates with a warning. The value is unchanged.
base_path = 'D:\\AI\\Data\\albert_large_zh\\'
config_path = base_path + 'albert_config_large.json'
checkpoint_path = base_path + 'albert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # read the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
# NOTE(review): the model is loaded without with_mlm=True although the output
# is decoded as token predictions below -- confirm against the bert4keras API.
model = load_pretrained_model(config_path, checkpoint_path,
                              albert=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# Mask out "技术" (positions 3 and 4)
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# Predict the masked positions with the model
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # expected result: "技术"
class Albert(object):
    """ALBERT-based sentence-pair matching model (training and inference)."""

    def __init__(self,
                 mode='inference',
                 mode_='part',
                 model_name=None,
                 dataset_name=None):
        """Configure paths, build the vocabulary, then train or load a model.

        Args:
            mode: ``'train'`` builds the model and trains it immediately;
                ``'inference'`` loads the saved model.
            mode_: vocabulary mode passed to ``albert_process_data``.
            model_name: file name of the saved model under ``/Data/models``.
            dataset_name: suffix of the ``data/{train,dev,test}_*.csv`` files.
        """
        self.maxlen = 32
        self.albert_config_path = '/Data/public/Bert/albert_tiny_489k/albert_config_tiny.json'
        self.albert_checkpoint_path = '/Data/public/Bert/albert_tiny_489k/albert_model.ckpt'
        self.albert_dict_path = '/Data/public/Bert/albert_tiny_489k/vocab.txt'
        self.train_data_path = 'data/train_{}.csv'.format(dataset_name)
        self.dev_data_path = 'data/dev_{}.csv'.format(dataset_name)
        self.test_data_path = 'data/test_{}.csv'.format(dataset_name)
        # albert_tiny_250k.h5 works quite well
        # self.restore_model_path = 'saved_models/test_albert_tiny_{}.h5'.format(model_name)
        self.restore_model_path = '/Data/models/{}'.format(model_name)

        # albert
        self.albert_process_data(mode_)
        if mode == 'train':
            self.model = self._get_model()
            self.train()
        elif mode == 'inference':
            self._init_model()

    # TODO: keep_words should be removed for production use
    def albert_process_data(self, mode='part'):
        """Build ``self.token_dict``, ``self.keep_words``, ``self.tokenizer``.

        Args:
            mode: ``'part'`` keeps the special tokens plus every character
                occurring at least 4 times in the train/dev/test CSV files
                that also exists in the ALBERT vocabulary; ``'full'`` keeps
                the whole vocabulary.

        Raises:
            ValueError: if ``mode`` is neither ``'part'`` nor ``'full'``.
        """
        if mode not in ('part', 'full'):
            # An unknown mode used to skip both branches and fail later with
            # an unrelated AttributeError.
            raise ValueError(
                "mode must be 'part' or 'full', got {!r}".format(mode))

        _token_dict = load_vocab(self.albert_dict_path)  # read the vocabulary

        if mode == 'part':
            # Only keep characters that actually appear in the data sets.
            columns = ['seq1', 'seq2', 'label']
            frames = [
                pd.read_csv(path, names=columns)
                for path in (self.train_data_path, self.dev_data_path,
                             self.test_data_path)
            ]
            tmp_df = pd.concat(frames)  # all data

            # Count row by row (seq1, then seq2) so the first-seen order, and
            # with it the assigned token ids, matches the former iterrows()
            # loop while avoiding its per-row overhead.
            chars = defaultdict(int)
            for seq1, seq2 in zip(tmp_df['seq1'], tmp_df['seq2']):
                for tmp_char in seq1:
                    chars[tmp_char] += 1
                for tmp_char in seq2:
                    chars[tmp_char] += 1

            # Drop low-frequency characters.
            frequent_chars = [ch for ch, count in chars.items() if count >= 4]

            # keep_words lists the original vocabulary ids that are retained.
            self.token_dict, self.keep_words = {}, []
            # Special tokens always come first.
            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
            # Then the frequent characters that exist in the vocabulary.
            for c in frequent_chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])
        else:  # mode == 'full'
            self.token_dict = _token_dict
            self.keep_words = list(_token_dict.values())

        self.tokenizer = SimpleTokenizer(self.token_dict)  # build tokenizer

    # data pre-processing operation
    def _data_preprocessing(self, sentence1, sentence2):
        """Encode sentence pairs and pad both outputs to a common length.

        Each sentence is truncated to ``self.maxlen`` characters before
        encoding. Returns the two padded arrays produced by the tokenizer.
        """
        X1, X2 = [], []
        for tmp_sent1, tmp_sent2 in zip(sentence1, sentence2):
            x1, x2 = self.tokenizer.encode(first=tmp_sent1[:self.maxlen],
                                           second=tmp_sent2[:self.maxlen])
            X1.append(x1)
            X2.append(x2)
        X1 = self._seq_padding(X1)
        X2 = self._seq_padding(X2)
        # X1 = pad_sequences(X1, maxlen=67, padding='post', truncating='post')
        # X2 = pad_sequences(X2, maxlen=67, padding='post', truncating='post')
        return X1, X2

    def _seq_padding(self, X, padding=0):
        """Right-pad every sequence in ``X`` to the longest one's length."""
        L = [len(x) for x in X]
        ML = max(L)
        padded_sent = np.array([
            np.concatenate([x, [padding] *
                            (ML - len(x))]) if len(x) < ML else x for x in X
        ])
        return padded_sent

    # prepare data for training
    def _prepare_data(self, data_path):
        """Read a CSV file and return ``(X1_pad, X2_pad, label)``.

        NOTE(review): this expects a header row with the columns
        ``sentence1``/``sentence2``/``label``, while ``albert_process_data``
        reads the same files with ``names=['seq1', 'seq2', 'label']`` --
        confirm the actual file layout.
        """
        data = pd.read_csv(data_path)
        sent_1 = data['sentence1'].values
        sent_2 = data['sentence2'].values
        label = data['label'].values
        X1_pad, X2_pad = self._data_preprocessing(sent_1, sent_2)
        # X1 = np.vstack((X1_pad, X2_pad))
        # X2 = np.vstack((X2_pad, X1_pad))
        # y_train = np.hstack((label, label))
        return X1_pad, X2_pad, label

    # albert for Semantic matching, model architecture
    def _get_model(self):
        """Build the network: ALBERT, first position, one sigmoid unit."""
        model = load_pretrained_model(
            self.albert_config_path,
            self.albert_checkpoint_path,
            keep_words=self.keep_words,  # keep only these ids to shrink the vocabulary
            albert=True)
        output = Lambda(lambda x: x[:, 0])(model.output)
        output = Dense(1, activation='sigmoid')(output)
        model = Model(model.input, output)
        return model

    # model training operation
    def train(self):
        """Train on the train set, validating on the dev set.

        The best model by ``val_accuracy`` is written to
        ``self.restore_model_path``; training stops early after 3 epochs
        without improvement.
        """
        # train_data
        train_x1, train_x2, train_label = self._prepare_data(
            self.train_data_path)
        # dev_data
        dev_x1, dev_x2, dev_label = self._prepare_data(self.dev_data_path)
        checkpoint = ModelCheckpoint(self.restore_model_path,
                                     monitor='val_accuracy',
                                     verbose=0,
                                     save_best_only=True,
                                     save_weights_only=False)
        early_stop = EarlyStopping(monitor='val_accuracy',
                                   patience=3,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)
        self.model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(1e-4),  # use a sufficiently small learning rate
            metrics=['accuracy'])
        self.model.summary()
        self.model.fit(x=[train_x1, train_x2],
                       y=train_label,
                       batch_size=64,
                       epochs=10,
                       verbose=1,
                       callbacks=[checkpoint, early_stop],
                       validation_data=([dev_x1, dev_x2], dev_label))

    # model predict operation
    def predict(self, sentence1, sentence2):
        """Return model predictions for the paired sentence lists."""
        X1, X2 = self._data_preprocessing(sentence1, sentence2)
        y_pred = self.model.predict([X1, X2], batch_size=1024)
        return y_pred

    def test(self):
        """Evaluate the model on the test set and print loss and accuracy."""
        self.model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(1e-4),  # use a sufficiently small learning rate
            metrics=['accuracy'])
        # test_data: this previously read the dev file, so the reported
        # "test" numbers were dev-set numbers.
        test_x1, test_x2, test_label = self._prepare_data(self.test_data_path)
        test_loss, test_acc = self.model.evaluate(x=[test_x1, test_x2],
                                                  y=test_label)
        print('test loss: {}'.format(test_loss))
        print('test acc: {}'.format(test_acc))

    def _init_model(self):
        """Load the saved model and run one smoke-test prediction."""
        self.model = load_model(self.restore_model_path)
        sentence1 = '干嘛呢'
        sentence2 = '你是机器人'
        print('model albert loaded succeed. ({})'.format(
            self.predict([sentence1], [sentence2]).item()))
# Drop low-frequency characters.
chars = {i: j for i, j in chars.items() if j >= 4}

_token_dict = load_vocab(dict_path)  # read the vocabulary
token_dict, keep_words = {}, []

for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    token_dict[c] = len(token_dict)
    keep_words.append(_token_dict[c])

for c in chars:
    if c in _token_dict:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

# The shuffled order is cached on disk so the split is stable across runs.
if not os.path.exists('./random_order.json'):
    # range() is immutable on Python 3 and cannot be shuffled in place or
    # serialised by json, so it is materialised as a list first.
    random_order = list(range(len(data)))
    np.random.shuffle(random_order)
    with open('./random_order.json', 'w') as f:
        json.dump(random_order, f, indent=4)
else:
    with open('./random_order.json') as f:
        random_order = json.load(f)

# Split into training and validation sets at a 9:1 ratio.
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
Beispiel #16
0
class SemanticModel():
    """ALBERT-based text-pair similarity model."""

    def __init__(self, batch_size=32, train=False):
        """Prepare the tokenizer / kept-word list and build the model.

        Args:
            batch_size: number of samples per generated batch.
            train: when true, the vocabulary is rebuilt from the train and
                dev data and pickled into ``model_save_path``; otherwise the
                pickled tokenizer and ``keep_words`` are loaded from there.
        """
        self.batch_size = batch_size
        if train:
            chars = set()
            train_datas = read_datas(TRAIN_DATA_FILE)
            dev_datas = read_datas(DEV_DATA_FILE)
            test_datas = read_datas(TEST_DATA_FILE)
            # Only train + dev contribute characters; test is read but unused.
            for text1, text2, label in itertools.chain(train_datas, dev_datas):
                chars.update(set(text1))
                chars.update(set(text2))

            _token_dict = load_vocab(dict_path)  # read the vocabulary
            self.token_dict, self.keep_words = {}, []

            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])

            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])

            self.tokenizer = SimpleTokenizer(self.token_dict)  # tokenizer

            with open(os.path.join(model_save_path, 'tokenizer.pkl'),
                      "wb") as f:
                pickle.dump(self.tokenizer, f)

            with open(os.path.join(model_save_path, 'keep_words.pkl'),
                      "wb") as f:
                pickle.dump(self.keep_words, f)

        else:
            with open(os.path.join(model_save_path, 'tokenizer.pkl'),
                      "rb") as f:
                self.tokenizer = pickle.load(f)

            with open(os.path.join(model_save_path, 'keep_words.pkl'),
                      "rb") as f:
                self.keep_words = pickle.load(f)

        self.model = self.make_model()

    @staticmethod
    def _prepare_text(text):
        """Truncate to INPUT_LENGTH, NFKD-normalise, strip and lower-case."""
        text = text[:INPUT_LENGTH]
        return unicodedata.normalize('NFKD', text).strip().lower()

    def make_model(self):
        """Build and compile the network: ALBERT, first position, sigmoid."""
        model = load_pretrained_model(config_path,
                                      checkpoint_path,
                                      keep_words=self.keep_words,
                                      albert=True)

        output = Lambda(lambda x: x[:, 0])(model.output)
        # print(output.shape)
        output = Dense(1,
                       activation='sigmoid')(output)  # tanh, sigmoid, softmax
        model = Model(inputs=model.input, outputs=output)

        model.compile(
            loss=
            'binary_crossentropy',  # categorical_crossentropy binary_crossentropy
            optimizer=Adam(2e-6),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
            metrics=['accuracy'])
        model.summary()
        return model

    def gnerator_data(self, file_name):
        """Yield ``([X1, X2], Y)`` batches from ``file_name`` forever.

        A batch is emitted once ``self.batch_size`` samples are collected;
        samples left over at the end of a pass carry into the next pass.
        """
        X1, X2, Y = [], [], []
        while True:
            for text1, text2, label in read_datas(file_name):
                x1, x2 = self.tokenizer.encode(
                    first=self._prepare_text(text1),
                    second=self._prepare_text(text2))
                y = int(label)

                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                # Y.append(to_categorical(y))
                if len(X1) == self.batch_size:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    # print(X1.shape, X2.shape, Y.shape)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []

    def train(self):
        """Fit the model with early stopping, checkpoints and TensorBoard."""
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)

        model_checkpoint = ModelCheckpoint(filepath=os.path.join(
            model_save_path,
            'similarity-{epoch:02d}-{val_loss:.2f}-{val_acc:.3f}.hdf5'),
                                           save_best_only=True,
                                           save_weights_only=False)

        tb = TensorBoard(
            log_dir=log_dir,  # log directory
            histogram_freq=0,  # epochs between histogram computations; 0 disables them
            batch_size=32,  # amount of data used to compute histograms
            write_graph=True,  # whether to store the network graph
            write_grads=False,  # whether to visualise gradient histograms
            write_images=False,  # whether to visualise parameters
            embeddings_freq=0,
            embeddings_layer_names=None,
            embeddings_metadata=None)

        hist = self.model.fit_generator(
            self.gnerator_data(TRAIN_DATA_FILE),
            steps_per_epoch=1000,
            epochs=100,
            validation_data=self.gnerator_data(DEV_DATA_FILE),
            validation_steps=100,
            callbacks=[early_stopping, model_checkpoint, tb])
        print(hist.history.items())

    def predict(self,
                text1,
                text2,
                weitht_file='similarity-01-0.55-0.741.hdf5'):
        """Load ``weitht_file`` and return the prediction for one text pair.

        The weights are reloaded from ``model_save_path`` on every call.
        """
        self.model.load_weights(os.path.join(model_save_path, weitht_file),
                                by_name=True,
                                skip_mismatch=True,
                                reshape=True)

        x1, x2 = self.tokenizer.encode(first=self._prepare_text(text1),
                                       second=self._prepare_text(text2))

        X1 = seq_padding([x1])
        X2 = seq_padding([x2])
        ret = self.model.predict([X1, X2])
        return ret

    def batch_predict(self, question, database):
        """Return predictions for ``question`` paired with each database text."""
        # The question is the same for every pair, so it is normalised once
        # rather than once per database entry.
        text1 = self._prepare_text(question)
        X1, X2 = [], []
        for text2 in database:
            x1, x2 = self.tokenizer.encode(first=text1,
                                           second=self._prepare_text(text2))
            X1.append(x1)
            X2.append(x2)
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        ret = self.model.predict([X1, X2])

        return ret
import os
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

# Directory holding the unpacked ALBERT-large (Chinese) release.
albert_model_path = '/home/gswyhq/github_projects/albert_zh/albert_large_zh'
# Alternative location: '/notebooks/albert_zh/albert_large_zh'
# Download: https://storage.googleapis.com/albert_zh/albert_large_zh.zip

# Config, checkpoint and vocabulary files all sit directly in that directory.
config_path, checkpoint_path, dict_path = (
    os.path.join(albert_model_path, file_name)
    for file_name in ('albert_config_large.json',
                      'albert_model.ckpt',
                      'vocab.txt'))

token_dict = load_vocab(dict_path)  # read the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
# Build the model (with_mlm=True) and load the checkpoint weights.
model = load_pretrained_model(config_path, checkpoint_path, with_mlm=True)

# Alternative demo sentence: u'科学技术是第一生产力'
token_ids, segment_ids = tokenizer.encode(u'中国的首都是北京')

print('token_ids: {}, segment_ids: {}'.format(token_ids, segment_ids))

# Replace positions 4 and 5 with [MASK]. Assuming the tokenizer puts one
# special token in front, these are the characters "首都" -- TODO confirm.
# (For the alternative sentence, positions 3 and 4 mask "技术".)
token_ids[4] = token_ids[5] = token_dict['[MASK]']

# Let the MLM model predict the masked positions; [0] selects the single
# sample of the batch.
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
# For the alternative sentence the masked part is recovered with:
# print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # gives "技术"
# Example #18
# 0
class AlbertClassify:
    """
    Binary text classifier: ALBERT output at position 0 fed to a single
    sigmoid unit.

    The underlying model is either built from the pre-trained checkpoint
    (``initial_model=True``) or restored from a previously saved file
    (``initial_model=False``).
    """

    def __init__(self,
                 initial_model=True,
                 model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
        """
        Build the tokenizer and obtain the underlying model.

        :param initial_model: True to start from the pre-trained checkpoint
            (CONFIG_PATH / CHECKPOINT_PATH); False to restore the model
            stored at ``model_path``
        :param model_path: file the best model is checkpointed to during
            training, and restored from when ``initial_model`` is False
        :raises RuntimeError: if ``initial_model`` is False and no model
            could be restored from ``model_path``
        """
        self.initial_model = initial_model
        token_dict = load_vocab(DICT_PATH)
        self.tokenizer = SimpleTokenizer(token_dict)
        self.model_path = model_path

        # Defined up front so that a failed restore can be detected below.
        self.albert_model = None
        if initial_model:
            self.albert_model = load_pretrained_model(
                CONFIG_PATH,
                CHECKPOINT_PATH,
                # keep_words=keep_words,
                albert=True)
        else:
            self.load(model_path)
            if self.albert_model is None:
                raise RuntimeError(
                    'could not load a model from {}'.format(model_path))
            # train() uses a restored model as the complete classifier, so
            # expose it as self.model too; predict() then works without
            # a prior call to train().
            self.model = self.albert_model

        # Fine-tune every layer.
        for layer in self.albert_model.layers:
            layer.trainable = True

    def train(self, train_data, valid_data):
        """
        Fit the classifier and checkpoint the best model to
        ``self.model_path``.

        :param train_data: training samples, handed to DataGenerator
        :param valid_data: validation samples, handed to DataGenerator
        :return: None
        """
        train_D = DataGenerator(train_data, self.tokenizer,
                                CONFIG['batch_size'], CONFIG['max_len'])
        valid_D = DataGenerator(valid_data, self.tokenizer,
                                CONFIG['batch_size'], CONFIG['max_len'])

        # os.path.join on a single argument leaves a string unchanged and
        # turns a path-like object into a plain string.
        save = ModelCheckpoint(os.path.join(self.model_path),
                               monitor='val_acc',
                               verbose=1,
                               save_best_only=True,
                               mode='auto')
        early_stopping = EarlyStopping(monitor='val_acc',
                                       min_delta=0,
                                       patience=3,
                                       verbose=1,
                                       mode='auto')
        callbacks = [save, early_stopping]

        if self.initial_model:
            # Fresh encoder: add the classification head exactly once.
            x1_in = Input(shape=(None, ))
            x2_in = Input(shape=(None, ))

            x_in = self.albert_model([x1_in, x2_in])
            # Keep only the output at sequence position 0.
            x_in = Lambda(lambda x: x[:, 0])(x_in)
            p = Dense(1, activation='sigmoid')(x_in)
            self.model = Model([x1_in, x2_in], p)
        else:
            # A restored model already contains the head; train it as is.
            self.model = self.albert_model

        self.model.compile(
            loss='binary_crossentropy',
            # Alternative optimizer: RAdam(1e-5), a sufficiently small
            # learning rate.
            # NOTE(review): confirm whether the schedule values below are
            # absolute learning rates or multipliers of Adam's 1e-5.
            optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {
                1000: 1e-5,
                2000: 6e-5
            }),
            metrics=[
                'accuracy', process.get_precision, process.get_recall,
                process.get_f1
            ])
        self.model.summary()

        self.model.fit_generator(
            iter(train_D),
            steps_per_epoch=len(train_D),
            epochs=CONFIG['epochs'],
            validation_data=iter(valid_D),
            validation_steps=len(valid_D),
            callbacks=callbacks,
            use_multiprocessing=CONFIG['use_multiprocessing'],
        )

    def predict(self, test_data):
        """
        Predict for a collection of texts.

        :param test_data: iterable of texts; each one is truncated to
            CONFIG['max_len'] characters before encoding
        :return: the result of ``self.model.predict`` for the padded batch
        """
        max_len = CONFIG['max_len']
        X1 = []
        X2 = []
        for s in test_data:
            x1, x2 = self.tokenizer.encode(first=s[:max_len])
            X1.append(x1)
            X2.append(x2)
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        predict_results = self.model.predict([X1, X2])
        return predict_results

    def load(self, model_path):
        """
        Restore a previously saved model into ``self.albert_model``.

        A failure is reported on stdout and does not raise; in that case
        ``self.albert_model`` keeps whatever value it had before the call.

        :param model_path: path of the saved model file
        :return: self
        """
        try:
            self.albert_model = load_model(str(model_path),
                                           custom_objects=get_custom_objects(),
                                           compile=False)
        except Exception as ex:
            # Best effort, but surface the cause instead of hiding it.
            print('load error: {!r}'.format(ex))
        return self