Example 1
def process_data(train_file, dev_file, test_file):
    chars = set()

    train_datas = read_data(train_file)
    dev_datas = read_data(dev_file)
    test_datas = read_data(test_file)
    for text1, text2, label in train_datas + dev_datas:
        chars.update(set(text1))
        chars.update(set(text2))

    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])

    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)

    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)

    return train_datas, dev_datas, test_datas, tokenizer, keep_words
Example 2
    def albert_process_data(self, mode='part'):
        _token_dict = load_vocab(self.albert_dict_path)  # load the vocabulary
        # keep only the characters that appear in the datasets
        if mode == 'part':
            train_df = pd.read_csv(self.train_data_path,
                                   names=['seq1', 'seq2', 'label'])
            valid_df = pd.read_csv(self.dev_data_path,
                                   names=['seq1', 'seq2', 'label'])
            test_df = pd.read_csv(self.test_data_path,
                                  names=['seq1', 'seq2', 'label'])
            # total data
            tmp_df = pd.concat([train_df, valid_df, test_df])
            chars = defaultdict(int)
            for _, tmp_row in tmp_df.iterrows():
                for tmp_char in tmp_row.seq1:
                    chars[tmp_char] += 1
                for tmp_char in tmp_row.seq2:
                    chars[tmp_char] += 1
            # drop low-frequency characters
            chars = {i: j for i, j in chars.items() if j >= 4}
            self.token_dict, self.keep_words = {}, []  # keep_words holds the token ids retained from the BERT vocab
            # keep the special tokens
            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
            # keep only the high-frequency characters that occur in the data
            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])
        elif mode == 'full':
            self.token_dict, self.keep_words = _token_dict, []
            for k in self.token_dict:
                self.keep_words.append(self.token_dict[k])
        self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer
Example 3
    def save_vocab(self, input_data, incremental_train=False):
        relationships = set()
        chars = set()
        for (text, triple), (entity_lists, rel) in input_data:
            chars.update(set(text))
            relationships.add(rel)
            relationships.update(set(p for s, p, o in triple))

        token_dict = load_vocab(dict_path)  # load the vocabulary

        keep_words = list(set(token_dict.values()))

        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        # keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']

        rel2id = {rel: _id + 1 for _id, rel in enumerate(sorted(relationships))}
        rel2id['unk'] = 0

        if not incremental_train:
            with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
                pickle.dump(tokenizer, f)

            with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
                pickle.dump(keep_words, f)

            with open(os.path.join(model_save_path, 'rel2id.pkl'), "wb") as f:
                pickle.dump(rel2id, f)

        self.tokenizer, self.keep_words, self.rel2id = tokenizer, keep_words, rel2id
        return tokenizer, keep_words, rel2id
Example 4
def process_data(data_file='./data/classify_data.txt'):
    with open(data_file, encoding='utf-8') as f:
        datas = f.readlines()

    chars = set()
    labels = set()
    new_datas = []
    for data in datas:
        data = data.strip()
        if not data:
            continue
        text, label = data.rsplit(maxsplit=1)
        chars.update(set(text))
        labels.add(label)
        new_datas.append([text, label])
    del datas

    label2id = {lab: i for i, lab in enumerate(list(labels))}

    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])


    tokenizer = Tokenizer(token_dict)  # build the tokenizer

    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)

    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)

    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(label2id, f)

    if not os.path.exists('./random_order.json'):
        random_order = [i for i in range(len(new_datas))]
        random.shuffle(random_order)
        json.dump(
            random_order,
            open('./random_order.json', 'w'),
            indent=4
        )
    else:
        random_order = json.load(open('./random_order.json'))

    # split into training and validation sets at a 9:1 ratio
    train_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 == 0]

    return train_data, valid_data, tokenizer, keep_words, label2id
Example 5
def process_data(neg_file='datasets/neg.xls', pos_file='datasets/pos.xls'):
    neg = pd.read_excel(neg_file, header=None)
    pos = pd.read_excel(pos_file, header=None)
    chars = {}

    data = []

    for d in neg[0]:
        data.append((d, 0))
        for c in d:
            chars[c] = chars.get(c, 0) + 1

    for d in pos[0]:
        data.append((d, 1))
        for c in d:
            chars[c] = chars.get(c, 0) + 1

    chars = {i: j for i, j in chars.items() if j >= 4}

    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, set()

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.add(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])

    keep_words.add(max(keep_words) + 1)
    keep_words = list(keep_words)

    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)

    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)

    if not os.path.exists('./random_order.json'):
        random_order = [i for i in range(len(data))]
        random.shuffle(random_order)
        json.dump(random_order, open('./random_order.json', 'w'), indent=4)
    else:
        random_order = json.load(open('./random_order.json'))

    # split into training and validation sets at a 9:1 ratio
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

    return train_data, valid_data, tokenizer, keep_words
Example 6
def get_token_dict_and_keep_words():
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

    for c in build_word_list():
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    print("token_dict:", token_dict)
    print("keep_words:", keep_words, "size len", len(token_dict))
    return token_dict, keep_words
Example 7
def build_vocab(config):
    """将自定义词典加入bert的词典中"""
    # 读取词典
    _token_dict = load_vocab(config.bert_vocab)
    # keep_words是在bert中保留的字表
    token_dict, keep_words = {}, []

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    chars = build_custom_vocab(config)
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    return token_dict, keep_words
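The token_dict / keep_words pair built by build_vocab is what bert4keras consumes downstream: token_dict feeds the tokenizer, while keep_words tells the model loader which rows of the pretrained embedding matrix to retain. A minimal sketch of that hand-off, assuming hypothetical config.bert_config and config.bert_checkpoint fields and the same load_pretrained_model / SimpleTokenizer API used in the other examples:

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer

token_dict, keep_words = build_vocab(config)   # config as in the example above
tokenizer = SimpleTokenizer(token_dict)        # tokenizer over the reduced vocab
model = load_pretrained_model(
    config.bert_config,       # assumed field: path to bert_config.json
    config.bert_checkpoint,   # assumed field: path to bert_model.ckpt
    keep_words=keep_words,    # retain only the selected embedding rows
)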
Example 8
    def save_vocab(self, save_path, process_data):
        chars = set()
        relationships = set()
        for text, relationship in process_data:
            words = split_text(text)
            chars.update(set(words))
            relationships.add(relationship)

        token_dict = load_vocab(dict_path)  # load the vocabulary
        keep_chars = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']

        for char in chars:
            if not token_dict.get(char):
                # token_dict[char] = len(token_dict)
                keep_chars.append(char)

        # for char in keep_chars:
        #     if not token_dict.get(char):
        #         token_dict[char] = len(token_dict)

        keep_words = list(set(token_dict.values()))

        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        word2id = {
            word: id_ + len(keep_chars)
            for id_, word in enumerate(chars)
        }
        for _id, word in enumerate(keep_chars):
            word2id[word] = _id

        rel2id = {rel: _id for _id, rel in enumerate(relationships)}

        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)

        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)

        with open(os.path.join(save_path, 'word2id.pkl'), "wb") as f:
            pickle.dump(word2id, f)
        with open(os.path.join(save_path, 'rel2id.pkl'), "wb") as f:
            pickle.dump(rel2id, f)

        self.tokenizer, self.word2id, self.rel2id = tokenizer, word2id, rel2id

        return tokenizer, keep_words, word2id, rel2id
Example 9
def simplify_vocab_dict():
    import json
    chars = dict()

    min_count = 1

    model_pre_save_path = join(MODEL_PATH, 'train_pre')
    if not os.path.isdir(model_pre_save_path):
        os.makedirs(model_pre_save_path)

    data = get_train()
    for _, pos, neg in data:
        for sentence in pos:
            for w in sentence:
                chars[w] = chars.get(w, 0) + 1
        for sentence in neg:
            for w in sentence:
                chars[w] = chars.get(w, 0) + 1

    chars = [(i, j) for i, j in chars.items() if j >= min_count]
    chars = sorted(chars, key=lambda c: -c[1])
    chars = [c[0] for c in chars]
    json.dump(chars,
              open(join(model_pre_save_path, 'chars.dict'),
                   'w',
                   encoding='utf-8'),
              indent=4,
              ensure_ascii=False)

    # checkpoint_path = os.path.join(main_path, 'model/bert/bert_model.ckpt')
    dict_path = os.path.join(DATA_PATH, 'bert_roberta/vocab.txt')

    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []  # keep_words holds the token ids retained from the BERT vocab

    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    print('len of keep_words: ', len(keep_words))
    joblib.dump(token_dict, join(model_pre_save_path, 'token_dict.joblib'))
    joblib.dump(keep_words, join(model_pre_save_path, 'keep_words.joblib'))
Example 10
    def save_vocab(self, save_path, process_data):
        flags = set()
        relationships = set()
        for old_word_flag, relationship in process_data:
            word_flag = []
            for word, flag in old_word_flag:
                # if flag[0] == 'B':
                #     flag = 'B-Shiyi'
                # elif flag[0] == 'I':
                #     flag = 'I-Shiyi'
                word_flag.append([word, flag])
            flags.update(set(flag for word, flag in word_flag))
            relationships.add(relationship)

        token_dict = load_vocab(dict_path)  # load the vocabulary

        keep_words = list(set(token_dict.values()))

        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
        flag2id = {
            label: id_ + len(keep_flags)
            for id_, label in enumerate(
                sorted(flags, key=lambda x: 0 if x == 'O' else 1))
        }
        for flag_id, flag in enumerate(keep_flags):
            flag2id[flag] = flag_id

        rel2id = {rel: _id for _id, rel in enumerate(relationships)}

        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)

        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)

        with open(os.path.join(save_path, 'flag2id.pkl'), "wb") as f:
            pickle.dump(flag2id, f)
        with open(os.path.join(save_path, 'rel2id.pkl'), "wb") as f:
            pickle.dump(rel2id, f)

        self.tokenizer, self.flag2id, self.rel2id = tokenizer, flag2id, rel2id

        return tokenizer, keep_words, flag2id, rel2id
Example 11
    def __init__(self,
                 initial_model=True,
                 model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
        self.initial_model = initial_model
        token_dict = load_vocab(DICT_PATH)
        self.tokenizer = SimpleTokenizer(token_dict)
        self.model_path = model_path
        if initial_model:
            self.albert_model = load_pretrained_model(
                CONFIG_PATH,
                CHECKPOINT_PATH,
                # keep_words=keep_words,
                albert=True)
        else:
            self.load(model_path)

        for l in self.albert_model.layers:
            l.trainable = True
Example 12
    def save_vocab(self, model_save_path, process_data):
        chars = set()
        labels = set()
        for char_labels in process_data:
            for char, label in char_labels:
                chars.add(char)
                labels.add(label)

        _token_dict = load_vocab(dict_path)  # load the vocabulary
        token_dict, keep_words = {}, set()

        for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])

        for c in chars:
            if c in _token_dict:
                token_dict[c] = len(token_dict)
                keep_words.add(_token_dict[c])

        keep_words.add(max(keep_words) + 1)
        keep_words = list(keep_words)
        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)

        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)

        # print('labels={}'.format(labels))
        # sorted: ensures the non-entity label 'O' gets id 0
        self.label2id = {
            label: id_
            for id_, label in enumerate(
                sorted(labels, key=lambda x: 0 if x == 'O' else 1))
        }
        print('label2id: {}'.format(self.label2id))
        with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
            pickle.dump(self.label2id, f)

        self.keep_words = keep_words
        self.tokenizer = tokenizer
Example 13
    def __init__(self, batch_size=32, train=False):
        self.batch_size = batch_size
        if train:
            chars = set()
            train_datas = read_datas(TRAIN_DATA_FILE)
            dev_datas = read_datas(DEV_DATA_FILE)
            test_datas = read_datas(TEST_DATA_FILE)
            for text1, text2, label in itertools.chain(train_datas, dev_datas):
                chars.update(set(text1))
                chars.update(set(text2))

            _token_dict = load_vocab(dict_path)  # load the vocabulary
            self.token_dict, self.keep_words = {}, []

            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])

            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])

            self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer

            with open(os.path.join(model_save_path, 'tokenizer.pkl'),
                      "wb") as f:
                pickle.dump(self.tokenizer, f)

            with open(os.path.join(model_save_path, 'keep_words.pkl'),
                      "wb") as f:
                pickle.dump(self.keep_words, f)

        else:
            with open(os.path.join(model_save_path, 'tokenizer.pkl'),
                      "rb") as f:
                self.tokenizer = pickle.load(f)

            with open(os.path.join(model_save_path, 'keep_words.pkl'),
                      "rb") as f:
                self.keep_words = pickle.load(f)

        self.model = self.make_model()
Example 14
    def save_word2id_etc(self, datas, incremental_train=False):

        label_set = set()

        _token_dict = load_vocab(dict_path)  # load the vocabulary
        # token_dict, keep_words = {}, set()
        token_dict = copy.deepcopy(_token_dict)
        # for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        #     token_dict[c] = len(token_dict)
        #     keep_words.add(_token_dict[c])

        for chars, label in datas:
            label_set.add(label)
            # for c in chars:
            #     if c in _token_dict:
            #         token_dict[c] = len(token_dict)
            #         keep_words.add(_token_dict[c])

        # keep_words.add(max(keep_words)+1)
        # keep_words = list(keep_words)
        keep_words = list(set(token_dict.values()))

        tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
        label2id = {lab: i for i, lab in enumerate(list(label_set))}

        if not incremental_train:
            with open(os.path.join(model_save_path, 'tokenizer.pkl'),
                      "wb") as f:
                pickle.dump(tokenizer, f)

            with open(os.path.join(model_save_path, 'keep_words.pkl'),
                      "wb") as f:
                pickle.dump(keep_words, f)

            with open(os.path.join(model_save_path, 'label2id.pkl'),
                      "wb") as f:
                pickle.dump(label2id, f)

        return tokenizer, keep_words, label2id
Example 15
def load_myvocab(dataset):
    if os.path.exists(MY_VOCAB_FILE):
        chars = json.load(open(MY_VOCAB_FILE, encoding='utf-8'))
    else:
        chars = {}
        x_train, y_train, x_val, y_val = dataset.get_all_data()
        x_data = np.concatenate((x_train, x_val))
        y_data = np.concatenate((y_train, y_val))

        for q in tqdm(x_data, desc=u'building vocab: processing questions'):
            for w in q["que_text"]:  # plain text, no word segmentation needed
                chars[w] = chars.get(w, 0) + 1
        for a in tqdm(y_data, desc=u'building vocab: processing answers'):
            for w in a["ans_text"]:  # plain text, no word segmentation needed
                chars[w] = chars.get(w, 0) + 1

        chars = [(char, count) for char, count in chars.items() if count >= min_count]
        chars = sorted(chars, key=lambda c: - c[1])
        chars = [c[0] for c in chars]
        json.dump(
            chars,
            codecs.open(MY_VOCAB_FILE, 'w', encoding='utf-8'),
            indent=4,
            ensure_ascii=False
        )

    _token_dict = load_vocab(VOCAB_FILE)  # load the vocabulary
    token_dict, keep_words = {}, []

    for c in ['[PAD]', '[UNK]', '[CLS]', '[unused1]', '[SEP]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])

    return token_dict, keep_words
Example 16
import tensorflow as tf

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')

for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])

print(tf.__version__)

base_path = 'D:\\AI\\Data\\chinese_L-12_H-768_A-12\\'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the pretrained weights

# quick encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
Example 17
from bert4keras.utils import SimpleTokenizer, load_vocab

if __name__ == '__main__':
    _token_dict = load_vocab(
        '/Data/public/Bert/albert_tiny_250k/vocab.txt')  # load the vocabulary

    print(type(_token_dict))
    print(_token_dict)