def process_data(train_file, dev_file, test_file):
    chars = set()
    train_datas = read_data(train_file)
    dev_datas = read_data(dev_file)
    test_datas = read_data(test_file)
    for text1, text2, label in train_datas + dev_datas:
        chars.update(set(text1))
        chars.update(set(text2))
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    return train_datas, dev_datas, test_datas, tokenizer, keep_words

def albert_process_data(self, mode='part'):
    _token_dict = load_vocab(self.albert_dict_path)  # load the vocabulary
    # keep only the characters that actually appear in the datasets
    if mode == 'part':
        train_df = pd.read_csv(self.train_data_path, names=['seq1', 'seq2', 'label'])
        valid_df = pd.read_csv(self.dev_data_path, names=['seq1', 'seq2', 'label'])
        test_df = pd.read_csv(self.test_data_path, names=['seq1', 'seq2', 'label'])
        # total data
        tmp_df = pd.concat([train_df, valid_df, test_df])
        chars = defaultdict(int)
        for _, tmp_row in tmp_df.iterrows():
            for tmp_char in tmp_row.seq1:
                chars[tmp_char] += 1
            for tmp_char in tmp_row.seq2:
                chars[tmp_char] += 1
        # filter out low-frequency characters
        chars = {i: j for i, j in chars.items() if j >= 4}
        self.token_dict, self.keep_words = {}, []  # keep_words holds the vocabulary ids kept from BERT
        # keep the special tokens
        for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
            self.token_dict[c] = len(self.token_dict)
            self.keep_words.append(_token_dict[c])
        # keep only the high-frequency characters that appear in the data
        for c in chars:
            if c in _token_dict:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
    elif mode == 'full':
        self.token_dict, self.keep_words = _token_dict, []
        for k in self.token_dict:
            self.keep_words.append(self.token_dict[k])
    self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer

def save_vocab(self, input_data, incremental_train=False):
    relationships = set()
    chars = set()
    for (text, triple), (entity_lists, rel) in input_data:
        chars.update(set(text))
        relationships.add(rel)
        relationships.update(set(p for s, p, o in triple))
    token_dict = load_vocab(dict_path)  # load the vocabulary
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    # keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    rel2id = {rel: _id + 1 for _id, rel in enumerate(sorted(relationships))}
    rel2id['unk'] = 0
    if not incremental_train:
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)
        with open(os.path.join(model_save_path, 'rel2id.pkl'), "wb") as f:
            pickle.dump(rel2id, f)
    self.tokenizer, self.keep_words, self.rel2id = tokenizer, keep_words, rel2id
    return tokenizer, keep_words, rel2id

def process_data(data_file='./data/classify_data.txt'):
    with open(data_file, encoding='utf-8') as f:
        datas = f.readlines()
    chars = set()
    labels = set()
    new_datas = []
    for data in datas:
        data = data.strip()
        if not data:
            continue
        text, label = data.rsplit(maxsplit=1)
        chars.update(set(text))
        labels.add(label)
        new_datas.append([text, label])
    del datas
    label2id = {lab: i for i, lab in enumerate(list(labels))}
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    tokenizer = Tokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(label2id, f)
    if not os.path.exists('./random_order.json'):
        random_order = [i for i in range(len(new_datas))]
        random.shuffle(random_order)
        json.dump(random_order, open('./random_order.json', 'w'), indent=4)
    else:
        random_order = json.load(open('./random_order.json'))
    # split into train/validation sets with a 9:1 ratio
    train_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 == 0]
    return train_data, valid_data, tokenizer, keep_words, label2id

def process_data(neg_file='datasets/neg.xls', pos_file='datasets/pos.xls'):
    neg = pd.read_excel(neg_file, header=None)
    pos = pd.read_excel(pos_file, header=None)
    chars = {}
    data = []
    for d in neg[0]:
        data.append((d, 0))
        for c in d:
            chars[c] = chars.get(c, 0) + 1
    for d in pos[0]:
        data.append((d, 1))
        for c in d:
            chars[c] = chars.get(c, 0) + 1
    chars = {i: j for i, j in chars.items() if j >= 4}
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, set()
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.add(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])
    keep_words.add(max(keep_words) + 1)
    keep_words = list(keep_words)
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    if not os.path.exists('./random_order.json'):
        random_order = [i for i in range(len(data))]
        random.shuffle(random_order)
        json.dump(random_order, open('./random_order.json', 'w'), indent=4)
    else:
        random_order = json.load(open('./random_order.json'))
    # split into train/validation sets with a 9:1 ratio
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
    return train_data, valid_data, tokenizer, keep_words

def get_token_dict_and_keep_words():
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in build_word_list():
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    print("token_dict:", token_dict)
    print("keep_words:", keep_words, "size:", len(token_dict))
    return token_dict, keep_words

def build_vocab(config):
    """Merge the custom vocabulary into BERT's vocabulary."""
    # load the vocabulary
    _token_dict = load_vocab(config.bert_vocab)
    # keep_words holds the vocabulary ids kept from BERT
    token_dict, keep_words = {}, []
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    chars = build_custom_vocab(config)
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    return token_dict, keep_words

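# For reference, a minimal usage sketch of how a token_dict/keep_words pair like the
# one built above is typically consumed in this version of bert4keras. It is only an
# illustration: config_path and checkpoint_path are assumed variables pointing at the
# same pretrained BERT whose vocab.txt was loaded, and the keep_words argument of
# load_pretrained_model mirrors the (commented-out) usage in the ALBERT snippet below.
token_dict, keep_words = build_vocab(config)
tokenizer = SimpleTokenizer(token_dict)   # tokenizer over the reduced vocabulary
model = load_pretrained_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,                # keep only these rows of the token embedding
)
token_ids, segment_ids = tokenizer.encode(u'语言模型')
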
def save_vocab(self, save_path, process_data):
    chars = set()
    relationships = set()
    for text, relationship in process_data:
        words = split_text(text)
        chars.update(set(words))
        relationships.add(relationship)
    token_dict = load_vocab(dict_path)  # load the vocabulary
    keep_chars = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    for char in chars:
        if not token_dict.get(char):
            # token_dict[char] = len(token_dict)
            keep_chars.append(char)
    # for char in keep_chars:
    #     if not token_dict.get(char):
    #         token_dict[char] = len(token_dict)
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    word2id = {
        word: id_ + len(keep_chars)
        for id_, word in enumerate(chars)
    }
    for _id, word in enumerate(keep_chars):
        word2id[word] = _id
    rel2id = {rel: _id for _id, rel in enumerate(relationships)}
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(save_path, 'word2id.pkl'), "wb") as f:
        pickle.dump(word2id, f)
    with open(os.path.join(save_path, 'rel2id.pkl'), "wb") as f:
        pickle.dump(rel2id, f)
    self.tokenizer, self.word2id, self.rel2id = tokenizer, word2id, rel2id
    return tokenizer, keep_words, word2id, rel2id

def simplify_vocab_dict():
    import json
    chars = dict()
    min_count = 1
    model_pre_save_path = join(MODEL_PATH, 'train_pre')
    if not os.path.isdir(model_pre_save_path):
        os.makedirs(model_pre_save_path)
    data = get_train()
    for _, pos, neg in data:
        for sentence in pos:
            for w in sentence:
                chars[w] = chars.get(w, 0) + 1
        for sentence in neg:
            for w in sentence:
                chars[w] = chars.get(w, 0) + 1
    chars = [(i, j) for i, j in chars.items() if j >= min_count]
    chars = sorted(chars, key=lambda c: -c[1])
    chars = [c[0] for c in chars]
    json.dump(chars,
              open(join(model_pre_save_path, 'chars.dict'), 'w', encoding='utf-8'),
              indent=4, ensure_ascii=False)
    # checkpoint_path = os.path.join(main_path, 'model/bert/bert_model.ckpt')
    dict_path = os.path.join(DATA_PATH, 'bert_roberta/vocab.txt')
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []  # keep_words holds the vocabulary ids kept from BERT
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    print('len of keep_words: ', len(keep_words))
    joblib.dump(token_dict, join(model_pre_save_path, 'token_dict.joblib'))
    joblib.dump(keep_words, join(model_pre_save_path, 'keep_words.joblib'))

def save_vocab(self, save_path, process_data):
    flags = set()
    relationships = set()
    for old_word_flag, relationship in process_data:
        word_flag = []
        for word, flag in old_word_flag:
            # if flag[0] == 'B':
            #     flag = 'B-Shiyi'
            # elif flag[0] == 'I':
            #     flag = 'I-Shiyi'
            word_flag.append([word, flag])
        flags.update(set(flag for word, flag in word_flag))
        relationships.add(relationship)
    token_dict = load_vocab(dict_path)  # load the vocabulary
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    flag2id = {
        label: id_ + len(keep_flags)
        for id_, label in enumerate(
            sorted(flags, key=lambda x: 0 if x == 'O' else 1))
    }
    for flag_id, flag in enumerate(keep_flags):
        flag2id[flag] = flag_id
    rel2id = {rel: _id for _id, rel in enumerate(relationships)}
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(save_path, 'flag2id.pkl'), "wb") as f:
        pickle.dump(flag2id, f)
    with open(os.path.join(save_path, 'rel2id.pkl'), "wb") as f:
        pickle.dump(rel2id, f)
    self.tokenizer, self.flag2id, self.rel2id = tokenizer, flag2id, rel2id
    return tokenizer, keep_words, flag2id, rel2id

def __init__(self, initial_model=True,
             model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
    self.initial_model = initial_model
    token_dict = load_vocab(DICT_PATH)
    self.tokenizer = SimpleTokenizer(token_dict)
    self.model_path = model_path
    if initial_model:
        self.albert_model = load_pretrained_model(
            CONFIG_PATH,
            CHECKPOINT_PATH,
            # keep_words=keep_words,
            albert=True)
    else:
        self.load(model_path)
    for l in self.albert_model.layers:
        l.trainable = True

def save_vocab(self, model_save_path, process_data):
    chars = set()
    labels = set()
    for char_labels in process_data:
        for char, label in char_labels:
            chars.add(char)
            labels.add(label)
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, set()
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.add(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])
    keep_words.add(max(keep_words) + 1)
    keep_words = list(keep_words)
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    # print('labels={}'.format(labels))
    # sorted: make sure the non-entity label 'O' gets id 0
    self.label2id = {
        label: id_
        for id_, label in enumerate(
            sorted(labels, key=lambda x: 0 if x == 'O' else 1))
    }
    print('label2id: {}'.format(self.label2id))
    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(self.label2id, f)
    self.keep_words = keep_words
    self.tokenizer = tokenizer

def __init__(self, batch_size=32, train=False):
    self.batch_size = batch_size
    if train:
        chars = set()
        train_datas = read_datas(TRAIN_DATA_FILE)
        dev_datas = read_datas(DEV_DATA_FILE)
        test_datas = read_datas(TEST_DATA_FILE)
        for text1, text2, label in itertools.chain(train_datas, dev_datas):
            chars.update(set(text1))
            chars.update(set(text2))
        _token_dict = load_vocab(dict_path)  # load the vocabulary
        self.token_dict, self.keep_words = {}, []
        for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
            self.token_dict[c] = len(self.token_dict)
            self.keep_words.append(_token_dict[c])
        for c in chars:
            if c in _token_dict:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
        self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(self.tokenizer, f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(self.keep_words, f)
    else:
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "rb") as f:
            self.tokenizer = pickle.load(f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "rb") as f:
            self.keep_words = pickle.load(f)
    self.model = self.make_model()

def save_word2id_etc(self, datas, incremental_train=False):
    label_set = set()
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    # token_dict, keep_words = {}, set()
    token_dict = copy.deepcopy(_token_dict)
    # for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    #     token_dict[c] = len(token_dict)
    #     keep_words.add(_token_dict[c])
    for chars, label in datas:
        label_set.add(label)
        # for c in chars:
        #     if c in _token_dict:
        #         token_dict[c] = len(token_dict)
        #         keep_words.add(_token_dict[c])
    # keep_words.add(max(keep_words) + 1)
    # keep_words = list(keep_words)
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    label2id = {lab: i for i, lab in enumerate(list(label_set))}
    if not incremental_train:
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)
        with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
            pickle.dump(label2id, f)
    return tokenizer, keep_words, label2id

def load_myvocab(dataset):
    if os.path.exists(MY_VOCAB_FILE):
        chars = json.load(open(MY_VOCAB_FILE, encoding='utf-8'))
    else:
        chars = {}
        x_train, y_train, x_val, y_val = dataset.get_all_data()
        x_data = np.concatenate((x_train, x_val))
        y_data = np.concatenate((y_train, y_val))
        for q in tqdm(x_data, desc=u'building vocabulary: processing questions'):
            for w in q["que_text"]:  # plain text, no word segmentation needed
                chars[w] = chars.get(w, 0) + 1
        for a in tqdm(y_data, desc=u'building vocabulary: processing answers'):
            for w in a["ans_text"]:  # plain text, no word segmentation needed
                chars[w] = chars.get(w, 0) + 1
        chars = [(char, count) for char, count in chars.items() if count >= min_count]
        chars = sorted(chars, key=lambda c: -c[1])
        chars = [c[0] for c in chars]
        json.dump(
            chars,
            codecs.open(MY_VOCAB_FILE, 'w', encoding='utf-8'),
            indent=4,
            ensure_ascii=False
        )
    _token_dict = load_vocab(VOCAB_FILE)  # load the vocabulary
    token_dict, keep_words = {}, []
    for c in ['[PAD]', '[UNK]', '[CLS]', '[unused1]', '[SEP]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    return token_dict, keep_words

import tensorflow as tf
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, " Type:", gpu.device_type)
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
print(tf.__version__)

base_path = 'D:\\AI\\Data\\chinese_L-12_H-768_A-12\\'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

from bert4keras.utils import SimpleTokenizer, load_vocab

if __name__ == '__main__':
    _token_dict = load_vocab(
        '/Data/public/Bert/albert_tiny_250k/vocab.txt')  # load the vocabulary
    print(type(_token_dict))
    print(_token_dict)