def process_data(train_file, dev_file, test_file):
    chars = set()
    train_datas = read_data(train_file)
    dev_datas = read_data(dev_file)
    test_datas = read_data(test_file)
    for text1, text2, label in train_datas + dev_datas:
        chars.update(set(text1))
        chars.update(set(text2))
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, []
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    return train_datas, dev_datas, test_datas, tokenizer, keep_words
def albert_process_data(self, mode='part'):
    _token_dict = load_vocab(self.albert_dict_path)  # load the vocabulary
    # 'part': keep only the characters that actually appear in the datasets
    if mode == 'part':
        train_df = pd.read_csv(self.train_data_path, names=['seq1', 'seq2', 'label'])
        valid_df = pd.read_csv(self.dev_data_path, names=['seq1', 'seq2', 'label'])
        test_df = pd.read_csv(self.test_data_path, names=['seq1', 'seq2', 'label'])
        # all data combined
        tmp_df = pd.concat([train_df, valid_df, test_df])
        chars = defaultdict(int)
        for _, tmp_row in tmp_df.iterrows():
            for tmp_char in tmp_row.seq1:
                chars[tmp_char] += 1
            for tmp_char in tmp_row.seq2:
                chars[tmp_char] += 1
        # drop low-frequency characters
        chars = {i: j for i, j in chars.items() if j >= 4}
        self.token_dict, self.keep_words = {}, []  # keep_words: the BERT vocabulary rows to keep
        # keep the special tokens
        for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
            self.token_dict[c] = len(self.token_dict)
            self.keep_words.append(_token_dict[c])
        # the reduced dictionary keeps only high-frequency characters seen in the data
        for c in chars:
            if c in _token_dict:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
    elif mode == 'full':
        self.token_dict, self.keep_words = _token_dict, []
        for k in self.token_dict:
            self.keep_words.append(self.token_dict[k])
    self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer
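# Hedged usage sketch (not part of the original code): the point of the keep_words built above
# is to trim the pretrained embedding matrix to the reduced vocabulary. Here keep_words stands
# for the self.keep_words produced by albert_process_data, and the config/checkpoint paths are
# assumed names for illustration only.
from bert4keras.bert import load_pretrained_model

albert_config_path = 'albert_config.json'      # assumed path, illustrative
albert_checkpoint_path = 'albert_model.ckpt'   # assumed path, illustrative
albert_model = load_pretrained_model(
    albert_config_path,
    albert_checkpoint_path,
    keep_words=keep_words,   # only these rows of the token embedding are kept
    albert=True)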
def save_vocab(self, input_data, incremental_train=False):
    relationships = set()
    chars = set()
    for (text, triple), (entity_lists, rel) in input_data:
        chars.update(set(text))
        relationships.add(rel)
        relationships.update(set(p for s, p, o in triple))
    token_dict = load_vocab(dict_path)  # load the vocabulary
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    # keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    rel2id = {rel: _id + 1 for _id, rel in enumerate(sorted(relationships))}
    rel2id['unk'] = 0
    if not incremental_train:
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)
        with open(os.path.join(model_save_path, 'rel2id.pkl'), "wb") as f:
            pickle.dump(rel2id, f)
    self.tokenizer, self.keep_words, self.rel2id = tokenizer, keep_words, rel2id
    return tokenizer, keep_words, rel2id
def get_correct_fn():
    save_path = join(MODEL_PATH, 'detect')
    token_dict = joblib.load(
        join(MODEL_PATH, 'train_pre_for_error_detect', 'token_dict.joblib'))
    tokenizer = SimpleTokenizer(token_dict)
    keep_words = joblib.load(
        join(MODEL_PATH, 'train_pre_for_error_detect', 'keep_words.joblib'))
    model = DetectModel(keep_words=keep_words)
    model.compile()
    model.model.load_weights(join(save_path, 'weights.hdf5'))
    checker = Statistics()

    def correct(error_text):
        text_tokens = tokenizer.tokenize(error_text, False, False)[:ec_cfg.max_seq_len - 2]
        tokens = ["[CLS]"] + list(text_tokens) + ["[SEP]"]
        input_ids = [
            token_dict[c] if c in token_dict else token_dict['[UNK]']
            for c in tokens
        ]
        # pad to the fixed sequence length
        while len(input_ids) < ec_cfg.max_seq_len:
            input_ids.append(0)
        seg_ids = np.zeros_like(input_ids, dtype=np.int32)
        ids, segs = np.array([input_ids]), np.array([seg_ids])
        # drop the [CLS]/[SEP] positions from the prediction
        res = model.model.predict([ids, segs])[0][1:-1]
        begins_pred = []
        lengths_pred = []
        this_len = 0
        for i, r in enumerate(res):
            if np.argmax(r) > 0:  # token predicted as part of an error span
                if this_len == 0:
                    begins_pred.append(i)
                this_len += 1
            else:
                if this_len > 0:
                    lengths_pred.append(this_len)
                this_len = 0
        if this_len > 0:  # flush a span that runs to the end of the text
            lengths_pred.append(this_len)
        res_str = checker.correct(error_text, begins_pred, lengths_pred)
        return res_str

    return correct
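# Hedged usage sketch (not part of the original code): build the correction closure once at
# startup, then call it per input string. The example sentence below is purely illustrative
# ("a passage of text to be checked").
correct = get_correct_fn()
print(correct(u'这是一段待检查的文本'))  # prints the string returned by Statistics.correct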
def process_data(neg_file='datasets/neg.xls', pos_file='datasets/pos.xls'):
    neg = pd.read_excel(neg_file, header=None)
    pos = pd.read_excel(pos_file, header=None)
    chars = {}
    data = []
    for d in neg[0]:
        data.append((d, 0))
        for c in d:
            chars[c] = chars.get(c, 0) + 1
    for d in pos[0]:
        data.append((d, 1))
        for c in d:
            chars[c] = chars.get(c, 0) + 1
    # drop low-frequency characters
    chars = {i: j for i, j in chars.items() if j >= 4}
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, set()
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.add(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])
    keep_words.add(max(keep_words) + 1)
    keep_words = list(keep_words)
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    if not os.path.exists('./random_order.json'):
        random_order = list(range(len(data)))
        random.shuffle(random_order)
        json.dump(random_order, open('./random_order.json', 'w'), indent=4)
    else:
        random_order = json.load(open('./random_order.json'))
    # split into training and validation sets at a 9:1 ratio
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
    return train_data, valid_data, tokenizer, keep_words
def save_vocab(self, save_path, process_data):
    chars = set()
    relationships = set()
    for text, relationship in process_data:
        words = split_text(text)
        chars.update(set(words))
        relationships.add(relationship)
    token_dict = load_vocab(dict_path)  # load the vocabulary
    keep_chars = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    for char in chars:
        if not token_dict.get(char):
            # token_dict[char] = len(token_dict)
            keep_chars.append(char)
    # for char in keep_chars:
    #     if not token_dict.get(char):
    #         token_dict[char] = len(token_dict)
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    word2id = {
        word: id_ + len(keep_chars)
        for id_, word in enumerate(chars)
    }
    for _id, word in enumerate(keep_chars):
        word2id[word] = _id
    rel2id = {rel: _id for _id, rel in enumerate(relationships)}
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(save_path, 'word2id.pkl'), "wb") as f:
        pickle.dump(word2id, f)
    with open(os.path.join(save_path, 'rel2id.pkl'), "wb") as f:
        pickle.dump(rel2id, f)
    self.tokenizer, self.word2id, self.rel2id = tokenizer, word2id, rel2id
    return tokenizer, keep_words, word2id, rel2id
def save_vocab(self, save_path, process_data):
    flags = set()
    relationships = set()
    for old_word_flag, relationship in process_data:
        word_flag = []
        for word, flag in old_word_flag:
            # if flag[0] == 'B':
            #     flag = 'B-Shiyi'
            # elif flag[0] == 'I':
            #     flag = 'I-Shiyi'
            word_flag.append([word, flag])
        flags.update(set(flag for word, flag in word_flag))
        relationships.add(relationship)
    token_dict = load_vocab(dict_path)  # load the vocabulary
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    flag2id = {
        label: id_ + len(keep_flags)
        for id_, label in enumerate(
            sorted(flags, key=lambda x: 0 if x == 'O' else 1))
    }
    for flag_id, flag in enumerate(keep_flags):
        flag2id[flag] = flag_id
    rel2id = {rel: _id for _id, rel in enumerate(relationships)}
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(save_path, 'flag2id.pkl'), "wb") as f:
        pickle.dump(flag2id, f)
    with open(os.path.join(save_path, 'rel2id.pkl'), "wb") as f:
        pickle.dump(rel2id, f)
    self.tokenizer, self.flag2id, self.rel2id = tokenizer, flag2id, rel2id
    return tokenizer, keep_words, flag2id, rel2id
def __init__(self, initial_model=True,
             model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
    self.initial_model = initial_model
    token_dict = load_vocab(DICT_PATH)
    self.tokenizer = SimpleTokenizer(token_dict)
    self.model_path = model_path
    if initial_model:
        self.albert_model = load_pretrained_model(
            CONFIG_PATH,
            CHECKPOINT_PATH,
            # keep_words=keep_words,
            albert=True)
    else:
        self.load(model_path)
    for l in self.albert_model.layers:
        l.trainable = True
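# Hedged sketch (not part of the original class): a classification head in the style of the
# old bert4keras examples, built on the ALBERT model loaded in __init__. num_classes and the
# method name build_classifier are assumptions for illustration.
def build_classifier(self, num_classes):
    from keras.layers import Lambda, Dense
    from keras.models import Model
    cls_vector = Lambda(lambda x: x[:, 0])(self.albert_model.output)  # take the [CLS] vector
    probs = Dense(num_classes, activation='softmax')(cls_vector)
    self.model = Model(self.albert_model.input, probs)
    return self.model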
def save_vocab(self, model_save_path, process_data):
    chars = set()
    labels = set()
    for char_labels in process_data:
        for char, label in char_labels:
            chars.add(char)
            labels.add(label)
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    token_dict, keep_words = {}, set()
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.add(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])
    keep_words.add(max(keep_words) + 1)
    keep_words = list(keep_words)
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    # print('labels={}'.format(labels))
    # sorted: guarantees the non-entity label 'O' gets id 0
    self.label2id = {
        label: id_
        for id_, label in enumerate(
            sorted(labels, key=lambda x: 0 if x == 'O' else 1))
    }
    print('label2id: {}'.format(self.label2id))
    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(self.label2id, f)
    self.keep_words = keep_words
    self.tokenizer = tokenizer
def __init__(self, batch_size=32, train=False):
    self.batch_size = batch_size
    if train:
        chars = set()
        train_datas = read_datas(TRAIN_DATA_FILE)
        dev_datas = read_datas(DEV_DATA_FILE)
        test_datas = read_datas(TEST_DATA_FILE)
        for text1, text2, label in itertools.chain(train_datas, dev_datas):
            chars.update(set(text1))
            chars.update(set(text2))
        _token_dict = load_vocab(dict_path)  # load the vocabulary
        self.token_dict, self.keep_words = {}, []
        for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
            self.token_dict[c] = len(self.token_dict)
            self.keep_words.append(_token_dict[c])
        for c in chars:
            if c in _token_dict:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
        self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(self.tokenizer, f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(self.keep_words, f)
    else:
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "rb") as f:
            self.tokenizer = pickle.load(f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "rb") as f:
            self.keep_words = pickle.load(f)
    self.model = self.make_model()
def save_word2id_etc(self, datas, incremental_train=False):
    label_set = set()
    _token_dict = load_vocab(dict_path)  # load the vocabulary
    # token_dict, keep_words = {}, set()
    token_dict = copy.deepcopy(_token_dict)
    # for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    #     token_dict[c] = len(token_dict)
    #     keep_words.add(_token_dict[c])
    for chars, label in datas:
        label_set.add(label)
        # for c in chars:
        #     if c in _token_dict:
        #         token_dict[c] = len(token_dict)
        #         keep_words.add(_token_dict[c])
    # keep_words.add(max(keep_words) + 1)
    # keep_words = list(keep_words)
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    label2id = {lab: i for i, lab in enumerate(list(label_set))}
    if not incremental_train:
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)
        with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
            pickle.dump(label2id, f)
    return tokenizer, keep_words, label2id
import numpy as np
import tensorflow as tf
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, " Type:", gpu.device_type)
if gpus:
    # limit the first GPU to 1024 MB of memory
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
print(tf.__version__)

base_path = 'D:\\AI\\Data\\chinese_L-12_H-768_A-12\\'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))