import logging
import os
import random

import numpy as np
import pypinyin
import tensorflow as tf
from keras_bert import Tokenizer, load_trained_model_from_checkpoint, load_vocabulary
from tensorflow.keras.preprocessing.sequence import pad_sequences

# project-local featurizer; this import path assumes the repo's usual layout
from utils.text_featurizers import TextFeaturizer


class LM_DataLoader():
    def __init__(self, config, training=True):
        self.train = training
        self.init_all(config)
        self.vocab_featurizer = TextFeaturizer(config['lm_vocab'])
        self.word_featurizer = TextFeaturizer(config['lm_word'])
        self.init_text_to_vocab()
        self.batch = config['running_config']['batch_size']
        self.epochs = 1

    def init_bert(self, config, checkpoint):
        # the BERT encoder is frozen and only used as a feature extractor
        model = load_trained_model_from_checkpoint(config, checkpoint,
                                                   trainable=False, seq_len=None)
        return model

    def load_state(self, outdir):
        try:
            self.train_pick = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
            self.epochs = 1 + int(np.mean(self.train_pick))
        except FileNotFoundError:
            print('state file not found, using initial state')
        except Exception:
            print('loading state failed, using initial state')

    def save_state(self, outdir):
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.train_pick))

    def return_data_types(self):
        return (tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        return (tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None, 768]))

    def get_per_epoch_steps(self):
        return len(self.train_texts) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_texts) // self.batch

    def init_all(self, config):
        if self.train:
            bert_config = config['bert']['config_json']
            bert_checkpoint = config['bert']['bert_ckpt']
            bert_vocab = config['bert']['bert_vocab']
            bert_vocabs = load_vocabulary(bert_vocab)
            self.bert_token = Tokenizer(bert_vocabs)
            self.bert = self.init_bert(bert_config, bert_checkpoint)
        self.get_sentence(config['train_list'] if self.train else config['eval_list'],
                          training=self.train)

    def init_text_to_vocab(self):
        # register pronunciations pypinyin gets wrong by default
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
        })

        def text_to_vocab_func(txt):
            return pypinyin.lazy_pinyin(txt, 1, errors='ignore')

        self.text_to_vocab = text_to_vocab_func

    def get_sentence(self, data_path, training):
        from tqdm import tqdm
        with open(data_path, encoding='utf-8') as f:
            data = f.readlines()
        txts = []
        for txt in tqdm(data):
            txt = txt.strip()
            if len(txt) > 150:
                continue
            txts.append(txt)
        if training:
            num = len(txts)
            train = txts[:int(num * 0.99)]
            test = txts[int(num * 0.99):]
            self.train_texts, self.test_texts = train, test
            self.train_pick = [0] * len(self.train_texts)
        else:
            self.test_texts = txts
            self.offset = 0

    def preprocess(self, tokens, txts):
        x = []
        y = []
        new = []
        for token, txt in zip(tokens, txts):
            # drop samples with tokens outside either vocabulary
            if not self.check_valid(token, self.vocab_featurizer.vocab_array):
                continue
            if not self.check_valid(txt, self.word_featurizer.vocab_array):
                continue
            x_ = [self.vocab_featurizer.startid()]
            y_ = [self.word_featurizer.startid()]
            for i in token:
                x_.append(self.vocab_featurizer.token_to_index[i])
            for i in txt:
                y_.append(self.word_featurizer.token_to_index[i])
            x_.append(self.vocab_featurizer.endid())
            y_.append(self.word_featurizer.endid())
            x.append(np.array(x_))
            y.append(np.array(y_))
            new.append(txt)
        return x, y, new

    def bert_decode(self, x, x2=None):
        tokens, segs = [], []
        if x2 is not None:
            for i, j in zip(x, x2):
                t, s = self.bert_token.encode(''.join(i))
                index = np.where(j == 2)[0]
                if len(index) > 0:
                    for n in index:
                        t[int(n)] = 103  # 103 is BERT's [MASK] token id
                tokens.append(t)
                segs.append(s)
        else:
            for i in x:
                t, s = self.bert_token.encode(''.join(i))
                tokens.append(t)
                segs.append(s)
        return tokens, segs
    def pad(self, x, mode=1):
        length = 0
        for i in x:
            length = max(length, len(i))
        if mode == 2:
            # pad 2-D feature arrays with the -10. mask value
            for i in range(len(x)):
                padding = np.ones([length - len(x[i]), x[i].shape[1]]) * -10.
                x[i] = np.vstack((x[i], padding))
        else:
            x = pad_sequences(x, length, padding='post', truncating='post')
        return x

    def get_bert_feature(self, bert_t, bert_s):
        length = [len(i) for i in bert_t]
        max_len = max(length)
        bert_s = tf.keras.preprocessing.sequence.pad_sequences(
            bert_s, max_len, padding='post', truncating='post')
        bert_t = tf.keras.preprocessing.sequence.pad_sequences(
            bert_t, max_len, padding='post', truncating='post')
        features = self.bert.predict([bert_t, bert_s])
        for idx, l in enumerate(length):
            features[idx, l:] = -10.  # mask padded time steps
        return features

    def check_valid(self, txt, vocab_list):
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return False
        return True

    def generate(self, train=True):
        if train:
            # prefer the least-picked sentences so coverage stays balanced
            indexs = np.argsort(self.train_pick)[:2 * self.batch]
            indexs = random.sample(indexs.tolist(), self.batch)
            sample = [self.train_texts[i] for i in indexs]
            for i in indexs:
                self.train_pick[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.train_pick))
        else:
            sample = random.sample(self.test_texts, self.batch)
        trainx = [self.text_to_vocab(i) for i in sample]
        trainy = sample
        x, y, new = self.preprocess(trainx, trainy)
        e_bert_t, e_bert_s = self.bert_decode(new)
        e_features = self.get_bert_feature(e_bert_t, e_bert_s)
        x = self.pad(x)
        y = self.pad(y)
        e_features = self.pad(e_features, 2)
        x = np.array(x)
        y = np.array(y)
        e_features = np.array(e_features, dtype='float32')
        return x, y, e_features

    def eval_generate(self):
        sample = self.test_texts[self.offset:self.offset + self.batch]
        self.offset += self.batch
        trainx = [self.text_to_vocab(i) for i in sample]
        trainy = sample
        x, y, new = self.preprocess(trainx, trainy)
        x = self.pad(x)
        y = self.pad(y)
        x = np.array(x, 'int32')
        y = np.array(y, 'int32')
        return x, y

    def generator(self, train=True):
        while True:
            x, y, features = self.generate(train)
            yield x, y, features
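# Usage sketch (not part of the original loader): feeding the generator into
# tf.data via the return_data_types/return_data_shape helpers above. `config`
# is assumed to be the parsed training-config dict that __init__ expects.
def _lm_dataset_example(config):
    """Minimal sketch: wrap LM_DataLoader.generator in a tf.data.Dataset."""
    dg = LM_DataLoader(config, training=True)
    dataset = tf.data.Dataset.from_generator(
        dg.generator,
        output_types=dg.return_data_types(),
        output_shapes=dg.return_data_shape(),
    )
    # each element is (pinyin token ids, character token ids, BERT features)
    return dg, dataset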
class Punc_DataLoader():
    def __init__(self, config, training=True):
        self.train = training
        self.init_all(config)
        self.vocab_featurizer = TextFeaturizer(config['punc_vocab'])
        self.bd_featurizer = TextFeaturizer(config['punc_biaodian'])
        self.bd = self.bd_featurizer.vocab_array
        self.batch = config['running_config']['batch_size']
        self.epochs = 1

    def init_bert(self, config, checkpoint):
        model = load_trained_model_from_checkpoint(config, checkpoint,
                                                   trainable=False, seq_len=None)
        return model

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('saved train list does not match the current one, data loader uses the initial state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('state file not found, using initial state')
        except Exception:
            logging.info('loading state failed, using initial state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        return (tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        return (tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None, 768]))

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def init_all(self, config):
        if self.train:
            bert_config = config['bert']['config_json']
            bert_checkpoint = config['bert']['bert_ckpt']
            bert_vocab = config['bert']['bert_vocab']
            bert_vocabs = load_vocabulary(bert_vocab)
            self.bert_token = Tokenizer(bert_vocabs)
            self.bert = self.init_bert(bert_config, bert_checkpoint)
        self.get_sentence(config['train_list'] if self.train else config['eval_list'],
                          training=self.train)

    def get_sentence(self, data_path, training):
        from tqdm import tqdm
        with open(data_path, encoding='utf-8') as f:
            data = f.readlines()
        txts = []
        for txt in tqdm(data):
            txt = txt.strip()
            if len(txt) > 150:
                continue
            txts.append(txt)
        if training:
            num = len(txts)
            train = txts[:int(num * 0.99)]
            test = txts[int(num * 0.99):]
            self.train_list, self.test_list = train, test
            self.train_offset = 0
            self.test_offset = 0
        else:
            self.test_texts = txts
            self.offset = 0

    def preprocess(self, txts):
        x = []
        for txt in txts:
            x_ = [self.vocab_featurizer.startid()]
            for i in txt:
                x_.append(self.vocab_featurizer.token_to_index[i])
            x_.append(self.vocab_featurizer.endid())
            x.append(np.array(x_))
        return x

    def bert_decode(self, x, x2=None):
        tokens, segs = [], []
        if x2 is not None:
            for i, j in zip(x, x2):
                t, s = self.bert_token.encode(''.join(i))
                index = np.where(j == 2)[0]
                if len(index) > 0:
                    for n in index:
                        t[int(n)] = 103  # 103 is BERT's [MASK] token id
                tokens.append(t)
                segs.append(s)
        else:
            for i in x:
                t, s = self.bert_token.encode(''.join(i))
                tokens.append(t)
                segs.append(s)
        return tokens, segs
    def pad(self, x, mode=1):
        length = 0
        for i in x:
            length = max(length, len(i))
        if mode == 2:
            # pad 2-D feature arrays with the -10. mask value
            for i in range(len(x)):
                padding = np.ones([length - len(x[i]), x[i].shape[1]]) * -10.
                x[i] = np.vstack((x[i], padding))
        elif mode == 3:
            # pad 2-D arrays with zeros
            for i in range(len(x)):
                padding = np.zeros([length - len(x[i]), x[i].shape[1]])
                x[i] = np.vstack((x[i], padding))
        else:
            x = pad_sequences(x, length, padding='post', truncating='post')
        return x

    def get_bert_feature(self, bert_t, bert_s):
        length = [len(i) for i in bert_t]
        max_len = max(length)
        bert_s = tf.keras.preprocessing.sequence.pad_sequences(
            bert_s, max_len, padding='post', truncating='post')
        bert_t = tf.keras.preprocessing.sequence.pad_sequences(
            bert_t, max_len, padding='post', truncating='post')
        features = self.bert.predict([bert_t, bert_s])
        for idx, l in enumerate(length):
            features[idx, l:] = -10.  # mask padded time steps
        return features

    def get_target(self, text):
        # split text into plain characters plus per-position label groups:
        # each group starts with 0, and a punctuation mark is appended to the
        # group of the character it follows
        bd = self.bd
        zh = []
        bd_ = [[0]]
        for n in text:
            if n in bd:
                bd_[-1].append(bd.index(n))
            else:
                zh.append(n)
                bd_.append([0])
        zh_txt = ''.join(zh)
        bd_txt = bd_ + [[0]]  # trailing group for the end token
        return zh_txt, bd_txt

    def process_punc(self, puncs):
        # flatten label groups to one label per position:
        # 1 = no punctuation, otherwise the mark's index in self.bd
        x = []
        for punc in puncs:
            x_ = []
            for i in range(len(punc)):
                if len(punc[i]) == 1:
                    x_ += [1]
                else:
                    x_ += punc[i][-1:]
            x.append(np.array(x_, 'int32'))
        return x

    def check_valid(self, txt, vocab_list):
        # True if every character is in the vocabulary, False for empty input,
        # otherwise the first out-of-vocabulary character
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def generate(self, train):
        trainx = []
        trainy = []
        for i in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            line = line.strip()
            if len(line) < 30:
                # pad out short lines with a random extra sentence
                extra = random.sample(self.train_list, 1)[0]
                extra = extra.strip()
                line += extra
            if self.check_valid(line, self.vocab_featurizer.vocab_array + self.bd) is not True:
                continue
            try:
                x, y = self.get_target(line)
            except Exception:
                continue
            trainx.append(x)
            trainy.append(y)
            if len(trainx) == self.batch:
                break
        inp_tokens = self.preprocess(trainx)
        e_bert_t, e_bert_s = self.bert_decode(trainx)
        e_features = self.get_bert_feature(e_bert_t, e_bert_s)
        trainy = self.process_punc(trainy)
        inp_tokens = self.pad(inp_tokens)
        trainy = self.pad(trainy)
        e_features = self.pad(e_features, 2)
        inp_tokens = np.array(inp_tokens)
        trainy = np.array(trainy)
        e_features = np.array(e_features, dtype='float32')
        return inp_tokens, trainy, e_features

    def generator(self, train=True):
        while True:
            x, y, features = self.generate(train)
            # skip batches whose time dimensions disagree
            if x.shape[1] != y.shape[1] or y.shape[1] != features.shape[1]:
                logging.info('bad batch, skip')
                continue
            yield x, y, features
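# Worked example of the punctuation-label scheme implemented by get_target and
# process_punc above. A standalone sketch with a hypothetical punctuation
# vocabulary (`bd`); in the real loader, bd comes from the punc_biaodian
# featurizer, and process_punc emits label 1 for "no punctuation".
def _punc_label_example():
    bd = ['<pad>', '<none>', '，', '。']  # hypothetical vocab_array
    text = '你好，世界。'
    zh, groups = [], [[0]]
    for ch in text:
        if ch in bd:
            groups[-1].append(bd.index(ch))  # mark labels the char it follows
        else:
            zh.append(ch)
            groups.append([0])
    groups.append([0])  # trailing group for the end token
    # flatten exactly as process_punc does: 1 = no punctuation
    labels = [1 if len(g) == 1 else g[-1] for g in groups]
    return ''.join(zh), labels  # ('你好世界', [1, 1, 2, 1, 3, 1])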
# NOTE: this class reuses the name LM_DataLoader; if both definitions stay in
# one module, this one shadows the first.
class LM_DataLoader():
    def __init__(self, config, training=True):
        self.train = training
        self.init_all(config)
        self.for_multi_task = config['am_token']['for_multi_task']
        self.am_featurizer = TextFeaturizer(config['am_token'])
        self.lm_featurizer = TextFeaturizer(config['lm_token'])
        self.init_text_to_vocab()
        self.batch = config['running_config']['batch_size']
        self.epochs = 1
        self.config = config

    def init_bert(self, config, checkpoint):
        model = load_trained_model_from_checkpoint(config, checkpoint,
                                                   trainable=False, seq_len=None)
        return model

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('saved train list does not match the current one, data loader uses the initial state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('state file not found, using initial state')
        except Exception:
            logging.info('loading state failed, using initial state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        return (tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        return (tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None, 768]))

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_texts) // self.batch

    def init_all(self, config):
        if self.train:
            bert_config = config['bert']['config_json']
            bert_checkpoint = config['bert']['bert_ckpt']
            bert_vocab = config['bert']['bert_vocab']
            bert_vocabs = load_vocabulary(bert_vocab)
            self.bert_token = Tokenizer(bert_vocabs)
            self.bert = self.init_bert(bert_config, bert_checkpoint)
        self.get_sentence(config['train_list'] if self.train else config['eval_list'],
                          training=self.train)

    def init_text_to_vocab(self):
        # register pronunciations pypinyin gets wrong by default
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
        })

        def text_to_vocab_func(txt):
            if self.for_multi_task:
                # style 8 is pypinyin's TONE3: tone as a trailing digit, e.g. 'zhong1'
                pys = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True)
                return [i[0] for i in pys]
            pys = pypinyin.pinyin(txt)
            return [i[0] for i in pys]

        self.text_to_vocab = text_to_vocab_func

    def get_sentence(self, data_path, training):
        from tqdm import tqdm
        with open(data_path, encoding='utf-8') as f:
            data = f.readlines()
        txts = []
        for txt in tqdm(data):
            txt = txt.strip()
            if len(txt) > 150:
                continue
            txts.append(txt)
        if training:
            num = len(txts)
            train = txts[:int(num * 0.99)]
            test = txts[int(num * 0.99):]
            self.train_list, self.test_list = train, test
            self.train_offset = 0
            self.test_offset = 0
        else:
            self.test_texts = txts
            self.offset = 0

    def preprocess(self, tokens, txts):
        x = []
        y = []
        new = []
        for token, txt in zip(tokens, txts):
            # check_valid returns True, False, or the offending token
            if self.check_valid(token, self.am_featurizer.vocab_array) is not True:
                logging.info('{} pinyin not all in vocab, skip'.format(txt))
                continue
            if self.check_valid(txt, self.lm_featurizer.vocab_array) is not True:
                logging.info('{} not all in vocab, skip'.format(txt))
                continue
            x_ = [self.am_featurizer.startid()]
            y_ = [self.lm_featurizer.startid()]
            for i in token:
                x_.append(self.am_featurizer.token_to_index[i])
            for i in txt:
                y_.append(self.lm_featurizer.token_to_index[i])
            x_.append(self.am_featurizer.endid())
            y_.append(self.lm_featurizer.endid())
            x.append(np.array(x_))
            y.append(np.array(y_))
            new.append(txt)
        return x, y, new

    def only_chinese(self, word):
        # keep only CJK unified ideographs
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch
        return txt

    def bert_decode(self, x, x2=None):
        tokens, segs = [], []
        if x2 is not None:
            for i, j in zip(x, x2):
                t, s = self.bert_token.encode(''.join(i))
                index = np.where(j == 2)[0]
                if len(index) > 0:
                    for n in index:
                        t[int(n)] = 103  # 103 is BERT's [MASK] token id
                tokens.append(t)
                segs.append(s)
        else:
            for i in x:
                t, s = self.bert_token.encode(''.join(i))
                tokens.append(t)
                segs.append(s)
        return tokens, segs

    def pad(self, x, mode=1):
        length = 0
        for i in x:
            length = max(length, len(i))
        if mode == 2:
            # pad 2-D feature arrays with the -10. mask value
            for i in range(len(x)):
                padding = np.ones([length - len(x[i]), x[i].shape[1]]) * -10.
                x[i] = np.vstack((x[i], padding))
        else:
            x = pad_sequences(x, length, padding='post', truncating='post')
        return x

    def get_bert_feature(self, bert_t, bert_s):
        length = [len(i) for i in bert_t]
        max_len = max(length)
        bert_s = tf.keras.preprocessing.sequence.pad_sequences(
            bert_s, max_len, padding='post', truncating='post')
        bert_t = tf.keras.preprocessing.sequence.pad_sequences(
            bert_t, max_len, padding='post', truncating='post')
        features = self.bert.predict([bert_t, bert_s])
        for idx, l in enumerate(length):
            features[idx, l:] = -10.  # mask padded time steps
        return features

    def check_valid(self, txt, vocab_list):
        # True if every token is in the vocabulary, False for empty input,
        # otherwise the first out-of-vocabulary token
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def generate(self, train=True):
        samples = []
        x = []
        y = []
        for i in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            txt = line.strip()
            txt = txt.replace(' ', '')
            if self.config['only_chinese']:
                txt = self.only_chinese(txt)
            py = self.text_to_vocab(txt)
            valid = self.check_valid(py, self.am_featurizer.vocab_array)
            if valid is not True:
                logging.info('{} pinyin {} not in vocab, skip'.format(txt, valid))
                continue
            valid = self.check_valid(txt, self.lm_featurizer.vocab_array)
            if valid is not True:
                logging.info('{} char {} not in vocab, skip'.format(txt, valid))
                continue
            x_ = [self.am_featurizer.startid()]
            y_ = [self.lm_featurizer.startid()]
            for i in py:
                x_.append(self.am_featurizer.token_to_index[i])
            for i in txt:
                y_.append(self.lm_featurizer.token_to_index[i])
            x_.append(self.am_featurizer.endid())
            y_.append(self.lm_featurizer.endid())
            x.append(np.array(x_))
            y.append(np.array(y_))
            samples.append(txt)
            if len(samples) == self.batch:
                break
        e_bert_t, e_bert_s = self.bert_decode(samples)
        e_features = self.get_bert_feature(e_bert_t, e_bert_s)
        x = self.pad(x)
        y = self.pad(y)
        e_features = self.pad(e_features, 2)
        x = np.array(x)
        y = np.array(y)
        e_features = np.array(e_features, dtype='float32')
        return x, y, e_features
    def eval_generate(self):
        samples = []
        x = []
        y = []
        for i in range(self.batch * 10):
            line = self.test_texts[self.offset]
            self.offset += 1
            if self.offset > len(self.test_texts) - 1:
                self.offset = 0
            txt = line.strip()
            txt = txt.replace(' ', '')
            if self.config['only_chinese']:
                txt = self.only_chinese(txt)
            py = self.text_to_vocab(txt)
            valid = self.check_valid(py, self.am_featurizer.vocab_array)
            if valid is not True:
                logging.info('{} pinyin {} not in vocab, skip'.format(txt, valid))
                continue
            valid = self.check_valid(txt, self.lm_featurizer.vocab_array)
            if valid is not True:
                logging.info('{} char {} not in vocab, skip'.format(txt, valid))
                continue
            x_ = [self.am_featurizer.startid()]
            y_ = [self.lm_featurizer.startid()]
            for i in py:
                x_.append(self.am_featurizer.token_to_index[i])
            for i in txt:
                y_.append(self.lm_featurizer.token_to_index[i])
            x_.append(self.am_featurizer.endid())
            y_.append(self.lm_featurizer.endid())
            x.append(np.array(x_))
            y.append(np.array(y_))
            samples.append(txt)
            if len(samples) == self.batch:
                break
        x = self.pad(x)
        y = self.pad(y)
        x = np.array(x, 'int32')
        y = np.array(y, 'int32')
        return x, y

    def generator(self, train=True):
        while True:
            x, y, features = self.generate(train)
            if len(x) == 0:
                logging.info('loaded zero valid samples, continue')
                continue
            yield x, y, features
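# Illustration of the two pypinyin styles used by init_text_to_vocab above:
# style 8 is pypinyin's TONE3 (tone as a trailing digit), which the multi-task
# AM token set expects, while the default style keeps tone diacritics. A
# standalone sketch; the outputs in the comment are typical examples, not
# verified against the loaded phrase dict.
def _pinyin_style_example(text='水浒传'):
    tone3 = [p[0] for p in pypinyin.pinyin(text, 8, neutral_tone_with_five=True)]
    marks = [p[0] for p in pypinyin.pinyin(text)]
    return tone3, marks  # e.g. (['shui3', 'hu3', 'zhuan4'], ['shuǐ', 'hǔ', 'zhuàn'])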