def __init__(self, config, training=True):
    """Initialise the loader from ``config``.

    Args:
        config: mapping providing the ``punc_vocab``, ``punc_biaodian`` and
            ``running_config`` entries; it is also handed to ``init_all``.
        training: stored as ``self.train`` before ``init_all`` is invoked.
    """
    self.train = training
    self.init_all(config)

    # One featurizer per vocabulary section of the config.
    vocab_featurizer = TextFeaturizer(config['punc_vocab'])
    self.vocab_featurizer = vocab_featurizer
    bd_featurizer = TextFeaturizer(config['punc_biaodian'])
    self.bd_featurizer = bd_featurizer
    self.bd = bd_featurizer.vocab_array

    running_config = config['running_config']
    self.batch = running_config['batch_size']
    self.epochs = 1
def __init__(self, config, training=True):
    """Initialise the loader from ``config``.

    Args:
        config: mapping providing the ``lm_vocab``, ``lm_word`` and
            ``running_config`` entries; it is also handed to ``init_all``.
        training: stored as ``self.train`` before ``init_all`` is invoked.
    """
    self.train = training
    self.init_all(config)

    vocab_section = config['lm_vocab']
    self.vocab_featurizer = TextFeaturizer(vocab_section)
    word_section = config['lm_word']
    self.word_featurizer = TextFeaturizer(word_section)

    # Must run after the featurizers exist, exactly as in the call order
    # this constructor has always used.
    self.init_text_to_vocab()

    running_config = config['running_config']
    self.batch = running_config['batch_size']
    self.epochs = 1
def __init__(self, config, training=True):
    """Initialise the loader from ``config``.

    Args:
        config: mapping providing the ``am_token``, ``lm_token`` and
            ``running_config`` entries; it is passed to ``init_all`` and
            kept as ``self.config``.
        training: stored as ``self.train`` before ``init_all`` is invoked.
    """
    self.train = training
    self.init_all(config)

    am_section = config['am_token']
    self.for_multi_task = am_section['for_multi_task']
    self.am_featurizer = TextFeaturizer(am_section)
    self.lm_featurizer = TextFeaturizer(config['lm_token'])

    self.init_text_to_vocab()

    running_config = config['running_config']
    self.batch = running_config['batch_size']
    self.epochs = 1
    self.config = config
def __init__(self, config):
    """Store ``config`` and publish both vocabulary sizes in its model config.

    Args:
        config: mapping providing ``lm_vocab``, ``lm_word`` and
            ``model_config``. The ``model_config`` mapping is modified in
            place.
    """
    self.config = config
    self.vocab_featurizer = TextFeaturizer(config['lm_vocab'])
    self.word_featurizer = TextFeaturizer(config['lm_word'])

    model_config = self.config['model_config']
    self.model_config = model_config

    # Read both sizes first, then write them, so a failure while reading
    # leaves the model config untouched.
    input_size = self.vocab_featurizer.num_classes
    target_size = self.word_featurizer.num_classes
    model_config.update({
        'input_vocab_size': input_size,
        'target_vocab_size': target_size,
    })
def __init__(self, config, training=True):
    """Initialise featurizers, file lists and padding constants.

    Args:
        config: mapping used for both featurizers and read for
            ``batch_size``, ``train_list``/``eval_list`` and
            ``max_abs_value``.
        training: selects ``train_list`` (True) or ``eval_list`` (False)
            and is forwarded to ``make_file_list``.
    """
    self.speech_featurizer = SpeechFeaturizer(config)
    self.text_featurizer = TextFeaturizer(config)
    self.config = config
    self.batch = config['batch_size']

    if training:
        list_path = self.config['train_list']
    else:
        list_path = self.config['eval_list']
    self.make_file_list(list_path, training=training)

    max_abs_value = self.config['max_abs_value']
    self.min_value = -max_abs_value
    # Target padding sits just below the smallest value derived above.
    self._target_pad = -(self.config['max_abs_value'] + 0.1)
    self._token_pad = 1.

    self.epochs = 1
    self.steps = 0
def multi_task_model(self, training):
    """Create ``self.model`` as a ``ConformerMultiTaskCTC``.

    The class counts of the three decoder vocabularies are written into
    ``self.model_config`` (keys ``classes1``..``classes3``) before the model
    is constructed.

    Args:
        training: forwarded to the model constructor.
    """
    from AMmodel.MultiConformer import ConformerMultiTaskCTC

    decoder_keys = ('decoder1_config', 'decoder2_config', 'decoder3_config')
    # Build every featurizer before touching the model config.
    featurizers = [TextFeaturizer(self.config[key]) for key in decoder_keys]

    class_counts = {
        'classes1': featurizers[0].num_classes,
        'classes2': featurizers[1].num_classes,
        'classes3': featurizers[2].num_classes,
    }
    self.model_config.update(class_counts)

    self.model = ConformerMultiTaskCTC(
        self.model_config,
        training=training,
        speech_config=self.speech_config,
    )
def __init__(self, config_dict, training=True):
    """Initialise featurizers, file lists and augmentation.

    Args:
        config_dict: mapping providing ``speech_config``,
            ``decoder_config``, ``augments_config`` and
            ``learning_config`` (for the batch size).
        training: selects ``train_list`` (True) or ``eval_list`` (False)
            from the speech config and is forwarded to ``make_file_list``.
    """
    speech_config = config_dict['speech_config']
    self.speech_config = speech_config
    self.text_config = config_dict['decoder_config']
    self.augment_config = config_dict['augments_config']

    running_config = config_dict['learning_config']['running_config']
    self.batch = running_config['batch_size']

    self.speech_featurizer = SpeechFeaturizer(self.speech_config)
    self.text_featurizer = TextFeaturizer(self.text_config)

    if training:
        list_path = self.speech_config['train_list']
    else:
        list_path = self.speech_config['eval_list']
    self.make_file_list(list_path, training)

    self.augment = Augmentation(self.augment_config)
    self.init_text_to_vocab()

    self.epochs = 1
    self.LAS = False
    self.steps = 0
def __init__(self, config):
    """Load the acoustic model and build the matching featurizers.

    Args:
        config: mapping handed to ``AM`` and read for ``speech_config``
            and ``decoder_config``.
    """
    acoustic_model = AM(config)
    self.am = acoustic_model
    # ``False`` is passed straight through to ``load_model``.
    acoustic_model.load_model(False)

    self.speech_config = config['speech_config']
    self.text_config = config['decoder_config']

    self.speech_feature = SpeechFeaturizer(self.speech_config)
    self.text_featurizer = TextFeaturizer(self.text_config)

    start_token = self.text_featurizer.start
    self.decoded = tf.constant([start_token])
def __init__(self, config=None, vocoder_config=None):
    """Store the acoustic and/or vocoder configuration.

    At least one of the two configs has to be supplied.

    Args:
        config: acoustic config; when given, its ``model_name`` is kept as
            ``self.acoustic`` and a ``TextFeaturizer`` is built from it.
        vocoder_config: vocoder config; when given, it drives ``self.GL``
            and its ``vocoder_model`` is kept as ``self.vocoder_type``.
    """
    assert config is not None or vocoder_config is not None,'must one config'

    self.config = config
    if config is not None:
        self.acoustic = config['model_name']

    self.vocoder_config = vocoder_config
    if vocoder_config is None:
        # No vocoder config: the mel inversion comes from the acoustic config.
        self.GL = SpeechFeaturizer(config).inv_mel_spectrogram
        self.vocoder_type = None
    else:
        self.GL = SpeechFeaturizer(vocoder_config).inv_mel_spectrogram
        self.vocoder_type = vocoder_config['vocoder_model']

    if self.config is not None:
        self.text_featurizer = TextFeaturizer(config)
def multi_task_model(self, training):
    """Create ``self.model`` as a ``ConformerMultiTaskLAS``.

    The class counts of decoders 1-3 go into ``self.model_config``; the
    class count and start id of decoder 4 go into its ``LAS_decoder`` entry.

    Args:
        training: forwarded to the model constructor.
    """
    from AMmodel.MultiConformer import ConformerMultiTaskLAS

    decoder_keys = (
        'decoder1_config',
        'decoder2_config',
        'decoder3_config',
        'decoder4_config',
    )
    # Build every featurizer before touching the model config.
    first, second, third, fourth = [
        TextFeaturizer(self.config[key]) for key in decoder_keys
    ]

    class_counts = {
        'classes1': first.num_classes,
        'classes2': second.num_classes,
        'classes3': third.num_classes,
    }
    self.model_config.update(class_counts)

    las_decoder = self.model_config['LAS_decoder']
    las_decoder.update({'n_classes': fourth.num_classes})
    las_decoder.update({'startid': fourth.start})

    tflite_flag = self.model_config['enable_tflite_convertible']
    self.model = ConformerMultiTaskLAS(
        self.model_config,
        training=training,
        enable_tflite_convertible=tflite_flag,
    )
def __init__(self, config):
    """Store ``config`` and build the speech and text featurizers.

    The text config is taken from ``decoder_config`` and falls back to
    ``decoder4_config`` when that key is absent.

    Args:
        config: mapping providing ``speech_config``, ``model_config`` and
            one of ``decoder_config`` / ``decoder4_config``.

    Raises:
        KeyError: if neither decoder key (or another required key) exists.
    """
    self.config = config
    self.update_model_type()
    self.speech_config = self.config['speech_config']
    try:
        self.text_config = self.config['decoder_config']
    except KeyError:
        # Only a missing key should trigger the fallback; the previous bare
        # ``except`` also hid unrelated errors and KeyboardInterrupt.
        self.text_config = self.config['decoder4_config']
    self.model_config = self.config['model_config']
    self.text_feature = TextFeaturizer(self.text_config)
    self.speech_feature = SpeechFeaturizer(self.speech_config)
    self.init_steps = None
def __init__(self, config):
    """Store ``config`` and build the speech and text featurizers.

    Args:
        config: mapping providing ``speech_config``, ``model_config`` and
            the decoder config (``decoder3_config`` when the model type is
            ``'MultiTask'``, otherwise ``decoder_config``).
    """
    self.config = config
    # Must run first: the decoder key below depends on ``self.model_type``.
    self.update_model_type()
    self.speech_config = self.config['speech_config']

    if self.model_type == 'MultiTask':
        decoder_key = 'decoder3_config'
    else:
        decoder_key = 'decoder_config'
    self.text_config = self.config[decoder_key]

    self.model_config = self.config['model_config']
    # The second positional argument is passed to TextFeaturizer unchanged.
    self.text_feature = TextFeaturizer(self.text_config, True)
    self.speech_feature = SpeechFeaturizer(self.speech_config)
    self.init_steps = None
def __init__(self, config, punc_config=None):
    """Store the LM config and, optionally, the punctuation config.

    Vocabulary sizes are written into the respective ``model_config``
    mappings in place.

    Args:
        config: mapping providing ``am_token``, ``lm_token`` and
            ``model_config``.
        punc_config: optional mapping providing ``punc_vocab``,
            ``punc_biaodian`` and ``model_config``; skipped when falsy.
    """
    self.config = config
    self.am_featurizer = TextFeaturizer(config['am_token'])
    self.lm_featurizer = TextFeaturizer(config['lm_token'])

    self.model_config = self.config['model_config']
    lm_sizes = {
        'input_vocab_size': self.am_featurizer.num_classes,
        'target_vocab_size': self.lm_featurizer.num_classes,
    }
    self.model_config.update(lm_sizes)

    self.punc_config = punc_config
    if not punc_config:
        return

    self.punc_vocab_featurizer = TextFeaturizer(punc_config['punc_vocab'])
    self.punc_bd_featurizer = TextFeaturizer(punc_config['punc_biaodian'])

    self.punc_model_config = self.punc_config['model_config']
    punc_sizes = {
        'input_vocab_size': self.punc_vocab_featurizer.num_classes,
        'bd_vocab_size': self.punc_bd_featurizer.num_classes,
    }
    self.punc_model_config.update(punc_sizes)
def __init__(self, config_dict, training=True):
    """Initialise featurizers, file lists, maps, augmentation and BERT.

    Args:
        config_dict: mapping providing ``speech_config``,
            ``decoder1_config``..``decoder4_config``, ``augments_config``
            and ``learning_config``; it is also passed to ``make_maps``
            and ``init_bert``.
        training: selects ``train_list`` (True) or ``eval_list`` (False)
            from the speech config and is forwarded to ``make_file_list``.
    """
    self.speech_config = config_dict['speech_config']
    self.text1_config = config_dict['decoder1_config']
    self.text2_config = config_dict['decoder2_config']
    self.text3_config = config_dict['decoder3_config']
    self.text4_config = config_dict['decoder4_config']
    self.augment_config = config_dict['augments_config']

    running_config = config_dict['learning_config']['running_config']
    self.batch = running_config['batch_size']

    self.speech_featurizer = SpeechFeaturizer(self.speech_config)
    # One text featurizer per decoder config, created in decoder order.
    self.token1_featurizer = TextFeaturizer(self.text1_config)
    self.token2_featurizer = TextFeaturizer(self.text2_config)
    self.token3_featurizer = TextFeaturizer(self.text3_config)
    self.token4_featurizer = TextFeaturizer(self.text4_config)

    if training:
        list_path = self.speech_config['train_list']
    else:
        list_path = self.speech_config['eval_list']
    self.make_file_list(list_path, training)
    self.make_maps(config_dict)

    self.augment = Augmentation(self.augment_config)
    self.epochs = 1
    self.LAS = True
    self.steps = 0
    self.init_bert(config_dict)
class LM_DataLoader():
    """Batch generator pairing pinyin token ids with character ids.

    Training batches additionally carry features produced by a frozen BERT
    model (loaded in ``init_all`` when ``training`` is true).
    """

    def __init__(self, config, training=True):
        """Initialise text lists, featurizers and batch settings.

        Args:
            config: mapping providing ``am_token``, ``lm_token``,
                ``running_config``, ``train_list``/``eval_list``,
                ``only_chinese`` and (for training) ``bert``.
            training: True to load the training list and the BERT model.
        """
        self.train = training
        self.init_all(config)
        self.for_multi_task = config['am_token']['for_multi_task']
        self.am_featurizer = TextFeaturizer(config['am_token'])
        self.lm_featurizer = TextFeaturizer(config['lm_token'])
        self.init_text_to_vocab()
        self.batch = config['running_config']['batch_size']
        self.epochs = 1
        self.config = config

    def init_bert(self, config, checkpoint):
        """Return a non-trainable BERT model loaded from ``checkpoint``."""
        model = load_trained_model_from_checkpoint(config, checkpoint, trainable=False, seq_len=None)
        return model

    def load_state(self, outdir):
        """Restore epoch and offset from ``outdir/dg_state.npz`` if usable.

        When the saved train list length differs from the current one the
        counters are reset instead. Any failure is logged and leaves the
        loader in its current state.
        """
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('history train list not equal new load train list ,data loader use init state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file,init state')
        except Exception:
            # Best effort: a corrupt state file must not abort training, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logging.info('load state falied,use init state')

    def save_state(self, outdir):
        """Write epoch, train offset and train list to ``outdir/dg_state.npz``."""
        np.savez(os.path.join(outdir, 'dg_state.npz'), epoch=self.epochs, train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        """Return the dtypes of the three elements yielded by ``generator``."""
        return (tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        """Return the shapes of the three elements yielded by ``generator``."""
        return (
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None, 768])
        )

    def get_per_epoch_steps(self):
        """Return the number of full training batches per epoch."""
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        """Return the number of full evaluation batches.

        ``get_sentence`` stores the evaluation texts as ``test_texts`` in
        eval mode but as ``test_list`` in training mode, so fall back to the
        latter instead of raising ``AttributeError`` while training.
        """
        texts = getattr(self, 'test_texts', None)
        if texts is None:
            texts = self.test_list
        return len(texts) // self.batch

    def init_all(self, config):
        """Load BERT (training only) and read the sentence list."""
        if self.train:
            bert_config = config['bert']['config_json']
            bert_checkpoint = config['bert']['bert_ckpt']
            bert_vocab = config['bert']['bert_vocab']
            bert_vocabs = load_vocabulary(bert_vocab)
            self.bert_token = Tokenizer(bert_vocabs)
            self.bert = self.init_bert(bert_config, bert_checkpoint)
        self.get_sentence(config['train_list'] if self.train else config['eval_list'], training=self.train)

    def init_text_to_vocab(self):
        """Register custom pinyin phrases and install ``self.text_to_vocab``."""
        pypinyin.load_phrases_dict({'调大': [['tiáo'], ['dà']],
                                    '调小': [['tiáo'], ['xiǎo']],
                                    '调亮': [['tiáo'], ['liàng']],
                                    '调暗': [['tiáo'], ['àn']],
                                    '肖': [['xiāo']],
                                    '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
                                    '新传': [['xīn'], ['zhuàn']],
                                    '外传': [['wài'], ['zhuàn']],
                                    '正传': [['zhèng'], ['zhuàn']], '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
                                    })

        def text_to_vocab_func(txt):
            if self.for_multi_task:
                # NOTE(review): style 8 is presumably pypinyin's TONE3 style
                # (tone number appended) - confirm against the token list.
                pys = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True)
            else:
                pys = pypinyin.pinyin(txt)
            return [i[0] for i in pys]

        self.text_to_vocab = text_to_vocab_func

    def get_sentence(self, data_path, training):
        """Read ``data_path`` and keep the lines of at most 150 characters.

        In training mode the lines are split 99%/1% into ``train_list`` and
        ``test_list``; otherwise they are all stored as ``test_texts``.
        """
        from tqdm import tqdm
        with open(data_path, encoding='utf-8') as f:
            data = f.readlines()
        txts = []
        for txt in tqdm(data):
            txt = txt.strip()
            if len(txt) > 150:
                continue
            txts.append(txt)
        if training:
            split_at = int(len(txts) * 0.99)
            self.train_list, self.test_list = txts[:split_at], txts[split_at:]
            self.train_offset = 0
            self.test_offset = 0
        else:
            self.test_texts = txts
            self.offset = 0

    def preprocess(self, tokens, txts):
        """Convert parallel token/text sequences to id arrays.

        Pairs containing a symbol missing from either vocabulary are logged
        and skipped.

        Returns:
            ``(x, y, new)``: lists of id arrays for tokens and texts, and the
            texts that were kept.
        """
        x = []
        y = []
        new = []
        for token, txt in zip(tokens, txts):
            # ``check_valid`` returns the offending symbol (truthy) on
            # failure, so the result has to be compared against True.
            if self.check_valid(token, self.am_featurizer.vocab_array) is not True:
                logging.info('{} pinyin not all in token,continue'.format(txt))
                continue
            if self.check_valid(txt, self.lm_featurizer.vocab_array) is not True:
                logging.info('{} not all in token,continue'.format(txt))
                continue
            x_ = [self.am_featurizer.startid()]
            y_ = [self.lm_featurizer.startid()]
            x_.extend(self.am_featurizer.token_to_index[i] for i in token)
            y_.extend(self.lm_featurizer.token_to_index[i] for i in txt)
            x_.append(self.am_featurizer.endid())
            y_.append(self.lm_featurizer.endid())
            x.append(np.array(x_))
            y.append(np.array(y_))
            new.append(txt)
        return x, y, new

    def only_chinese(self, word):
        """Return ``word`` reduced to characters in U+4E00..U+9FFF."""
        return ''.join(ch for ch in word if '\u4e00' <= ch <= '\u9fff')

    def bert_decode(self, x, x2=None):
        """Tokenise each item of ``x`` with the BERT tokenizer.

        When ``x2`` is given, positions where the matching ``x2`` entry
        equals 2 are overwritten with token id 103.
        NOTE(review): 103 is presumably the [MASK] id of the BERT vocabulary
        in use - confirm.

        Returns:
            ``(tokens, segs)`` lists as produced by ``bert_token.encode``.
        """
        tokens, segs = [], []
        if x2 is not None:
            for i, j in zip(x, x2):
                t, s = self.bert_token.encode(''.join(i))
                for n in np.where(j == 2)[0]:
                    t[int(n)] = 103
                tokens.append(t)
                segs.append(s)
        else:
            for i in x:
                t, s = self.bert_token.encode(''.join(i))
                tokens.append(t)
                segs.append(s)
        return tokens, segs

    def pad(self, x, mode=1):
        """Pad the sequences in ``x`` to their common maximum length.

        ``mode=2`` pads 2-D items row-wise with -10.; any other mode uses
        ``pad_sequences`` with post padding.
        """
        length = 0
        for i in x:
            length = max(length, len(i))
        if mode == 2:
            for idx in range(len(x)):
                pading = np.ones([length - len(x[idx]), x[idx].shape[1]]) * -10.
                x[idx] = np.vstack((x[idx], pading))
        else:
            x = pad_sequences(x, length, padding='post', truncating='post')
        return x

    def get_bert_feature(self, bert_t, bert_s):
        """Run BERT on the padded inputs and mask the padding with -10."""
        length = [len(i) for i in bert_t]
        max_len = max(length)
        bert_s = tf.keras.preprocessing.sequence.pad_sequences(bert_s, max_len, padding='post', truncating='post')
        bert_t = tf.keras.preprocessing.sequence.pad_sequences(bert_t, max_len, padding='post', truncating='post')
        features = self.bert.predict([bert_t, bert_s])
        for idx, l in enumerate(length):
            features[idx, l:] = -10.
        return features

    def check_valid(self, txt, vocab_list):
        """Check that every symbol of ``txt`` is in ``vocab_list``.

        Returns:
            ``True`` when all symbols are known, ``False`` for empty input,
            otherwise the first unknown symbol. Callers must therefore test
            the result with ``is True`` / ``is not True``.
        """
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def _encode_line(self, line):
        """Turn one raw text line into ``(x_ids, y_ids, txt)``.

        Returns ``None`` (after logging) when the pinyin or the text holds a
        symbol outside the respective vocabulary.
        """
        txt = line.strip()
        txt = txt.replace(' ', '')
        if self.config['only_chinese']:
            txt = self.only_chinese(txt)
        py = self.text_to_vocab(txt)

        # Validate once and reuse the result for the log message.
        checked = self.check_valid(py, self.am_featurizer.vocab_array)
        if checked is not True:
            logging.info('{} pinyin {} not in token,skip'.format(txt, checked))
            return None
        checked = self.check_valid(txt, self.lm_featurizer.vocab_array)
        if checked is not True:
            logging.info('{} txt {} not in token,skip'.format(txt, checked))
            return None

        x_ = [self.am_featurizer.startid()]
        y_ = [self.lm_featurizer.startid()]
        x_.extend(self.am_featurizer.token_to_index[i] for i in py)
        y_.extend(self.lm_featurizer.token_to_index[i] for i in txt)
        x_.append(self.am_featurizer.endid())
        y_.append(self.lm_featurizer.endid())
        return np.array(x_), np.array(y_), txt

    def generate(self, train=True):
        """Build one batch of ``(x, y, bert_features)``.

        At most ``batch * 10`` lines are consumed while looking for
        ``batch`` valid samples. Empty arrays are returned when none is
        found so that ``generator`` can skip the batch.
        """
        samples = []
        x = []
        y = []
        for _ in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    # End of the list: reshuffle and start a new epoch.
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            encoded = self._encode_line(line)
            if encoded is None:
                continue
            x_, y_, txt = encoded
            x.append(x_)
            y.append(y_)
            samples.append(txt)
            if len(samples) == self.batch:
                break
        if not samples:
            # BERT feature extraction cannot run on an empty batch.
            return (np.array([], dtype='int32'), np.array([], dtype='int32'),
                    np.array([], dtype='float32'))
        e_bert_t, e_bert_s = self.bert_decode(samples)
        e_features = self.get_bert_feature(e_bert_t, e_bert_s)
        x = self.pad(x)
        y = self.pad(y)
        e_features = self.pad(e_features, 2)

        x = np.array(x)
        y = np.array(y)
        e_features = np.array(e_features, dtype='float32')

        return x, y, e_features

    def eval_generate(self, ):
        """Build one evaluation batch ``(x, y)`` from ``test_texts``."""
        samples = []
        x = []
        y = []
        for _ in range(self.batch * 10):
            line = self.test_texts[self.offset]
            self.offset += 1
            if self.offset > len(self.test_texts) - 1:
                self.offset = 0
            encoded = self._encode_line(line)
            if encoded is None:
                continue
            x_, y_, txt = encoded
            x.append(x_)
            y.append(y_)
            samples.append(txt)
            if len(samples) == self.batch:
                break
        x = self.pad(x)
        y = self.pad(y)
        x = np.array(x, 'int32')
        y = np.array(y, 'int32')
        return x, y

    def generator(self, train=True):
        """Yield ``(x, y, features)`` batches forever, skipping empty ones."""
        while 1:
            x, y, features = self.generate(train)
            if len(x) == 0:
                logging.info('load data length zero,continue')
                continue
            yield x, y, features
class AM_DataLoader():
    """Batch generator producing speech features and pinyin label ids.

    Each line of the list file is ``<wav path>\\t<text>``.
    """

    def __init__(self, config_dict, training=True):
        """Initialise featurizers, file lists and augmentation.

        Args:
            config_dict: mapping providing ``speech_config``,
                ``decoder_config``, ``augments_config`` and
                ``learning_config`` (for the batch size).
            training: True to read ``train_list``, False for ``eval_list``.
        """
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'], training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0

    def load_state(self, outdir):
        """Restore the per-sample pick counts from ``outdir/dg_state.npy``.

        A saved state whose length does not match the current train list is
        discarded, because its indices would not line up with the list.
        """
        try:
            pick_index = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
            train_list = getattr(self, 'train_list', None)
            if train_list is not None and len(pick_index) != len(train_list):
                print('history state not equal new load train list,use init state')
                return
            self.pick_index = pick_index
            self.epochs = 1 + int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('not found state file')
        except Exception:
            # Best effort: a corrupt state file must not abort training, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('load state falied,use init state')

    def save_state(self, outdir):
        """Write the per-sample pick counts to ``outdir/dg_state.npy``."""
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.pick_index))

    def return_data_types(self):
        """Return the dtypes of the elements yielded by ``generator``.

        The LAS variant yields five elements (features, input lengths,
        labels, label lengths, guide matrix), matching ``return_data_shape``.
        """
        if self.LAS:
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        else:
            return (tf.float32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        """Return the shapes of the elements yielded by ``generator``."""
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.speech_config['use_mel_layer']:
            feature_shape = tf.TensorShape([None, None, 1])
        else:
            feature_shape = tf.TensorShape([None, None, f, c])
        shapes = (
            feature_shape,
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
        )
        if self.LAS:
            shapes += (tf.TensorShape([None, None, None]),)
        return shapes

    def get_per_epoch_steps(self):
        """Return the number of full training batches per epoch."""
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        """Return the number of full evaluation batches."""
        return len(self.test_list) // self.batch

    def init_text_to_vocab(self):
        """Register custom pinyin phrases and install ``self.text_to_vocab``."""
        pypinyin.load_phrases_dict({'调大': [['tiáo'], ['dà']],
                                    '调小': [['tiáo'], ['xiǎo']],
                                    '调亮': [['tiáo'], ['liàng']],
                                    '调暗': [['tiáo'], ['àn']],
                                    '肖': [['xiāo']],
                                    '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
                                    '新传': [['xīn'], ['zhuàn']],
                                    '外传': [['wài'], ['zhuàn']],
                                    '正传': [['zhèng'], ['zhuàn']], '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
                                    })

        def text_to_vocab_func(txt):
            pins = pypinyin.pinyin(txt)
            return [i[0] for i in pins]

        self.text_to_vocab = text_to_vocab_func

    def augment_data(self, wavs, label, label_length):
        """Augment ``wavs`` and rebuild the padded batch.

        Samples whose reduced feature length is shorter than their label are
        dropped.

        Returns:
            ``None`` when augmentation is unavailable, otherwise
            ``(features, wavs, input_length, label, label_length)``.
        """
        if not self.augment.available():
            return None
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        reduction = self.speech_config['reduction_factor']
        for idx, wav in enumerate(wavs):
            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            if speech_feature.shape[0] // reduction < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_wav = max(max_wav, len(data))
            wavs_.append(data)
            mels.append(speech_feature)
            input_length.append(speech_feature.shape[0] // reduction)
            label_.append(label[idx])
            label_length_.append(label_length[idx])

        for i, mel in enumerate(mels):
            if mel.shape[0] < max_input:
                pad = np.ones([max_input - mel.shape[0], mel.shape[1], mel.shape[2]]) * mel.min()
                mels[i] = np.vstack((mel, pad))

        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)

        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')

        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')

        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')

        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list, training=True):
        """Read the list file, dropping blank lines.

        In training mode the lines are split 99%/1% into ``train_list``
        (shuffled) and ``test_list`` and the pick counters are zeroed;
        otherwise all lines become ``test_list``.
        """
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        # ``readlines`` keeps the newline, so blank lines have to be detected
        # after stripping; they would break the ``path\ttext`` unpacking.
        data = [i.strip() for i in data if i.strip()]
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list = data
        # Always defined so that ``eval_data_generator`` works in both modes.
        self.offset = 0

    def only_chinese(self, word):
        """Return ``word`` reduced to characters in U+4E00..U+9FFF."""
        return ''.join(ch for ch in word if '\u4e00' <= ch <= '\u9fff')

    def _prepare_sample(self, wp, txt, augment=False, verbose=True):
        """Load one wav and convert it and its text to model inputs.

        Args:
            wp: path of the wav file.
            txt: transcript of the wav file.
            augment: run ``self.augment.process`` on the loaded signal.
            verbose: print the reason when the sample is rejected.

        Returns:
            ``(speech_feature, in_len, text_feature)`` or ``None`` when the
            sample is rejected.
        """
        try:
            data = self.speech_featurizer.load_wav(wp)
        except Exception:
            print('{} load data failed'.format(wp) if verbose else 'load data failed')
            return None
        if len(data) < 400:
            return None
        max_duration = self.speech_config['wav_max_duration']
        if len(data) > self.speech_featurizer.sample_rate * max_duration:
            if verbose:
                print('{} duration out of wav_max_duration({})'.format(wp, max_duration))
            return None
        if augment:
            data = self.augment.process(data)
        if self.speech_config['only_chinese']:
            txt = self.only_chinese(txt)
        if self.speech_config['use_mel_layer']:
            # Raw waveform input, peak-normalised. ``in_len`` stays a float
            # here; it is cast when the batch is converted to int32.
            speech_feature = data / np.abs(data).max()
            speech_feature = np.expand_dims(speech_feature, -1)
            in_len = len(speech_feature) // (
                    self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) *
                    self.speech_config['stride_ms'])
        else:
            speech_feature = self.speech_featurizer.extract(data)
            in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])

        py = self.text_to_vocab(txt)
        if not self.check_valid(py, self.text_featurizer.vocab_array):
            if verbose:
                print(' {} txt pinyin {} not all in tokens,continue'.format(txt, py))
            return None
        text_feature = self.text_featurizer.extract(py)

        if in_len < len(text_feature):
            if verbose:
                print('{} feature length < pinyin length,continue'.format(wp))
            return None
        return speech_feature, in_len, text_feature

    def _gather(self, lines, collected, augment=False, verbose=True, strip_spaces=False):
        """Append the prepared form of every usable line to ``collected``."""
        for line in lines:
            wp, txt = line.strip().split('\t')
            if strip_spaces:
                txt = txt.replace(' ', '')
            prepared = self._prepare_sample(wp, txt, augment=augment, verbose=verbose)
            if prepared is not None:
                collected.append(prepared)

    def _collate(self, collected):
        """Pad the prepared samples into ``(x, input_length, y1, label_length1)``."""
        speech_features = [item[0] for item in collected]
        input_length = [item[1] for item in collected]
        y1 = [np.array(item[2]) for item in collected]
        label_length1 = [len(item[2]) for item in collected]
        # Only accepted samples determine the padded lengths.
        max_input = max((len(feature) for feature in speech_features), default=0)
        max_label1 = max(label_length1, default=0)

        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i, feature in enumerate(speech_features):
                if feature.shape[0] < max_input:
                    pad = np.ones([max_input - feature.shape[0], feature.shape[1],
                                   feature.shape[2]]) * feature.min()
                    speech_features[i] = np.vstack((feature, pad))

        for i, label in enumerate(y1):
            if label.shape[0] < max_label1:
                pad = np.ones(max_label1 - label.shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((label, pad))

        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')

        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')

        return x, input_length, y1, label_length1

    def eval_data_generator(self):
        """Return the next sequential evaluation batch.

        Spaces are removed from the transcripts before processing.
        """
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        collected = []
        self._gather(sample, collected, strip_spaces=True)
        return self._collate(collected)

    def check_valid(self, txt, vocab_list):
        """Return True when ``txt`` is non-empty and fully in ``vocab_list``."""
        if len(txt) == 0:
            return False
        return all(n in vocab_list for n in txt)

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        """Return the ``(N, T)`` float32 guided-attention weight matrix.

        ``W[n, t] = 1 - exp(-(t / T - n / N) ** 2 / (2 * g * g))``.
        """
        n = np.arange(N, dtype=np.float64)[:, None] / float(N)
        t = np.arange(T, dtype=np.float64)[None, :] / float(T)
        W = 1 - np.exp(-(t - n) ** 2 / (2 * g * g))
        return W.astype(np.float32)

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        """Stack one guided-attention target per sample.

        Each target has shape ``(inputs_shape, mel_target_shape)``: -1 in the
        padded region, 1 for rows beyond the input length (within the target
        length), and the guided-attention matrix in the valid region.
        """
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)

        return att_targets.astype('float32')

    def generate(self, train=True):
        """Build one batch ``(x, input_length, y1, label_length1)``.

        Training samples are drawn from the least-picked entries; when
        augmentation is available the same samples are added a second time
        in augmented form.
        """
        if train:
            batch = self.batch if self.augment.available() else self.batch * 2
            candidates = np.argsort(self.pick_index)[:batch].tolist()
            # Never request more samples than there are candidates.
            indexs = random.sample(candidates, min(batch // 2, len(candidates)))
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, min(self.batch, len(self.test_list)))

        collected = []
        self._gather(sample, collected)
        if train and self.augment.available():
            self._gather(sample, collected, augment=True, verbose=False)
        return self._collate(collected)

    def generator(self, train=True):
        """Yield batches forever, skipping empty ones.

        With ``self.LAS`` set, a guided-attention matrix is appended to each
        yielded tuple.
        """
        while 1:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                print('load data length zero,continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length, label_length, np.max(input_length),
                                                     label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
class TacotronDataLoader():
    """Batch builder for the TTS acoustic model.

    The list file has one example per line in the form
    ``wav_path<TAB>text<TAB>speaker_id``.  Batches are padded numpy arrays;
    ``generator`` additionally attaches a guided-attention matrix.
    """

    def __init__(self, config, training=True):
        self.speech_featurizer = SpeechFeaturizer(config)
        self.text_featurizer = TextFeaturizer(config)
        self.config = config
        self.batch = config['batch_size']
        self.make_file_list(self.config['train_list'] if training else self.config['eval_list'],
                            training=training)
        self.min_value = -self.config['max_abs_value']
        # Mel frames are padded slightly below the minimum representable value.
        self._target_pad = -(self.config['max_abs_value'] + 0.1)
        # Stop-token targets are 0 inside the utterance and 1 in the padding.
        self._token_pad = 1.
        self.epochs = 1
        self.steps = 0

    def make_file_list(self, wav_list, training=True):
        """Read the list file and set up the train/test lists and cursors.

        Args:
            wav_list: path of the UTF-8 list file.
            training: when True the entries are split 95% / 5% into
                ``train_list`` / ``test_list``; otherwise every entry goes
                to ``test_list``.
        """
        with open(wav_list, encoding='utf-8') as f:
            stripped = [line.strip() for line in f]
        # Filter AFTER stripping: raw lines still carry their newline, so a
        # test against '' on the raw line never removes blank lines and they
        # would later break the 3-way tab split.
        data = [line for line in stripped if line]
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.95)]
            self.test_list = data[int(num * 0.95):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('load train list {} test list{}'.format(
                len(self.train_list), len(self.test_list)))
            if self.config['balance_spk_utts'] and self.train_list:
                # Group utterances by speaker, then repeat the smaller groups
                # so every speaker has roughly as many as the largest one.
                spk_utt = {}
                for line in self.train_list:
                    a, b, c = line.strip().split('\t')
                    spk_utt.setdefault(c, []).append(line)
                maxlen = max(len(utts) for utts in spk_utt.values())
                self.train_list = []
                for key in spk_utt:
                    datas = spk_utt[key]
                    if len(datas) < maxlen:
                        factor = int(np.rint(maxlen / len(datas)))
                    else:
                        factor = 1
                    datas *= factor
                    self.train_list += datas
                np.random.shuffle(self.train_list)
                logging.info('balance spk utts: train list {}'.format(
                    len(self.train_list)))
        else:
            self.test_list = data
            self.offset = 0
            # generate(train=False) reads test_offset, so an eval-only loader
            # needs it as well.
            self.test_offset = 0

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def return_data_types(self):
        # charactor, char_length, mel, mel_length, stop_gts, speaker, guided_attention
        return (tf.int32, tf.int32, tf.float32, tf.int32, tf.float32,
                tf.int32, tf.float32)

    def return_data_shape(self):
        # charactor, char_length, mel, mel_length, stop_gts, speaker, guided_attention
        return (
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None, self.config['num_mels']]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None, None]),
        )

    def GuidedAttention(self, N, T, g=0.5):
        """Return the (N, T) float32 matrix ``1 - exp(-(t/T - n/N)^2 / (2 g^2))``."""
        if N <= 0 or T <= 0:
            return np.zeros((max(N, 0), max(T, 0)), dtype=np.float32)
        # Broadcast a column of n/N against a row of t/T instead of looping
        # over every cell in Python.
        rows = np.arange(N, dtype=np.float64)[:, None] / float(N)
        cols = np.arange(T, dtype=np.float64)[None, :] / float(T)
        W = 1 - np.exp(-(cols - rows) ** 2 / (2 * g * g))
        return W.astype(np.float32)

    def make_Att_targets(self, input_length, targets_length, inputs_shape,
                         mel_target_shape):
        """Build per-example guided-attention targets and their masks.

        Args:
            input_length: text length of every example.
            targets_length: mel length (in frames) of every example.
            inputs_shape: padded text length of the batch.
            mel_target_shape: padded mel length of the batch, in frames.

        Returns:
            ``(att_targets, att_mask)``, both float32 with shape
            ``(batch, inputs_shape, mel_target_shape // outputs_per_step)``.
        """
        att_targets = []
        att_mask = []
        mel_target_shape //= self.config['outputs_per_step']
        for i, j in zip(input_length, targets_length):
            step = int(j / self.config['outputs_per_step'])
            pad = np.zeros([inputs_shape, mel_target_shape])
            # Padded text rows get the maximum value over the valid steps.
            pad[i:, :step] = 1
            maskpad = np.zeros([inputs_shape, mel_target_shape])
            maskpad[:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.1)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
            att_mask.append(maskpad)
        att_targets = np.array(att_targets)
        att_mask = np.array(att_mask)
        return att_targets.astype('float32'), att_mask.astype('float32')

    def load_state(self, outdir):
        """Restore epoch / offset from ``dg_state.npz`` in ``outdir`` if usable."""
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'history train list not equal new load train list ,data loader use init state'
                )
                # NOTE(review): __init__ starts epochs at 1, this reset uses 0.
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file,init state')
        except Exception:
            logging.info('load state falied,use init state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def _load_example(self, line):
        """Turn one list line into features, or return None when it must be skipped.

        Returns:
            ``(wav_path, wav, target_mel, text_tokens, speaker_id)`` or None.
        """
        wav_path, text, spkid = line.strip().split('\t')
        try:
            wav = self.speech_featurizer.load_wav(wav_path)
            target_mel = self.speech_featurizer.melspectrogram(wav)
        except Exception:
            logging.info('{} load data failed , skip'.format(wav_path))
            return None
        try:
            text_tokens = self.text_featurizer.extract(text)
        except Exception:
            logging.info('{} to token failed,skip'.format(text))
            return None
        try:
            speaker_id = self.text_featurizer.spker_map[spkid]
        except Exception:
            logging.info('{} not in spk map,skip'.format(spkid))
            return None
        return wav_path, wav, target_mel, text_tokens, speaker_id

    def extractor(self, ):
        """Yield padded batches over train + test entries, once, in order.

        ``train_offset`` is used as the cursor.  Each yielded tuple is
        ``(charactor, char_length, mel, mel_length, speaker, audios, names)``.
        """
        data = self.train_list + self.test_list
        while self.train_offset < len(data):
            charactor, char_length, mel, mel_length, speaker = [], [], [], [], []
            audios = []
            names = []
            for _ in range(self.batch * 10):
                # Check the bound BEFORE reading so the final entry is still
                # processed instead of being dropped.
                if self.train_offset >= len(data):
                    break
                line = data[self.train_offset]
                self.train_offset += 1
                example = self._load_example(line)
                if example is None:
                    continue
                wav_path, wav, target_mel, text_tokens, speaker_id = example
                audios.append(wav)
                names.append(os.path.split(wav_path)[-1].replace('.wav', ''))
                charactor.append(np.array(text_tokens))
                char_length.append(len(text_tokens))
                mel.append(target_mel)
                mel_length.append(len(target_mel))
                speaker.append([speaker_id])
                if len(charactor) == self.batch:
                    break
            if not charactor:
                # Every candidate was skipped; nothing to pad or yield.
                continue
            output_per_step = self.config['outputs_per_step']
            charactor = self._prepare_inputs(charactor)
            char_length = np.array(char_length, 'int32')
            mel = self._prepare_targets(mel, output_per_step)
            mel_length = np.array(mel_length, 'int32')
            speaker = np.array(speaker, 'int32')
            # Pad raw audio to the sample count matching the padded mel length.
            T = mel.shape[1] * self.speech_featurizer.hop_size
            audios = tf.keras.preprocessing.sequence.pad_sequences(
                audios, T, 'float32', 'post', 'post')
            yield charactor, char_length, mel, mel_length, speaker, audios, names

    def generate(self, train=True):
        """Assemble one padded batch from the train or test list.

        Up to ``batch * 10`` lines are tried so that skipped examples do not
        shrink the batch.  Returns
        ``(charactor, char_length, mel, mel_length, stop_gts, speaker)``;
        all arrays have a zero-length first axis when nothing was usable.
        """
        charactor, char_length, mel, mel_length, stop_gts, speaker = [], [], [], [], [], []
        for _ in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    # End of the list: start a new shuffled epoch.
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            example = self._load_example(line)
            if example is None:
                continue
            _, _, target_mel, text_tokens, speaker_id = example
            token_target = np.asarray([0.] * (len(target_mel) - 1))
            charactor.append(np.array(text_tokens))
            char_length.append(len(text_tokens))
            mel.append(target_mel)
            mel_length.append(len(target_mel))
            stop_gts.append(token_target)
            speaker.append([speaker_id])
            if len(charactor) == self.batch:
                break
        if not charactor:
            # The padding helpers take max() over the batch and cannot handle
            # an empty one; return empty arrays so generator() can skip it.
            return (np.zeros((0, 0), 'int32'),
                    np.zeros((0,), 'int32'),
                    np.zeros((0, 0, self.config['num_mels']), 'float32'),
                    np.zeros((0,), 'int32'),
                    np.zeros((0, 0), 'float32'),
                    np.zeros((0, 1), 'int32'))
        output_per_step = self.config['outputs_per_step']
        charactor = self._prepare_inputs(charactor)
        char_length = np.array(char_length, 'int32')
        mel = self._prepare_targets(mel, output_per_step)
        mel_length = np.array(mel_length, 'int32')
        stop_gts = self._prepare_token_targets(stop_gts, output_per_step)
        speaker = np.array(speaker, 'int32')
        return charactor, char_length, mel, mel_length, stop_gts, speaker

    def _prepare_inputs(self, inputs):
        max_len = max([len(x) for x in inputs])
        return np.stack([self._pad_input(x, max_len) for x in inputs])

    def _prepare_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets])
        return np.stack([
            self._pad_target(t, self._round_up(max_len, alignment))
            for t in targets
        ])

    def _prepare_token_targets(self, targets, alignment):
        # +1 leaves room for at least one stop (pad) token after the longest target.
        max_len = max([len(t) for t in targets]) + 1
        return np.stack([
            self._pad_token_target(t, self._round_up(max_len, alignment))
            for t in targets
        ])

    def _pad_input(self, x, length):
        return np.pad(x, (0, length - x.shape[0]),
                      mode='constant',
                      constant_values=self.text_featurizer.pad)

    def _pad_target(self, t, length):
        return np.pad(t, [(0, length - t.shape[0]), (0, 0)],
                      mode='constant',
                      constant_values=self._target_pad)

    def _pad_token_target(self, t, length):
        return np.pad(t, (0, length - t.shape[0]),
                      mode='constant',
                      constant_values=self._token_pad)

    def _round_down(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x - remainder

    def _round_up(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x + multiple - remainder

    def generator(self, train=True):
        """Endlessly yield typed batches plus the guided-attention matrix."""
        while 1:
            charactor, char_length, mel, mel_length, stop_gts, speaker = self.generate(
                train)
            if charactor.shape[0] == 0:
                logging.info('load data length zero,continue')
                continue
            guide_matrix, _ = self.make_Att_targets(char_length, mel_length,
                                                    np.max(char_length),
                                                    np.max(mel_length))
            yield (charactor.astype('int32'), char_length.astype('int32'),
                   mel.astype('float32'), mel_length.astype('int32'),
                   stop_gts.astype('float32'), speaker.astype('int32'),
                   guide_matrix.astype('float32'))
outputs=self.out_cnn(fc_outputs,training=training) block_outputs=self.block(fc_outputs,training=training) outputs+=self.out_cnn(block_outputs,training=training) return outputs if __name__ == '__main__': from utils.user_config import UserConfig from utils.text_featurizers import TextFeaturizer from utils.speech_featurizers import SpeechFeaturizer import os import time os.environ['CUDA_VISIBLE_DEVICES']='1' config=UserConfig(r'D:\TF2-ASR\configs\am_data.yml',r'D:\TF2-ASR\configs\conformer.yml') config['decoder_config'].update({'model_type':'LAS'}) Tfer=TextFeaturizer(config['decoder_config']) SFer=SpeechFeaturizer(config['speech_config']) f,c=SFer.compute_feature_dim() config['model_config']['LAS_decoder'].update({'n_classes': Tfer.num_classes}) config['model_config']['LAS_decoder'].update({'startid': Tfer.start}) ct=ConformerLAS(config['model_config'],training=False) # ct.add_featurizers(Tfer) x=tf.ones([1,300,f,c]) length=tf.constant([300]) out=ct._build([1,300,f,c],training=True) ct.inference(x,length//4) s=time.time() a=ct.inference(x,length//4) e=time.time() print(e-s,a)
class Punc_DataLoader():
    """Batch builder for the punctuation model.

    Each text line is split into its plain characters (model input) and one
    punctuation label per position (target); BERT features of the plain
    text are produced alongside.
    """

    def __init__(self, config, training=True):
        self.train = training
        self.init_all(config)
        self.vocab_featurizer = TextFeaturizer(config['punc_vocab'])
        self.bd_featurizer = TextFeaturizer(config['punc_biaodian'])
        # Punctuation vocabulary; a label is the position of the mark in it.
        self.bd = self.bd_featurizer.vocab_array
        self.batch = config['running_config']['batch_size']
        self.epochs = 1

    def init_bert(self, config, checkpoint):
        model = load_trained_model_from_checkpoint(config, checkpoint,
                                                   trainable=False, seq_len=None)
        return model

    def load_state(self, outdir):
        """Restore epoch / offset from ``dg_state.npz`` in ``outdir`` if usable."""
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'history train list not equal new load train list ,data loader use init state'
                )
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file,init state')
        except Exception:
            logging.info('load state falied,use init state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        return (tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        return (tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None, 768]))

    def get_per_epoch_steps(self):
        # get_sentence() stores the training split in train_list.
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        # test_list is populated in both training and eval mode.
        return len(self.test_list) // self.batch

    def init_all(self, config):
        """Load BERT (training mode only) and read the text list."""
        if self.train:
            bert_config = config['bert']['config_json']
            bert_checkpoint = config['bert']['bert_ckpt']
            bert_vocab = config['bert']['bert_vocab']
            bert_vocabs = load_vocabulary(bert_vocab)
            self.bert_token = Tokenizer(bert_vocabs)
            self.bert = self.init_bert(bert_config, bert_checkpoint)
        self.get_sentence(
            config['train_list'] if self.train else config['eval_list'],
            training=self.train)

    def get_sentence(self, data_path, training):
        """Read ``data_path`` and keep the lines of at most 150 characters.

        In training mode the kept lines are split 99% / 1% into
        ``train_list`` / ``test_list``; otherwise they all become the test set.
        """
        try:
            from tqdm import tqdm
        except ImportError:
            # The progress bar is cosmetic; fall back to plain iteration.
            def tqdm(iterable):
                return iterable
        with open(data_path, encoding='utf-8') as f:
            data = f.readlines()
        txts = []
        for txt in tqdm(data):
            txt = txt.strip()
            if len(txt) > 150:
                continue
            txts.append(txt)
        if training:
            num = len(txts)
            self.train_list = txts[:int(num * 0.99)]
            self.test_list = txts[int(num * 0.99):]
            self.train_offset = 0
            self.test_offset = 0
        else:
            self.test_texts = txts
            # Same data under the name the step counter and generate() use.
            self.test_list = txts
            self.test_offset = 0
            self.offset = 0

    def preprocess(self, txts):
        """Map each text to ``[start] + character ids + [end]`` as an array."""
        x = []
        for txt in txts:
            ids = [self.vocab_featurizer.startid()]
            ids.extend(self.vocab_featurizer.token_to_index[ch] for ch in txt)
            ids.append(self.vocab_featurizer.endid())
            x.append(np.array(ids))
        return x

    def bert_decode(self, x, x2=None):
        """Tokenize texts for BERT; where ``x2`` equals 2 the token id is set to 103."""
        tokens, segs = [], []
        if x2 is not None:
            for i, j in zip(x, x2):
                t, s = self.bert_token.encode(''.join(i))
                index = np.where(j == 2)[0]
                if len(index) > 0:
                    for n in index:
                        t[int(n)] = 103
                tokens.append(t)
                segs.append(s)
        else:
            for i in x:
                t, s = self.bert_token.encode(''.join(i))
                tokens.append(t)
                segs.append(s)
        return tokens, segs

    def pad(self, x, mode=1):
        """Pad every item of ``x`` to the longest one.

        mode 2 pads 2-D items with rows of -10, mode 3 with rows of 0, and
        any other mode pads 1-D id sequences through ``pad_sequences``.
        """
        length = 0
        for i in x:
            length = max(length, len(i))
        if mode == 2:
            for i in range(len(x)):
                pading = np.ones([length - len(x[i]), x[i].shape[1]]) * -10.
                x[i] = np.vstack((x[i], pading))
        elif mode == 3:
            for i in range(len(x)):
                pading = np.zeros([length - len(x[i]), x[i].shape[1]])
                x[i] = np.vstack((x[i], pading))
        else:
            x = pad_sequences(x, length, padding='post', truncating='post')
        return x

    def get_bert_feature(self, bert_t, bert_s):
        """Run BERT on the padded tokens and overwrite padded positions with -10."""
        length = [len(i) for i in bert_t]
        max_len = max(length)
        bert_s = tf.keras.preprocessing.sequence.pad_sequences(
            bert_s, max_len, padding='post', truncating='post')
        bert_t = tf.keras.preprocessing.sequence.pad_sequences(
            bert_t, max_len, padding='post', truncating='post')
        features = self.bert.predict([bert_t, bert_s])
        for idx, l in enumerate(length):
            features[idx, l:] = -10.
        return features

    def get_target(self, text):
        """Split ``text`` into plain characters and per-slot punctuation ids.

        Returns:
            ``(zh_txt, bd_txt)`` where ``zh_txt`` is the text without
            punctuation and ``bd_txt`` holds one list per slot (one more
            slot than characters, plus a trailing ``[0]``); each list
            starts with 0 followed by the ids of the marks found there.
        """
        bd = self.bd
        zh = []
        bd_ = [[0]]
        for n in text:
            if n in bd:
                bd_[-1].append(bd.index(n))
            else:
                zh.append(n)
                bd_.append([0])
        zh_txt = ''.join(zh)
        bd_txt = bd_ + [[0]]
        return zh_txt, bd_txt

    def process_punc(self, puncs):
        """Collapse each slot to one label: 1 when empty, else its last mark id."""
        x = []
        for punc in puncs:
            labels = []
            for slot in punc:
                if len(slot) == 1:
                    labels += [1]
                else:
                    labels += slot[-1:]
            x.append(np.array(labels, 'int32'))
        return x

    def check_valid(self, txt, vocab_list):
        """Return True when every character is in ``vocab_list``.

        Returns False for empty text, otherwise the first character that is
        not in ``vocab_list``; callers must compare with ``is True``.
        """
        if len(txt) == 0:
            return False
        for n in txt:
            if n in vocab_list:
                pass
            else:
                return n
        return True

    def generate(self, train):
        """Assemble one padded batch ``(inp_tokens, trainy, e_features)``.

        Up to ``batch * 10`` lines are tried.  All three arrays have a
        zero-length first axis when no line was usable.
        """
        trainx = []
        trainy = []
        # Built once per batch; a set makes the per-character test O(1).
        allowed = frozenset(self.vocab_featurizer.vocab_array + self.bd)
        for _ in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            line = line.strip()
            if len(line) < 30:
                # Short lines are lengthened with a random training line.
                extra = random.sample(self.train_list, 1)[0]
                extra = extra.strip()
                line += extra
            if self.check_valid(line, allowed) is not True:
                continue
            try:
                x, y = self.get_target(line)
            except Exception:
                continue
            trainx.append(x)
            trainy.append(y)
            if len(trainx) == self.batch:
                break
        if not trainx:
            # get_bert_feature() takes max() over the batch and cannot run on
            # an empty one.
            return (np.zeros((0, 0), 'int32'),
                    np.zeros((0, 0), 'int32'),
                    np.zeros((0, 0, 768), 'float32'))
        inp_tokens = self.preprocess(trainx)
        e_bert_t, e_bert_s = self.bert_decode(trainx)
        e_features = self.get_bert_feature(e_bert_t, e_bert_s)
        trainy = self.process_punc(trainy)
        inp_tokens = self.pad(inp_tokens)
        trainy = self.pad(trainy)
        e_features = self.pad(e_features, 2)
        inp_tokens = np.array(inp_tokens)
        trainy = np.array(trainy)
        e_features = np.array(e_features, dtype='float32')
        return inp_tokens, trainy, e_features

    def generator(self, train=True):
        """Endlessly yield batches whose three time axes all agree."""
        while 1:
            x, y, features = self.generate(train)
            if x.shape[0] == 0:
                logging.info('load data length zero,continue')
                continue
            # A batch is unusable as soon as ANY pair of lengths differs.
            if x.shape[1] != y.shape[1] or y.shape[1] != features.shape[1]:
                logging.info('bad batch,skip')
                continue
            yield x, y, features
class TTSmodel():
    """Bundles an acoustic model and an optional neural vocoder for synthesis.

    Args:
        config: acoustic-model configuration, or None for vocoder-only use.
        vocoder_config: vocoder configuration, or None; without a vocoder
            the mel spectrogram is inverted with
            ``SpeechFeaturizer.inv_mel_spectrogram``.
    """

    def __init__(self, config=None, vocoder_config=None):
        assert config is not None or vocoder_config is not None, 'must one config'
        if config is not None:
            self.config = config
            self.acoustic = config['model_name']
        else:
            self.config = None
        self.vocoder_config = vocoder_config
        if vocoder_config is not None:
            self.GL = SpeechFeaturizer(vocoder_config).inv_mel_spectrogram
            self.vocoder_type = vocoder_config['vocoder_model']
        else:
            self.GL = SpeechFeaturizer(config).inv_mel_spectrogram
            self.vocoder_type = None
        if self.config is not None:
            self.text_featurizer = TextFeaturizer(config)

    def load_model(self, training=True):
        """Instantiate the configured models.

        In inference mode (``training=False``) the models are also built and
        their latest checkpoints are loaded.

        Raises:
            ValueError: the vocoder type is not 'MelGan' or 'MultiGen'.
        """
        if self.config is not None:
            if self.acoustic == 'Tacotron2':
                self.config['vocab_size'] = self.text_featurizer.num_classes
                tac_config = Tacotron2Config(**self.config)
                self.acoustic_model = TFTacotron2(tac_config, training)
            elif self.acoustic == 'FastSpeech':
                self.config['vocab_size'] = self.text_featurizer.num_classes
                fast_config = FastSpeechConfig(**self.config)
                self.acoustic_model = TFFastSpeech(fast_config)
        if self.vocoder_config is not None:
            if self.vocoder_type == 'MelGan':
                melgan_config = MelGANGeneratorConfig(**self.vocoder_config)
                self.vocoder = TFMelGANGenerator(melgan_config)
            elif self.vocoder_type == 'MultiGen':
                multi_config = MultiGeneratorConfig(**self.vocoder_config)
                self.vocoder = TFMultiWindowGenerator(multi_config)
            else:
                raise ValueError('vocoder type not support.')
        if training and self.vocoder_type is not None:
            if self.vocoder_config['use_gan']:
                self.discriminator = TFMelGANMultiScaleDiscriminator(
                    MelGANDiscriminatorConfig(**self.vocoder_config))
        if not training:
            assert self.config is not None
            self.acoustic_model._build()
            if self.vocoder_config is not None:
                self.vocoder._build()
            self.load_checkpoint()

    @staticmethod
    def _latest_checkpoint(files):
        """Return the name in ``files`` with the highest ``..._<int>.h5`` index.

        Names whose last ``_``-separated part is not an integer are ignored
        rather than aborting the search.

        Raises:
            IndexError: no name carries a numeric index.
        """
        indexed = []
        for name in files:
            try:
                step = int(name.split('_')[-1].replace('.h5', ''))
            except ValueError:
                continue
            indexed.append((step, name))
        if not indexed:
            raise IndexError('no checkpoint file with a numeric index found')
        # A stable sort keeps directory order among equal indices.
        indexed.sort(key=lambda pair: pair[0])
        return indexed[-1][1]

    def load_checkpoint(self, ):
        """Load checkpoint."""
        self.checkpoint_dir = os.path.join(self.config["outdir"], "checkpoints")
        latest = self._latest_checkpoint(os.listdir(self.checkpoint_dir))
        self.acoustic_model.load_weights(os.path.join(self.checkpoint_dir, latest))
        logging.info('acoustic load model at {}'.format(
            os.path.join(self.checkpoint_dir, latest)))
        if self.vocoder_config is not None:
            self.checkpoint_dir = os.path.join(self.vocoder_config["outdir"], "checkpoints")
            files = os.listdir(self.checkpoint_dir)
            # Only file names containing 'g' are considered for the vocoder.
            files = [i for i in files if 'g' in i]
            latest = self._latest_checkpoint(files)
            self.vocoder.load_weights(os.path.join(self.checkpoint_dir, latest))
            logging.info('vocoder load model at {}'.format(
                os.path.join(self.checkpoint_dir, latest)))

    def synthesize(self, text, spk, maximum_iterations=100):
        """Synthesize a waveform for ``text`` spoken by speaker ``spk``.

        Args:
            text: input passed to ``text_featurizer.extract``.
            spk: key into ``text_featurizer.spker_map``.
            maximum_iterations: value forwarded to Tacotron2 inference;
                unused for other acoustic models.

        Returns:
            The vocoder output flattened to 1-D, or the result of
            ``inv_mel_spectrogram`` when no vocoder is configured.
        """
        if self.config['model_name'] == 'Tacotron2':
            inp = self.text_featurizer.extract(text)
            input_length = len(inp)
            spk_id = self.text_featurizer.spker_map[spk]
            inp = np.array(inp, 'int32').reshape([1, -1])
            input_length = np.array(input_length, 'int32').reshape([1])
            spk_id = np.array([spk_id, 0], 'int32').reshape([1, -1])
            decoder_output, mel_outputs, stop_token_prediction, alignment_history = \
                self.acoustic_model.inference(input_ids=inp,
                                              input_lengths=input_length,
                                              speaker_ids=spk_id,
                                              use_window_mask=False,
                                              win_front=5,
                                              win_back=5,
                                              maximum_iterations=maximum_iterations,
                                              )
        else:
            inp = self.text_featurizer.extract(text)
            spk_id = self.text_featurizer.spker_map[spk]
            inp = np.array(inp, 'int32').reshape([1, -1])
            spk_id = np.array(spk_id, 'int32').reshape([1, 1])
            # NOTE(review): the mask treats id 0 as padding - confirm it
            # matches text_featurizer.pad.
            decoder_output, mel_outputs, duration_pred = self.acoustic_model.inference(
                inp, tf.math.not_equal(inp, 0), spk_id)
        if self.vocoder_config is not None:
            wav = self.vocoder(mel_outputs)
            wav = wav[0].numpy().flatten()
        else:
            wav = self.GL(mel_outputs[0].numpy().T)
        return wav
class MultiTask_DataLoader():
    """Batch builder for the multi-task acoustic model.

    List lines have the form ``wav_path<TAB>text``.  For every utterance it
    produces speech features, the raw waveform, BERT features and four
    label sequences (words, phones, pinyin, text).
    """

    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.text4_config = config_dict['decoder4_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.token4_featurizer = TextFeaturizer(self.text4_config)
        self.make_file_list(
            self.speech_config['train_list'] if training else self.speech_config['eval_list'],
            training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.LAS = True
        self.steps = 0
        self.init_bert(config_dict)

    def load_state(self, outdir):
        """Restore the per-sample pick counters from ``dg_state.npy`` if usable."""
        try:
            pick_index = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
        except FileNotFoundError:
            print('not found state file')
            return
        except Exception:
            print('load state falied,use init state')
            return
        train_list = getattr(self, 'train_list', None)
        if train_list is not None and len(pick_index) != len(train_list):
            # generate() indexes train_list with positions taken from
            # pick_index, so counters of a different length cannot be used.
            print('history pick index not equal new load train list,use init state')
            return
        self.pick_index = pick_index
        self.epochs = 1 + int(np.mean(self.pick_index))

    def save_state(self, outdir):
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.pick_index))

    def load_bert(self, config, checkpoint):
        model = load_trained_model_from_checkpoint(config, checkpoint,
                                                   trainable=False, seq_len=None)
        return model

    def init_bert(self, config):
        bert_config = config['bert']['config_json']
        bert_checkpoint = config['bert']['bert_ckpt']
        bert_vocab = config['bert']['bert_vocab']
        bert_vocabs = load_vocabulary(bert_vocab)
        self.bert_token = Tokenizer(bert_vocabs)
        self.bert = self.load_bert(bert_config, bert_checkpoint)

    def bert_decode(self, x):
        tokens, segs = [], []
        for i in x:
            t, s = self.bert_token.encode(''.join(i))
            tokens.append(t)
            segs.append(s)
        return tokens, segs

    def get_bert_feature(self, bert_t, bert_s):
        """Return the BERT features of the FIRST text, without its first position."""
        f = []
        for t, s in zip(bert_t, bert_s):
            t = np.expand_dims(np.array(t), 0)
            s = np.expand_dims(np.array(s), 0)
            feature = self.bert.predict([t, s])
            f.append(feature[0])
        return f[0][1:]

    def return_data_types(self):
        return (tf.float32, tf.float32, tf.float32, tf.int32, tf.int32,
                tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32,
                tf.int32, tf.float32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        return (
            tf.TensorShape([None, None, f, c]),
            tf.TensorShape([None, None, 1]),
            tf.TensorShape([None, None, 768]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None, None])
        )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    @staticmethod
    def _read_lines(path):
        """Return the stripped, non-empty lines of a UTF-8 text file."""
        with open(path, encoding='utf-8') as f:
            stripped = [line.strip() for line in f]
        # Filter AFTER stripping: raw lines keep their newline, so testing
        # the raw line against '' never removes blank lines.
        return [line for line in stripped if line]

    def make_maps(self, config):
        """Load the word->pinyin and pinyin->phone tables.

        ``py_map`` maps a word (and, for multi-syllable words, each of its
        characters) to pinyin; ``phone_map`` maps the same keys to phones.
        """
        self.py_map = {}
        for line in self._read_lines(config['map_path']['pinyin']):
            key, py = line.strip().split('\t')
            self.py_map[key] = py
            if len(py.split(' ')) > 1:
                # Also register every character with its own syllable.
                for i, j in zip(list(key), py.split(' ')):
                    self.py_map[i] = j
        self.phone_map = {}
        phone_map = {}
        for line in self._read_lines(config['map_path']['phone']):
            key, py = line.strip().split('\t')
            phone_map[key] = py
        for key in self.py_map.keys():
            key_py = self.py_map[key]
            if len(key) > 1:
                phone = []
                for n in key_py.split(' '):
                    phone += [phone_map[n]]
                self.phone_map[key] = ' '.join(phone)
            else:
                self.phone_map[key] = phone_map[self.py_map[key]]

    def map(self, txt):
        """Convert text to ``(pinyin list, phone list, pinyin-letter list)``."""
        cut = lcut(txt)
        pys = []
        phones = []
        words = []
        for i in cut:
            word = i.word
            if word in self.py_map.keys():
                py = self.py_map[word]
                phone = self.phone_map[word]
                pys += py.split(' ')
                phones += phone.split(' ')
                words += list(''.join(py.split(' ')))
            else:
                # Unknown word: fall back to character-level lookup.
                for j in word:
                    pys += [self.py_map[j]]
                    phones += self.phone_map[j].split(' ')
                    words += list(''.join(self.py_map[j]))
        return pys, phones, words

    def augment_data(self, wavs, label, label_length):
        """Augment waveforms and rebuild features for the ones still long enough.

        Returns None when augmentation is unavailable, otherwise
        ``(x, wavs, input_length, label, label_length)``.
        """
        if not self.augment.available():
            return None
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):
            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            if speech_feature.shape[0] // self.speech_config['reduction_factor'] < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_wav = max(max_wav, len(data))
            wavs_.append(data)
            mels.append(speech_feature)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            label_.append(label[idx])
            label_length_.append(label_length[idx])
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1],
                               mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)
        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')
        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')
        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list, training=True):
        """Read the list file; in training mode split it 99% / 1%."""
        data = self._read_lines(wav_list)
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            # One usage counter per training line; generate() prefers the
            # least-used lines.
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list = data
            self.offset = 0

    def only_chinese(self, word):
        """Return True when every character lies in U+4E00..U+9FFF."""
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                pass
            else:
                return False
        return True

    def _featurize(self, line, augment=False, require_all_labels=True):
        """Turn one list line into features, or None when it must be skipped.

        Args:
            line: ``wav_path<TAB>text``.
            augment: run the waveform through ``self.augment`` first.
            require_all_labels: when True the word, phone and pinyin labels
                must all be non-empty and fit the reduced input length;
                when False only the pinyin labels are checked.

        Returns:
            ``(speech_feature, wav, bert_feature, word_ids, phone_ids,
            py_ids, txt_ids)`` or None.
        """
        wp, txt = line.strip().split('\t')
        try:
            data = self.speech_featurizer.load_wav(wp)
        except Exception:
            print('load data failed')
            return None
        if len(data) < 400:
            return None
        if len(data) > self.speech_featurizer.sample_rate * 7:
            return None
        if not self.only_chinese(txt):
            return None
        if augment:
            data = self.augment.process(data)
        speech_feature = self.speech_featurizer.extract(data)
        py, phone, word = self.map(txt)
        if len(py) == 0:
            return None
        if require_all_labels and (len(phone) == 0 or len(word) == 0):
            return None
        e_bert_t, e_bert_s = self.bert_decode([txt])
        bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)
        word_text_feature = self.token1_featurizer.extract(word)
        phone_text_feature = self.token2_featurizer.extract(phone)
        py_text_feature = self.token3_featurizer.extract(py)
        txt_text_feature = self.token4_featurizer.extract(list(txt))
        reduced = speech_feature.shape[0] / self.speech_config['reduction_factor']
        if reduced < len(py_text_feature):
            return None
        if require_all_labels and (reduced < len(word_text_feature)
                                   or reduced < len(phone_text_feature)):
            return None
        return (speech_feature, data, bert_feature, word_text_feature,
                phone_text_feature, py_text_feature, txt_text_feature)

    def _collate(self, samples):
        """Pad the kept samples and stack them into the 12 batch arrays."""
        if not samples:
            # Nothing to pad; hand back empty arrays so callers can detect
            # the empty batch via shape[0] == 0.
            empty_int = np.array([], 'int32')
            return (np.array([], 'float32'), np.zeros((0, 1), 'float32'),
                    np.array([], 'float32'), empty_int,
                    empty_int, empty_int, empty_int, empty_int,
                    empty_int, empty_int, empty_int, empty_int)
        mels = [s[0] for s in samples]
        wavs = [s[1] for s in samples]
        bert_features = [s[2] for s in samples]
        words_label = [np.array(s[3]) for s in samples]
        phone_label = [np.array(s[4]) for s in samples]
        py_label = [np.array(s[5]) for s in samples]
        txt_label = [np.array(s[6]) for s in samples]
        reduction = self.speech_config['reduction_factor']
        input_length = [m.shape[0] // reduction for m in mels]
        words_label_length = [len(s[3]) for s in samples]
        phone_label_length = [len(s[4]) for s in samples]
        py_label_length = [len(s[5]) for s in samples]
        txt_label_length = [len(s[6]) for s in samples]
        # Maxima come only from samples that are actually in the batch.
        max_input = max(m.shape[0] for m in mels)
        max_wav = max(len(w) for w in wavs)
        max_label_words = max(words_label_length)
        max_label_phone = max(phone_label_length)
        max_label_py = max(py_label_length)
        max_label_txt = max(txt_label_length)
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                # Pad the time axis with the utterance's own minimum value.
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1],
                               mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        for i in range(len(bert_features)):
            if bert_features[i].shape[0] < max_label_txt:
                pading = np.ones([max_label_txt - len(bert_features[i]), 768]) * -10.
                bert_features[i] = np.vstack((bert_features[i], pading))
        wavs = self.speech_featurizer.pad_signal(wavs, max_wav)
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        txt_label = self.pad(txt_label, max_label_txt)
        x = np.array(mels, 'float32')
        bert_features = np.array(bert_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        txt_label = np.array(txt_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        txt_label_length = np.array(txt_label_length, 'int32')
        wavs = np.array(np.expand_dims(wavs, -1), 'float32')
        return (x, wavs, bert_features, input_length, words_label,
                words_label_length, phone_label, phone_label_length,
                py_label, py_label_length, txt_label, txt_label_length)

    def eval_data_generator(self):
        """Return the next sequential batch of the test list.

        Advances ``self.offset`` by ``self.batch``.  Returns the 12-tuple
        ``(x, wavs, bert_features, input_length, words_label,
        words_label_length, phone_label, phone_label_length, py_label,
        py_label_length, txt_label, txt_label_length)``.
        """
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        kept = []
        for line in sample:
            features = self._featurize(line, augment=False, require_all_labels=False)
            if features is not None:
                kept.append(features)
        return self._collate(kept)

    def pad(self, words_label, max_label_words):
        """Right-pad every label array to ``max_label_words``.

        NOTE(review): the pad id of token1_featurizer is used for all four
        label types - confirm the featurizers share the same pad id.
        """
        for i in range(len(words_label)):
            if words_label[i].shape[0] < max_label_words:
                pad = np.ones(max_label_words - words_label[i].shape[0]) * self.token1_featurizer.pad
                words_label[i] = np.hstack((words_label[i], pad))
        return words_label

    def GuidedAttention(self, N, T, g=0.2):
        """Return the (N, T) float32 matrix ``1 - exp(-(t/T - n/N)^2 / (2 g^2))``."""
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        """Build float32 guided-attention targets; cells outside both lengths are -1."""
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        """Assemble one random batch.

        In training mode half of the least-used lines are drawn and their
        counters incremented; when augmentation is available each drawn
        line is added a second time in augmented form.  Returns the same
        12-tuple as ``eval_data_generator``.
        """
        if train:
            batch = self.batch if self.augment.available() else self.batch * 2
            indexs = np.argsort(self.pick_index)[:batch].tolist()
            # Never ask for more items than the population holds.
            indexs = random.sample(indexs, min(batch // 2, len(indexs)))
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, min(self.batch, len(self.test_list)))
        kept = []
        for line in sample:
            features = self._featurize(line, augment=False)
            if features is not None:
                kept.append(features)
        if train and self.augment.available():
            for line in sample:
                features = self._featurize(line, augment=True)
                if features is not None:
                    kept.append(features)
        return self._collate(kept)

    def generator(self, train=True):
        """Endlessly yield batches with the guided-attention matrix appended."""
        while 1:
            x, wavs, bert_feature, input_length, words_label, words_label_length, \
                phone_label, phone_label_length, py_label, py_label_length, \
                txt_label, txt_label_length = self.generate(train)
            if x.shape[0] == 0:
                # np.max() below cannot run on an empty batch.
                print('load data length zero,continue')
                continue
            guide_matrix = self.guided_attention(input_length, txt_label_length,
                                                 np.max(input_length),
                                                 txt_label_length.max())
            yield (x, wavs, bert_feature, input_length, words_label,
                   words_label_length, phone_label, phone_label_length,
                   py_label, py_label_length, txt_label, txt_label_length,
                   guide_matrix)
class AM_DataLoader():
    """Batch provider for acoustic-model training and evaluation.

    Reads a tab-separated ``wav_path<TAB>transcript`` list, turns the audio
    into either raw waveforms (``use_mel_layer``) or featurizer output, turns
    the transcript into pinyin token ids, and yields padded numpy batches.
    """

    def __init__(self, config_dict, training=True):
        """Build featurizers, the file list and the augmenter from the config.

        Args:
            config_dict: mapping with ``speech_config``, ``decoder_config``,
                ``augments_config`` and ``learning_config`` sections.
            training: when true the ``train_list`` file is loaded and split
                into train/test parts; otherwise ``eval_list`` is loaded.
        """
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.streaming = self.speech_config['streaming']
        # Number of waveform samples in one streaming chunk.
        self.chunk = self.speech_config['sample_rate'] * self.speech_config[
            'streaming_bucket']
        self.batch = config_dict['learning_config']['running_config'][
            'batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(
            self.speech_config['train_list']
            if training else self.speech_config['eval_list'], training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0

    def load_state(self, outdir):
        """Restore epoch counter and train offset from ``outdir/dg_state.npz``.

        If the saved train list has a different length than the current one
        the counters are reset. A missing or unreadable state file is logged
        and otherwise ignored.
        """
        try:
            # NpzFile keeps the archive open; the context manager closes it.
            with np.load(os.path.join(outdir, 'dg_state.npz')) as dg_state:
                self.epochs = int(dg_state['epoch'])
                self.train_offset = int(dg_state['train_offset'])
                train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'history train list not equal new load train list ,data loader use init state'
                )
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file,init state')
        except Exception:
            # Best effort: a corrupt state file must not abort training, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logging.info('load state falied,use init state')

    def save_state(self, outdir):
        """Write epoch counter, train offset and train list to ``outdir``."""
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        """Return the tf dtypes of one yielded batch (5 items when ``LAS``)."""
        if self.LAS:
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        return (tf.float32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        """Return the tf shapes of one yielded batch (5 items when ``LAS``)."""
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.speech_config['use_mel_layer']:
            speech_shape = tf.TensorShape([None, None, 1])
        else:
            speech_shape = tf.TensorShape([None, None, f, c])
        shapes = (
            speech_shape,
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
        )
        if self.LAS:
            # Extra slot for the guided-attention matrix.
            return shapes + (tf.TensorShape([None, None, None]), )
        return shapes

    def get_per_epoch_steps(self):
        """Number of full training batches in the train list."""
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        """Number of full batches in the test list."""
        return len(self.test_list) // self.batch

    def init_text_to_vocab(self):
        """Register custom pinyin phrases and install ``self.text_to_vocab``."""
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
        })

        def text_to_vocab_func(txt):
            # pypinyin returns one candidate list per character; keep the first.
            return [candidates[0] for candidates in pypinyin.pinyin(txt)]

        self.text_to_vocab = text_to_vocab_func

    def make_file_list(self, wav_list, training=True):
        """Load the sample list and initialise the read offsets.

        Blank lines are dropped. In training mode the first 99% of the lines
        become ``train_list`` (shuffled) and the rest ``test_list``; otherwise
        every line goes to ``test_list``.
        """
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        # Strip first, then filter: raw lines still carry '\n', so comparing
        # the unstripped line with '' never removed anything.
        data = [line.strip() for line in data]
        data = [line for line in data if line]
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            logging.info('load train list {} test list{}'.format(
                len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
        # Both test cursors always exist so that eval_data_generator() and
        # generate(train=False) work regardless of the loading mode.
        self.test_offset = 0
        self.offset = 0

    def only_chinese(self, word):
        """Return ``word`` with everything outside U+4E00..U+9FFF removed."""
        return ''.join(ch for ch in word if '\u4e00' <= ch <= '\u9fff')

    def _samples_per_step(self):
        """Waveform samples consumed per model output step (a float)."""
        return self.speech_config['reduction_factor'] * (
            self.speech_featurizer.sample_rate /
            1000) * self.speech_config['stride_ms']

    def _steps_per_chunk(self):
        """Output steps produced by one streaming chunk, rounded up."""
        reduce = self._samples_per_step()
        chunk_times = self.chunk // reduce
        if self.chunk % reduce != 0:
            chunk_times += 1
        return chunk_times

    def _featurize(self, data):
        """Turn a loaded waveform into ``(speech_feature, in_len)``.

        ``in_len`` is the number of output steps the model is expected to
        produce for this utterance.
        """
        if not self.speech_config['use_mel_layer']:
            speech_feature = self.speech_featurizer.extract(data)
            in_len = int(speech_feature.shape[0] //
                         self.speech_config['reduction_factor'])
            return speech_feature, in_len
        if not self.streaming:
            # Peak-normalise the raw waveform and add a channel axis.
            speech_feature = np.expand_dims(data / np.abs(data).max(), -1)
            return speech_feature, len(
                speech_feature) // self._samples_per_step()
        speech_feature = np.expand_dims(data, -1)
        # Number of chunks, rounded up, times steps per chunk.
        in_len = len(speech_feature) // self.chunk
        if len(speech_feature) % self.chunk != 0:
            in_len += 1
        return speech_feature, in_len * self._steps_per_chunk()

    def _pad_features(self, speech_features, max_input):
        """Pad featurizer outputs along axis 0 with each item's minimum."""
        for idx, feat in enumerate(speech_features):
            if feat.shape[0] < max_input:
                filler = np.ones([
                    max_input - feat.shape[0], feat.shape[1], feat.shape[2]
                ]) * feat.min()
                speech_features[idx] = np.vstack((feat, filler))
        return speech_features

    def _pad_labels(self, labels, max_label):
        """Right-pad every label vector to ``max_label`` with the pad id."""
        for idx, label in enumerate(labels):
            if label.shape[0] < max_label:
                filler = np.ones(max_label -
                                 label.shape[0]) * self.text_featurizer.pad
                labels[idx] = np.hstack((label, filler))
        return labels

    def _next_line(self, train):
        """Return the next list entry and advance (and wrap) its cursor."""
        if train:
            line = self.train_list[self.train_offset]
            self.train_offset += 1
            if self.train_offset > len(self.train_list) - 1:
                # End of epoch: restart with a fresh order.
                self.train_offset = 0
                np.random.shuffle(self.train_list)
                self.epochs += 1
            return line
        line = self.test_list[self.test_offset]
        self.test_offset += 1
        if self.test_offset > len(self.test_list) - 1:
            self.test_offset = 0
        return line

    def eval_data_generator(self):
        """Return the next sequential evaluation batch.

        Returns:
            ``(x, input_length, y1, label_length1)`` as float32/int32 arrays.
            Samples that fail to load, are too short/long, contain unknown
            tokens or are shorter than their label are skipped.
        """
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for line in sample:
            wp, txt = line.strip().split('\t')
            txt = txt.replace(' ', '')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                logging.info('{} load data failed,skip'.format(wp))
                continue
            if len(data) < 400:
                logging.info('{} wav too short < 25ms,skip'.format(wp))
                continue
            if len(data) > self.speech_featurizer.sample_rate * \
                    self.speech_config['wav_max_duration']:
                logging.info(
                    '{} duration out of wav_max_duration({}) ,skip'.format(
                        wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            speech_feature, in_len = self._featurize(data)
            py = self.text_to_vocab(txt)
            verdict = self.check_valid(py, self.text_featurizer.vocab_array)
            if verdict is not True:
                logging.info(' {} txt pinyin {} not all in tokens,skip'.format(
                    txt, verdict))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                logging.info(
                    '{} feature length < pinyin length,skip'.format(wp))
                continue
            # Only accepted samples may influence the padding target.
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))

        if self.speech_config['use_mel_layer']:
            if self.streaming:
                # Round the padded length up to a whole number of chunks.
                max_input = max_input // self.chunk * self.chunk + self.chunk
            speech_features = self.speech_featurizer.pad_signal(
                speech_features, max_input)
        else:
            speech_features = self._pad_features(speech_features, max_input)
        y1 = self._pad_labels(y1, max_label1)

        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def check_valid(self, txt, vocab_list):
        """Check that every token of ``txt`` is in ``vocab_list``.

        Returns:
            ``True`` when all tokens are known, ``False`` when ``txt`` is
            empty, otherwise the first unknown token. Callers must therefore
            compare with ``is not True`` rather than rely on truthiness.
        """
        if len(txt) == 0:
            return False
        for token in txt:
            if token not in vocab_list:
                return token
        return True

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        """Return the ``(N, T)`` float32 guided-attention penalty matrix.

        ``W[n, t] = 1 - exp(-(t / T - n / N) ** 2 / (2 * g ** 2))``.
        """
        if N == 0 or T == 0:
            return np.zeros((N, T), dtype=np.float32)
        n_pos = np.arange(N, dtype=np.float64)[:, None] / float(N)
        t_pos = np.arange(T, dtype=np.float64)[None, :] / float(T)
        W = 1 - np.exp(-(t_pos - n_pos)**2 / (2 * g * g))
        return W.astype(np.float32)

    def guided_attention(self, input_length, targets_length, inputs_shape,
                         mel_target_shape):
        """Build one padded guided-attention target per batch item.

        Each target is ``inputs_shape x mel_target_shape``, filled with -1,
        set to 1 for rows past the input length (within the target length),
        and overwritten with the guided-attention matrix in the valid region.
        """
        att_targets = []
        for in_len, tgt_len in zip(input_length, targets_length):
            in_len = int(in_len)
            step = int(tgt_len)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[in_len:, :step] = 1
            att_target = self.GuidedAttentionMatrix(in_len, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        """Assemble one batch from the train or test list.

        When training with an available augmenter, half of the batch is read
        from the list and the other half is an augmented copy of those same
        utterances. Up to ``10 * batch`` list entries are tried to fill the
        batch; unusable entries are skipped.

        Returns:
            ``(x, input_length, y1, label_length1)`` as float32/int32 arrays.
        """
        sample = []
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        if train and self.augment.available():
            # Never let the clean half drop to zero (batch_size == 1),
            # otherwise every batch would be empty.
            batch = max(1, self.batch // 2)
        else:
            batch = self.batch
        max_samples = self.speech_featurizer.sample_rate * \
            self.speech_config['wav_max_duration']

        for _ in range(batch * 10):
            line = self._next_line(train)
            wp, txt = line.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                logging.info('{} load data failed,skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            if len(data) > max_samples:
                logging.info(
                    '{} duration out of wav_max_duration({}),skip'.format(
                        wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            speech_feature, in_len = self._featurize(data)
            py = self.text_to_vocab(txt)
            verdict = self.check_valid(py, self.text_featurizer.vocab_array)
            if verdict is not True:
                logging.info(
                    ' {} txt pinyin {} not all in tokens,continue'.format(
                        txt, verdict))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                logging.info(
                    '{} feature length < pinyin length,continue'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
            sample.append(line)
            if len(sample) == batch:
                break

        if train and self.augment.available():
            for line in sample:
                wp, txt = line.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except Exception:
                    continue
                if len(data) < 400:
                    logging.info('{} wav too short < 25ms,skip'.format(wp))
                    continue
                if len(data) > max_samples:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                speech_feature, in_len = self._featurize(data)
                py = self.text_to_vocab(txt)
                # check_valid returns the offending token (truthy) on failure,
                # so a plain ``not`` test would never reject anything.
                if self.check_valid(
                        py, self.text_featurizer.vocab_array) is not True:
                    continue
                text_feature = self.text_featurizer.extract(py)
                if in_len < len(text_feature):
                    continue
                max_input = max(max_input, len(speech_feature))
                max_label1 = max(max_label1, len(text_feature))
                speech_features.append(speech_feature)
                input_length.append(in_len)
                y1.append(np.array(text_feature))
                label_length1.append(len(text_feature))

        if self.speech_config['use_mel_layer']:
            if self.streaming:
                # Round up to whole chunks and cap the step counts accordingly.
                max_input = max_input // self.chunk * self.chunk + self.chunk
                max_in_len = max_input // self.chunk
                max_in_len *= self._steps_per_chunk()
                input_length = np.clip(input_length, 0, max_in_len)
            speech_features = self.speech_featurizer.pad_signal(
                speech_features, max_input)
        else:
            speech_features = self._pad_features(speech_features, max_input)
        y1 = self._pad_labels(y1, max_label1)

        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def generator(self, train=True):
        """Yield batches forever; adds a guided-attention matrix when ``LAS``."""
        while 1:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                logging.info('load data length zero,continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length,
                                                     label_length,
                                                     np.max(input_length),
                                                     label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
_cond, _body, loop_vars=(b_i, B, stop_flag, decoded), shape_invariants=(tf.TensorShape([]), tf.TensorShape([]), tf.TensorShape([None]), tf.TensorShape([None, None]))) return decoded if __name__ == '__main__': from utils.user_config import UserConfig from utils.text_featurizers import TextFeaturizer import time config = UserConfig(r'D:\TF2-ASR\configs\lm_data.yml', r'D:\TF2-ASR\configs\transformer.yml') vocab_featurizer = TextFeaturizer(config['lm_vocab']) word_featurizer = TextFeaturizer(config['lm_word']) model_config = config['model_config'] model_config.update({ 'input_vocab_size': vocab_featurizer.num_classes, 'target_vocab_size': word_featurizer.num_classes }) model = Transformer(**model_config) model._build() model.recognize(np.ones([2, 10])) s = time.time() c = model.recognize(np.ones([2, 10])) e = time.time() print(c, e - s)
class LM_DataLoader():
    """Batch provider for the pinyin-to-character language model.

    Produces padded pinyin-id / character-id pairs and, for training
    batches, BERT features of the target sentences.
    """

    def __init__(self, config, training=True):
        """Load sentences (and BERT when training) and build the featurizers."""
        self.train = training
        self.init_all(config)
        self.vocab_featurizer = TextFeaturizer(config['lm_vocab'])
        self.word_featurizer = TextFeaturizer(config['lm_word'])
        self.init_text_to_vocab()
        self.batch = config['running_config']['batch_size']
        self.epochs = 1

    def init_bert(self, config, checkpoint):
        """Load a frozen BERT model from ``checkpoint``."""
        model = load_trained_model_from_checkpoint(config,
                                                   checkpoint,
                                                   trainable=False,
                                                   seq_len=None)
        return model

    def load_state(self, outdir):
        """Restore per-sentence pick counts from ``outdir/dg_state.npy``.

        If the saved counts do not match the current number of training
        sentences they are discarded and the counters are reset, because
        ``generate`` uses the counts as indices into ``train_texts``.
        A missing or unreadable file is reported and otherwise ignored.
        """
        try:
            picks = np.load(os.path.join(outdir,
                                         'dg_state.npy')).flatten().tolist()
            train_texts = getattr(self, 'train_texts', None)
            if train_texts is not None and len(picks) != len(train_texts):
                print('history train list not equal train list,use init state')
                self.train_pick = [0] * len(train_texts)
                self.epochs = 1
                return
            self.train_pick = picks
            self.epochs = 1 + int(np.mean(self.train_pick))
        except FileNotFoundError:
            print('not found state file')
        except Exception:
            # Best effort: keep the initial state, but do not swallow
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            print('load state falied,use init state')

    def save_state(self, outdir):
        """Write the per-sentence pick counts to ``outdir/dg_state.npy``."""
        np.save(os.path.join(outdir, 'dg_state.npy'),
                np.array(self.train_pick))

    def return_data_types(self):
        """Return the tf dtypes of one training batch."""
        return (tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        """Return the tf shapes of one training batch."""
        return (tf.TensorShape([None, None]), tf.TensorShape([None, None]),
                tf.TensorShape([None, None, 768]))

    def get_per_epoch_steps(self):
        """Number of full training batches in the training sentences."""
        return len(self.train_texts) // self.batch

    def eval_per_epoch_steps(self):
        """Number of full batches in the test sentences."""
        return len(self.test_texts) // self.batch

    def init_all(self, config):
        """Load BERT and its tokenizer (training only), then the sentences."""
        if self.train:
            bert_config = config['bert']['config_json']
            bert_checkpoint = config['bert']['bert_ckpt']
            bert_vocab = config['bert']['bert_vocab']
            bert_vocabs = load_vocabulary(bert_vocab)
            self.bert_token = Tokenizer(bert_vocabs)
            self.bert = self.init_bert(bert_config, bert_checkpoint)
        self.get_sentence(
            config['train_list'] if self.train else config['eval_list'],
            training=self.train)

    def init_text_to_vocab(self):
        """Register custom pinyin phrases and install ``self.text_to_vocab``."""
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
        })

        def text_to_vocab_func(txt):
            return pypinyin.lazy_pinyin(txt, 1, errors='ignore')

        self.text_to_vocab = text_to_vocab_func

    def get_sentence(self, data_path, training):
        """Read sentences (at most 150 characters each) from ``data_path``.

        In training mode the first 99% become ``train_texts`` and the rest
        ``test_texts``; otherwise all sentences go to ``test_texts``.
        """
        from tqdm import tqdm
        with open(data_path, encoding='utf-8') as f:
            data = f.readlines()
        txts = []
        for raw in tqdm(data):
            txt = raw.strip()
            if len(txt) > 150:
                continue
            txts.append(txt)
        if training:
            num = len(txts)
            self.train_texts = txts[:int(num * 0.99)]
            self.test_texts = txts[int(num * 0.99):]
            self.train_pick = [0] * len(self.train_texts)
        else:
            self.test_texts = txts
            self.offset = 0

    def preprocess(self, tokens, txts):
        """Convert pinyin/character pairs to id arrays with start/end ids.

        Pairs containing an out-of-vocabulary pinyin or character (or that
        are empty) are dropped.

        Returns:
            ``(x, y, new)``: pinyin id arrays, character id arrays, and the
            sentences that were kept.
        """
        x = []
        y = []
        new = []
        pin_index = self.vocab_featurizer.token_to_index
        word_index = self.word_featurizer.token_to_index
        for token, txt in zip(tokens, txts):
            if not self.check_valid(token, self.vocab_featurizer.vocab_array):
                continue
            if not self.check_valid(txt, self.word_featurizer.vocab_array):
                continue
            x_ = [self.vocab_featurizer.startid()]
            y_ = [self.word_featurizer.startid()]
            x_.extend(pin_index[i] for i in token)
            y_.extend(word_index[i] for i in txt)
            x_.append(self.vocab_featurizer.endid())
            y_.append(self.word_featurizer.endid())
            x.append(np.array(x_))
            y.append(np.array(y_))
            new.append(txt)
        return x, y, new

    def bert_decode(self, x, x2=None):
        """Tokenize sentences for BERT.

        When ``x2`` is given, token positions where ``x2`` equals 2 are
        replaced by id 103 (NOTE(review): presumably the [MASK] id of the
        BERT vocabulary — confirm).
        """
        tokens, segs = [], []
        if x2 is not None:
            for i, j in zip(x, x2):
                t, s = self.bert_token.encode(''.join(i))
                index = np.where(j == 2)[0]
                for n in index:
                    t[int(n)] = 103
                tokens.append(t)
                segs.append(s)
        else:
            for i in x:
                t, s = self.bert_token.encode(''.join(i))
                tokens.append(t)
                segs.append(s)
        return tokens, segs

    def pad(self, x, mode=1):
        """Pad a batch to its longest item.

        ``mode == 2`` pads 2-D feature arrays along axis 0 with -10; any
        other mode post-pads id sequences via ``pad_sequences``.
        """
        length = 0
        for i in x:
            length = max(length, len(i))
        if mode == 2:
            for i in range(len(x)):
                pading = np.ones([length - len(x[i]), x[i].shape[1]]) * -10.
                x[i] = np.vstack((x[i], pading))
        else:
            x = pad_sequences(x, length, padding='post', truncating='post')
        return x

    def get_bert_feature(self, bert_t, bert_s):
        """Run BERT on padded tokens; positions past each length become -10."""
        length = [len(i) for i in bert_t]
        max_len = max(length)
        bert_s = tf.keras.preprocessing.sequence.pad_sequences(
            bert_s, max_len, padding='post', truncating='post')
        bert_t = tf.keras.preprocessing.sequence.pad_sequences(
            bert_t, max_len, padding='post', truncating='post')
        features = self.bert.predict([bert_t, bert_s])
        for idx, l in enumerate(length):
            features[idx, l:] = -10.
        return features

    def check_valid(self, txt, vocab_list):
        """Return True iff ``txt`` is non-empty and fully in ``vocab_list``."""
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return False
        return True

    def generate(self, train=True):
        """Sample one batch of ``(x, y, bert_features)``.

        Training batches are drawn at random from the ``2 * batch`` least
        picked sentences. If every sampled sentence is filtered out, empty
        arrays are returned so that the caller can skip the batch.
        """
        if train:
            indexs = np.argsort(self.train_pick)[:2 * self.batch].tolist()
            # Clamp so a corpus smaller than the batch does not raise.
            indexs = random.sample(indexs, min(self.batch, len(indexs)))
            sample = [self.train_texts[i] for i in indexs]
            for i in indexs:
                self.train_pick[int(i)] += 1
            if self.train_pick:
                self.epochs = 1 + int(np.mean(self.train_pick))
        else:
            sample = random.sample(self.test_texts,
                                   min(self.batch, len(self.test_texts)))
        trainx = [self.text_to_vocab(i) for i in sample]
        trainy = sample
        x, y, new = self.preprocess(trainx, trainy)
        if not new:
            # Nothing survived filtering; BERT cannot run on an empty batch.
            return (np.zeros((0, 0), 'int32'), np.zeros((0, 0), 'int32'),
                    np.zeros((0, 0, 768), 'float32'))
        e_bert_t, e_bert_s = self.bert_decode(new)
        e_features = self.get_bert_feature(e_bert_t, e_bert_s)
        x = self.pad(x)
        y = self.pad(y)
        e_features = self.pad(e_features, 2)
        x = np.array(x)
        y = np.array(y)
        e_features = np.array(e_features, dtype='float32')
        return x, y, e_features

    def eval_generate(self, ):
        """Return the next sequential evaluation batch ``(x, y)`` as int32."""
        sample = self.test_texts[self.offset:self.offset + self.batch]
        self.offset += self.batch
        trainx = [self.text_to_vocab(i) for i in sample]
        trainy = sample
        x, y, new = self.preprocess(trainx, trainy)
        x = self.pad(x)
        y = self.pad(y)
        x = np.array(x, 'int32')
        y = np.array(y, 'int32')
        return x, y

    def generator(self, train=True):
        """Yield ``(x, y, features)`` batches forever, skipping empty ones."""
        while 1:
            x, y, features = self.generate(train)
            if x.shape[0] == 0:
                print('load data length zero,continue')
                continue
            yield x, y, features
class MultiTask_DataLoader(): def __init__(self, config_dict, training=True): self.speech_config = config_dict['speech_config'] self.text1_config = config_dict['decoder1_config'] self.text2_config = config_dict['decoder2_config'] self.text3_config = config_dict['decoder3_config'] self.augment_config = config_dict['augments_config'] self.batch = config_dict['learning_config']['running_config'][ 'batch_size'] self.speech_featurizer = SpeechFeaturizer(self.speech_config) self.token1_featurizer = TextFeaturizer(self.text1_config) self.token2_featurizer = TextFeaturizer(self.text2_config) self.token3_featurizer = TextFeaturizer(self.text3_config) self.make_file_list( self.speech_config['train_list'] if training else self.speech_config['eval_list'], training) self.make_maps(config_dict) self.augment = Augmentation(self.augment_config) self.epochs = 1 self.steps = 0 def load_state(self, outdir): try: dg_state = np.load(os.path.join(outdir, 'dg_state.npz')) self.epochs = int(dg_state['epoch']) self.train_offset = int(dg_state['train_offset']) train_list = dg_state['train_list'].tolist() if len(train_list) != len(self.train_list): logging.info( 'history train list not equal train list ,data loader use init state' ) self.epochs = 0 self.train_offset = 0 except FileNotFoundError: logging.info('not found state file,init state') except: logging.info('load state falied,use init state') def save_state(self, outdir): np.savez(os.path.join(outdir, 'dg_state.npz'), epoch=self.epochs, train_offset=self.train_offset, train_list=self.train_list) def return_data_types(self): return (tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32) def return_data_shape(self): f, c = self.speech_featurizer.compute_feature_dim() return ( tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer'] else tf.TensorShape( [None, None, f, c]), tf.TensorShape([ None, ]), tf.TensorShape([None, None]), tf.TensorShape([ None, ]), tf.TensorShape([None, None]), 
tf.TensorShape([ None, ]), tf.TensorShape([None, None]), tf.TensorShape([ None, ]), ) def get_per_epoch_steps(self): return len(self.train_list) // self.batch def eval_per_epoch_steps(self): return len(self.test_list) // self.batch def make_maps(self, config): with open(config['map_path']['phone'], encoding='utf-8') as f: data = f.readlines() data = [i.strip() for i in data if i != ''] self.phone_map = {} phone_map = {} for line in data: try: key, phone = line.strip().split('\t') except: continue phone_map[key] = phone.split(' ') self.phone_map = phone_map def map(self, txt): pys = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True) pys = [i[0] for i in pys] phones = [] for i in pys: phones += self.phone_map[i] words = ''.join(pys) words = list(words) return pys, phones, words def make_file_list(self, wav_list, training=True): with open(wav_list, encoding='utf-8') as f: data = f.readlines() data = [i.strip() for i in data if i != ''] num = len(data) if training: self.train_list = data[:int(num * 0.99)] self.test_list = data[int(num * 0.99):] np.random.shuffle(self.train_list) self.train_offset = 0 self.test_offset = 0 logging.info('train list : {} test list:{}'.format( len(self.train_list), len(self.test_list))) else: self.test_list = data self.offset = 0 logging.info('eval list: {}'.format(len(self.test_list))) def only_chinese(self, word): txt = '' for ch in word: if '\u4e00' <= ch <= '\u9fff': txt += ch else: continue return txt def check_valid(self, txt, vocab_list): if len(txt) == 0: return False for n in txt: if n in vocab_list: pass else: return n return True def eval_data_generator(self): sample = self.test_list[self.offset:self.offset + self.batch] self.offset += self.batch speech_features = [] input_length = [] words_label = [] words_label_length = [] phone_label = [] phone_label_length = [] py_label = [] py_label_length = [] max_input = 0 max_label_words = 0 max_label_phone = 0 max_label_py = 0 for i in sample: wp, txt = i.strip().split('\t') try: data 
= self.speech_featurizer.load_wav(wp) except: logging.info('{} load data failed,skip'.format(wp)) continue if len(data) < 400: continue elif len( data ) > self.speech_featurizer.sample_rate * self.speech_config[ 'wav_max_duration']: logging.info( '{} duration out of wav_max_duration({}),skip'.format( wp, self.speech_config['wav_max_duration'])) continue if self.speech_config['only_chinese']: txt = self.only_chinese(txt) if self.speech_config['use_mel_layer']: speech_feature = data / np.abs(data).max() speech_feature = np.expand_dims(speech_feature, -1) in_len = len(speech_feature) // ( self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) * self.speech_config['stride_ms']) else: speech_feature = self.speech_featurizer.extract(data) in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor']) py, phone, word = self.map(txt) if len(py) == 0: continue if not self.check_valid(word, self.token1_featurizer.vocab_array): logging.info( ' {} txt word {} not all in tokens,continue'.format( txt, py)) continue if not self.check_valid(phone, self.token1_featurizer.vocab_array): logging.info( ' {} txt phone {} not all in tokens,continue'.format( txt, py)) continue if not self.check_valid(py, self.token1_featurizer.vocab_array): logging.info( ' {} txt pinyin {} not all in tokens,continue'.format( txt, py)) continue word_text_feature = self.token1_featurizer.extract(word) phone_text_feature = self.token2_featurizer.extract(phone) py_text_feature = self.token3_featurizer.extract(py) if in_len < len(word_text_feature): continue max_label_words = max(max_label_words, len(word_text_feature)) max_label_phone = max(max_label_phone, len(phone_text_feature)) max_label_py = max(max_label_py, len(py_text_feature)) max_input = max(max_input, len(speech_feature)) speech_features.append(speech_feature) input_length.append(in_len) words_label.append(np.array(word_text_feature)) words_label_length.append(len(word_text_feature)) 
phone_label.append(np.array(phone_text_feature)) phone_label_length.append(len(phone_text_feature)) py_label.append(np.array(py_text_feature)) py_label_length.append(len(py_text_feature)) if self.speech_config['use_mel_layer']: speech_features = self.speech_featurizer.pad_signal( speech_features, max_input) else: for i in range(len(speech_features)): if speech_features[i].shape[0] < max_input: pad = np.ones([ max_input - speech_features[i].shape[0], speech_features[i].shape[1], speech_features[i].shape[2] ]) * speech_features[i].min() speech_features[i] = np.vstack((speech_features[i], pad)) words_label = self.pad(words_label, max_label_words) phone_label = self.pad(phone_label, max_label_phone) py_label = self.pad(py_label, max_label_py) speech_features = np.array(speech_features, 'float32') words_label = np.array(words_label, 'int32') phone_label = np.array(phone_label, 'int32') py_label = np.array(py_label, 'int32') input_length = np.array(input_length, 'int32') words_label_length = np.array(words_label_length, 'int32') phone_label_length = np.array(phone_label_length, 'int32') py_label_length = np.array(py_label_length, 'int32') return speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length def pad(self, words_label, max_label_words): for i in range(len(words_label)): if words_label[i].shape[0] < max_label_words: pad = np.ones(max_label_words - words_label[i].shape[0] ) * self.token1_featurizer.pad words_label[i] = np.hstack((words_label[i], pad)) return words_label def GuidedAttention(self, N, T, g=0.2): W = np.zeros((N, T), dtype=np.float32) for n in range(N): for t in range(T): W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N))**2 / (2 * g * g)) return W def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape): att_targets = [] for i, j in zip(input_length, targets_length): i = int(i) step = int(j) pad = np.ones([inputs_shape, mel_target_shape]) * -1. 
pad[i:, :step] = 1 att_target = self.GuidedAttention(i, step, 0.2) pad[:att_target.shape[0], :att_target.shape[1]] = att_target att_targets.append(pad) att_targets = np.array(att_targets) return att_targets.astype('float32') def generate(self, train=True): sample = [] speech_features = [] input_length = [] words_label = [] words_label_length = [] phone_label = [] phone_label_length = [] py_label = [] py_label_length = [] max_input = 0 max_label_words = 0 max_label_phone = 0 max_label_py = 0 if train: batch = self.batch // 2 if self.augment.available() else self.batch else: batch = self.batch for i in range(batch * 10): if train: line = self.train_list[self.train_offset] self.train_offset += 1 if self.train_offset > len(self.train_list) - 1: self.train_offset = 0 np.random.shuffle(self.train_list) self.epochs += 1 else: line = self.test_list[self.test_offset] self.test_offset += 1 if self.test_offset > len(self.test_list) - 1: self.test_offset = 0 wp, txt = line.strip().split('\t') try: data = self.speech_featurizer.load_wav(wp) except: logging.info('{} load data failed,skip'.format(wp)) continue if len(data) < 400: continue elif len( data ) > self.speech_featurizer.sample_rate * self.speech_config[ 'wav_max_duration']: logging.info( '{} duration out of wav_max_duration({}),skip'.format( wp, self.speech_config['wav_max_duration'])) continue if self.speech_config['only_chinese']: txt = self.only_chinese(txt) if self.speech_config['use_mel_layer']: speech_feature = data / np.abs(data).max() speech_feature = np.expand_dims(speech_feature, -1) in_len = len(speech_feature) // ( self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) * self.speech_config['stride_ms']) else: speech_feature = self.speech_featurizer.extract(data) in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor']) py, phone, word = self.map(txt) if len(py) == 0: logging.info('py length', len(py), 'skip') continue if self.check_valid( word, 
self.token1_featurizer.vocab_array) is not True: logging.info( ' {} txt word {} not all in tokens,continue'.format( txt, self.check_valid(word, self.token1_featurizer.vocab_array))) continue # if self.check_valid( phone, self.token2_featurizer.vocab_array) is not True: logging.info( ' {} txt phone {} not all in tokens,continue'.format( txt, self.check_valid(phone, self.token2_featurizer.vocab_array))) continue # if self.check_valid( py, self.token3_featurizer.vocab_array) is not True: logging.info(' {} txt py {} not all in tokens,continue'.format( txt, self.check_valid(py, self.token3_featurizer.vocab_array))) continue word_text_feature = self.token1_featurizer.extract(word) phone_text_feature = self.token2_featurizer.extract(phone) py_text_feature = self.token3_featurizer.extract(py) if in_len < len(word_text_feature): continue max_label_words = max(max_label_words, len(word_text_feature)) max_label_phone = max(max_label_phone, len(phone_text_feature)) max_label_py = max(max_label_py, len(py_text_feature)) max_input = max(max_input, len(speech_feature)) speech_features.append(speech_feature) input_length.append(in_len) words_label.append(np.array(word_text_feature)) words_label_length.append(len(word_text_feature)) phone_label.append(np.array(phone_text_feature)) phone_label_length.append(len(phone_text_feature)) py_label.append(np.array(py_text_feature)) py_label_length.append(len(py_text_feature)) sample.append(line) if len(sample) == batch: break if train and self.augment.available(): for i in sample: wp, txt = i.strip().split('\t') try: data = self.speech_featurizer.load_wav(wp) except: continue if len(data) < 400: continue elif len( data ) > self.speech_featurizer.sample_rate * self.speech_config[ 'wav_max_duration']: continue data = self.augment.process(data) if self.speech_config['only_chinese']: txt = self.only_chinese(txt) if self.speech_config['use_mel_layer']: speech_feature = data / np.abs(data).max() speech_feature = np.expand_dims(speech_feature, -1) 
in_len = len(speech_feature) // ( self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) * self.speech_config['stride_ms']) else: speech_feature = self.speech_featurizer.extract(data) in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor']) py, phone, word = self.map(txt) if len(py) == 0: continue word_text_feature = self.token1_featurizer.extract(word) phone_text_feature = self.token2_featurizer.extract(phone) py_text_feature = self.token3_featurizer.extract(py) if in_len < len(word_text_feature): continue max_label_words = max(max_label_words, len(word_text_feature)) max_label_phone = max(max_label_phone, len(phone_text_feature)) max_label_py = max(max_label_py, len(py_text_feature)) max_input = max(max_input, len(speech_feature)) speech_features.append(speech_feature) input_length.append(in_len) words_label.append(np.array(word_text_feature)) words_label_length.append(len(word_text_feature)) phone_label.append(np.array(phone_text_feature)) phone_label_length.append(len(phone_text_feature)) py_label.append(np.array(py_text_feature)) py_label_length.append(len(py_text_feature)) if self.speech_config['use_mel_layer']: speech_features = self.speech_featurizer.pad_signal( speech_features, max_input) else: for i in range(len(speech_features)): if speech_features[i].shape[0] < max_input: pad = np.ones([ max_input - speech_features[i].shape[0], speech_features[i].shape[1], speech_features[i].shape[2] ]) * speech_features[i].min() speech_features[i] = np.vstack((speech_features[i], pad)) words_label = self.pad(words_label, max_label_words) phone_label = self.pad(phone_label, max_label_phone) py_label = self.pad(py_label, max_label_py) speech_features = np.array(speech_features, 'float32') words_label = np.array(words_label, 'int32') phone_label = np.array(phone_label, 'int32') py_label = np.array(py_label, 'int32') input_length = np.array(input_length, 'int32') words_label_length = np.array(words_label_length, 'int32') 
phone_label_length = np.array(phone_label_length, 'int32') py_label_length = np.array(py_label_length, 'int32') return speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length def generator(self, train=True): while 1: speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length = self.generate( train) yield speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length