import logging
import os
import random

import numpy as np
import pypinyin
import tensorflow as tf
# `lcut` is used below as `i.word`, which matches jieba's POS-tagging API.
from jieba.posseg import lcut
from keras_bert import Tokenizer, load_trained_model_from_checkpoint, load_vocabulary

# NOTE: the three project-local import paths below are assumptions; adjust
# them to this repository's actual module layout.
from utils.speech_featurizers import SpeechFeaturizer
from utils.text_featurizers import TextFeaturizer
from augmentations.augments import Augmentation
class AM_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],
                            training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0

    def load_state(self, outdir):
        try:
            self.pick_index = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
            self.epochs = 1 + int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('not found state file')
        except:
            print('load state failed, use init state')

    def save_state(self, outdir):
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.pick_index))

    def return_data_types(self):
        # The LAS generator yields (x, input_length, labels, label_length, guide_matrix).
        if self.LAS:
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        else:
            return (tf.float32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.LAS:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None, None]),
            )
        else:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, ]),
            )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def init_text_to_vocab(self):
        # Pin down pinyin readings for phrases pypinyin gets wrong by default.
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']],
        })

        def text_to_vocab_func(txt):
            pins = pypinyin.pinyin(txt)
            pins = [i[0] for i in pins]
            return pins

        self.text_to_vocab = text_to_vocab_func

    def augment_data(self, wavs, label, label_length):
        if not self.augment.available():
            return None
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):
            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            if speech_feature.shape[0] // self.speech_config['reduction_factor'] < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_wav = max(max_wav, len(data))
            wavs_.append(data)
            mels.append(speech_feature)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            label_.append(label[idx])
            label_length_.append(label_length[idx])
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)
        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')
        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')
        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list = data
            self.offset = 0

    def only_chinese(self, word):
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch
        return txt

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            txt = txt.replace(' ', '')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                print('{} load data failed'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                print('{} duration out of wav_max_duration({})'.format(wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            max_input = max(max_input, speech_feature.shape[0])
            py = self.text_to_vocab(txt)
            if not self.check_valid(py, self.text_featurizer.vocab_array):
                print('{} txt pinyin {} not all in tokens, continue'.format(txt, py))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                print('{} feature length < pinyin length, continue'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def check_valid(self, txt, vocab_list):
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return False
        return True

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        if train:
            # Pick the least-used lines first (pick_index counts how often each
            # training line has been sampled), then draw half of them at random.
            batch = self.batch if self.augment.available() else self.batch * 2
            indexs = np.argsort(self.pick_index)[:batch]
            indexs = random.sample(indexs.tolist(), batch // 2)
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, self.batch)
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                print('{} load data failed'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                print('{} duration out of wav_max_duration({})'.format(wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            py = self.text_to_vocab(txt)
            if not self.check_valid(py, self.text_featurizer.vocab_array):
                print('{} txt pinyin {} not all in tokens, continue'.format(txt, py))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                print('{} feature length < pinyin length, continue'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
        if train and self.augment.available():
            # Second pass over the same sample with waveform augmentation applied.
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except:
                    print('load data failed')
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                py = self.text_to_vocab(txt)
                if not self.check_valid(py, self.text_featurizer.vocab_array):
                    continue
                text_feature = self.text_featurizer.extract(py)
                if in_len < len(text_feature):
                    continue
                max_input = max(max_input, len(speech_feature))
                max_label1 = max(max_label1, len(text_feature))
                speech_features.append(speech_feature)
                input_length.append(in_len)
                y1.append(np.array(text_feature))
                label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def generator(self, train=True):
        while 1:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                print('load data length zero, continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length, label_length,
                                                     np.max(input_length), label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
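
# Usage sketch, not part of the original loader: how a loader like the one
# above can be wired into tf.data via return_data_types()/return_data_shape().
# `config_dict` must have the nested structure read in __init__; the function
# name and the way the dataset is consumed are illustrative assumptions.
def _am_dataset_example(config_dict):
    loader = AM_DataLoader(config_dict, training=True)
    dataset = tf.data.Dataset.from_generator(
        loader.generator,
        args=(True,),  # forwarded to generator(train=True)
        output_types=loader.return_data_types(),
        output_shapes=loader.return_data_shape(),
    )
    # Each element is (x, input_length, labels, label_length),
    # plus a guide matrix when loader.LAS is True.
    return dataset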
class MultiTask_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.text4_config = config_dict['decoder4_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.token4_featurizer = TextFeaturizer(self.text4_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],
                            training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.LAS = True
        self.steps = 0
        self.init_bert(config_dict)

    def load_state(self, outdir):
        try:
            self.pick_index = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
            self.epochs = 1 + int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('not found state file')
        except:
            print('load state failed, use init state')

    def save_state(self, outdir):
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.pick_index))

    def load_bert(self, config, checkpoint):
        model = load_trained_model_from_checkpoint(config, checkpoint, trainable=False, seq_len=None)
        return model

    def init_bert(self, config):
        bert_config = config['bert']['config_json']
        bert_checkpoint = config['bert']['bert_ckpt']
        bert_vocab = config['bert']['bert_vocab']
        bert_vocabs = load_vocabulary(bert_vocab)
        self.bert_token = Tokenizer(bert_vocabs)
        self.bert = self.load_bert(bert_config, bert_checkpoint)

    def bert_decode(self, x):
        tokens, segs = [], []
        for i in x:
            t, s = self.bert_token.encode(''.join(i))
            tokens.append(t)
            segs.append(s)
        return tokens, segs

    def get_bert_feature(self, bert_t, bert_s):
        f = []
        for t, s in zip(bert_t, bert_s):
            t = np.expand_dims(np.array(t), 0)
            s = np.expand_dims(np.array(s), 0)
            feature = self.bert.predict([t, s])
            f.append(feature[0])
        # Drop the leading [CLS] position.
        return f[0][1:]

    def return_data_types(self):
        return (tf.float32, tf.float32, tf.float32, tf.int32, tf.int32, tf.int32, tf.int32,
                tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        return (
            tf.TensorShape([None, None, f, c]),
            tf.TensorShape([None, None, 1]),
            tf.TensorShape([None, None, 768]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None, None]),
        )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def make_maps(self, config):
        with open(config['map_path']['pinyin'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        self.py_map = {}
        for line in data:
            key, py = line.strip().split('\t')
            self.py_map[key] = py
            if len(py.split(' ')) > 1:
                for i, j in zip(list(key), py.split(' ')):
                    self.py_map[i] = j
        with open(config['map_path']['phone'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        self.phone_map = {}
        phone_map = {}
        for line in data:
            key, py = line.strip().split('\t')
            phone_map[key] = py
        for key in self.py_map.keys():
            key_py = self.py_map[key]
            if len(key) > 1:
                phone = []
                for n in key_py.split(' '):
                    phone += [phone_map[n]]
                self.phone_map[key] = ' '.join(phone)
            else:
                self.phone_map[key] = phone_map[self.py_map[key]]

    def map(self, txt):
        cut = lcut(txt)
        pys = []
        phones = []
        words = []
        for i in cut:
            word = i.word
            if word in self.py_map.keys():
                py = self.py_map[word]
                phone = self.phone_map[word]
                pys += py.split(' ')
                phones += phone.split(' ')
                words += list(''.join(py.split(' ')))
            else:
                for j in word:
                    pys += [self.py_map[j]]
                    phones += self.phone_map[j].split(' ')
                    words += list(''.join(self.py_map[j]))
        return pys, phones, words

    def augment_data(self, wavs, label, label_length):
        if not self.augment.available():
            return None
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):
            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            if speech_feature.shape[0] // self.speech_config['reduction_factor'] < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_wav = max(max_wav, len(data))
            wavs_.append(data)
            mels.append(speech_feature)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            label_.append(label[idx])
            label_length_.append(label_length[idx])
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)
        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')
        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')
        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list = data
            self.offset = 0

    def only_chinese(self, word):
        for ch in word:
            if not ('\u4e00' <= ch <= '\u9fff'):
                return False
        return True

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        mels = []
        input_length = []
        words_label = []
        words_label_length = []
        phone_label = []
        phone_label_length = []
        py_label = []
        py_label_length = []
        txt_label = []
        txt_label_length = []
        bert_features = []
        wavs = []
        max_wav = 0
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        max_label_txt = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                print('load data failed')
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * 7:
                continue
            if not self.only_chinese(txt):
                continue
            speech_feature = self.speech_featurizer.extract(data)
            max_input = max(max_input, speech_feature.shape[0])
            py, phone, word = self.map(txt)
            if len(py) == 0:
                continue
            e_bert_t, e_bert_s = self.bert_decode([txt])
            bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            txt_text_feature = self.token4_featurizer.extract(list(txt))
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_label_txt = max(max_label_txt, len(txt_text_feature))
            max_wav = max(max_wav, len(data))
            if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature):
                continue
            mels.append(speech_feature)
            wavs.append(data)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))
            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))
            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
            txt_label.append(np.array(txt_text_feature))
            txt_label_length.append(len(txt_text_feature))
            bert_features.append(bert_feature)
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        for i in range(len(bert_features)):
            if bert_features[i].shape[0] < max_label_txt:
                padding = np.ones([max_label_txt - len(bert_features[i]), 768]) * -10.
                bert_features[i] = np.vstack((bert_features[i], padding))
        wavs = self.speech_featurizer.pad_signal(wavs, max_wav)
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        txt_label = self.pad(txt_label, max_label_txt)
        x = np.array(mels, 'float32')
        bert_features = np.array(bert_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        txt_label = np.array(txt_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        txt_label_length = np.array(txt_label_length, 'int32')
        wavs = np.array(np.expand_dims(wavs, -1), 'float32')
        return x, wavs, bert_features, input_length, words_label, words_label_length, phone_label, \
               phone_label_length, py_label, py_label_length, txt_label, txt_label_length

    def pad(self, words_label, max_label_words):
        for i in range(len(words_label)):
            if words_label[i].shape[0] < max_label_words:
                pad = np.ones(max_label_words - words_label[i].shape[0]) * self.token1_featurizer.pad
                words_label[i] = np.hstack((words_label[i], pad))
        return words_label

    def GuidedAttention(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        if train:
            batch = self.batch if self.augment.available() else self.batch * 2
            indexs = np.argsort(self.pick_index)[:batch]
            indexs = random.sample(indexs.tolist(), batch // 2)
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, self.batch)
        mels = []
        input_length = []
        words_label = []
        words_label_length = []
        phone_label = []
        phone_label_length = []
        py_label = []
        py_label_length = []
        txt_label = []
        txt_label_length = []
        bert_features = []
        wavs = []
        max_wav = 0
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        max_label_txt = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                print('load data failed')
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * 7:
                continue
            if not self.only_chinese(txt):
                continue
            speech_feature = self.speech_featurizer.extract(data)
            py, phone, word = self.map(txt)
            if len(py) == 0 or len(phone) == 0 or len(word) == 0:
                continue
            e_bert_t, e_bert_s = self.bert_decode([txt])
            bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            txt_text_feature = self.token4_featurizer.extract(list(txt))
            if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature) or \
                    speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(word_text_feature) or \
                    speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(phone_text_feature):
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_label_txt = max(max_label_txt, len(txt_text_feature))
            max_wav = max(max_wav, len(data))
            mels.append(speech_feature)
            wavs.append(data)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))
            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))
            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
            txt_label.append(np.array(txt_text_feature))
            txt_label_length.append(len(txt_text_feature))
            bert_features.append(bert_feature)
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except:
                    print('load data failed')
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * 7:
                    continue
                if not self.only_chinese(txt):
                    continue
                data = self.augment.process(data)
                speech_feature = self.speech_featurizer.extract(data)
                py, phone, word = self.map(txt)
                if len(py) == 0 or len(phone) == 0 or len(word) == 0:
                    continue
                e_bert_t, e_bert_s = self.bert_decode([txt])
                bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)
                word_text_feature = self.token1_featurizer.extract(word)
                phone_text_feature = self.token2_featurizer.extract(phone)
                py_text_feature = self.token3_featurizer.extract(py)
                txt_text_feature = self.token4_featurizer.extract(list(txt))
                if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature) or \
                        speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(word_text_feature) or \
                        speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(phone_text_feature):
                    continue
                max_input = max(max_input, speech_feature.shape[0])
                max_wav = max(max_wav, len(data))
                max_label_words = max(max_label_words, len(word_text_feature))
                max_label_phone = max(max_label_phone, len(phone_text_feature))
                max_label_py = max(max_label_py, len(py_text_feature))
                max_label_txt = max(max_label_txt, len(txt_text_feature))
                mels.append(speech_feature)
                wavs.append(data)
                input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                words_label.append(np.array(word_text_feature))
                words_label_length.append(len(word_text_feature))
                phone_label.append(np.array(phone_text_feature))
                phone_label_length.append(len(phone_text_feature))
                py_label.append(np.array(py_text_feature))
                py_label_length.append(len(py_text_feature))
                txt_label.append(np.array(txt_text_feature))
                txt_label_length.append(len(txt_text_feature))
                bert_features.append(bert_feature)
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        for i in range(len(bert_features)):
            if bert_features[i].shape[0] < max_label_txt:
                padding = np.ones([max_label_txt - len(bert_features[i]), 768]) * -10.
                bert_features[i] = np.vstack((bert_features[i], padding))
        wavs = self.speech_featurizer.pad_signal(wavs, max_wav)
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        txt_label = self.pad(txt_label, max_label_txt)
        x = np.array(mels, 'float32')
        bert_features = np.array(bert_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        txt_label = np.array(txt_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        txt_label_length = np.array(txt_label_length, 'int32')
        wavs = np.array(np.expand_dims(wavs, -1), 'float32')
        return x, wavs, bert_features, input_length, words_label, words_label_length, phone_label, \
               phone_label_length, py_label, py_label_length, txt_label, txt_label_length

    def generator(self, train=True):
        while 1:
            x, wavs, bert_feature, input_length, words_label, words_label_length, phone_label, \
                phone_label_length, py_label, py_label_length, txt_label, txt_label_length = self.generate(train)
            guide_matrix = self.guided_attention(input_length, txt_label_length,
                                                 np.max(input_length), txt_label_length.max())
            yield x, wavs, bert_feature, input_length, words_label, words_label_length, phone_label, \
                  phone_label_length, py_label, py_label_length, txt_label, txt_label_length, guide_matrix
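
# Minimal sketch of the guided-attention target built by GuidedAttention above,
# with illustrative sizes (N=4 decoder steps, T=6 encoder frames): W[n, t] is 0
# on the scaled diagonal t/T == n/N and approaches 1 away from it, so a loss
# weighted by W pushes attention alignments toward the diagonal.
def _guided_attention_example():
    N, T, g = 4, 6, 0.2
    W = np.zeros((N, T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
    return W  # W[0, 0] == 0.0, while W[0, T - 1] is close to 1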
class MultiTask_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],
                            training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.steps = 0

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('history train list not equal train list, data loader use init state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file, init state')
        except:
            logging.info('load state failed, use init state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        return (tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        return (
            tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
            else tf.TensorShape([None, None, f, c]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
        )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def make_maps(self, config):
        with open(config['map_path']['phone'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        self.phone_map = {}
        phone_map = {}
        for line in data:
            try:
                key, phone = line.strip().split('\t')
            except:
                continue
            phone_map[key] = phone.split(' ')
        self.phone_map = phone_map

    def map(self, txt):
        # Style 8 is pypinyin's TONE3 (tone digit appended, e.g. 'zhong1').
        pys = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True)
        pys = [i[0] for i in pys]
        phones = []
        for i in pys:
            phones += self.phone_map[i]
        words = ''.join(pys)
        words = list(words)
        return pys, phones, words

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('train list: {} test list: {}'.format(len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0
            logging.info('eval list: {}'.format(len(self.test_list)))

    def only_chinese(self, word):
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch
        return txt

    def check_valid(self, txt, vocab_list):
        # Returns True when every token is known, the first unknown token otherwise.
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        words_label = []
        words_label_length = []
        phone_label = []
        phone_label_length = []
        py_label = []
        py_label_length = []
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}), skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            py, phone, word = self.map(txt)
            if len(py) == 0:
                continue
            # Each label stream is validated against its own vocabulary.
            if self.check_valid(word, self.token1_featurizer.vocab_array) is not True:
                logging.info('{} txt word {} not all in tokens, continue'.format(txt, py))
                continue
            if self.check_valid(phone, self.token2_featurizer.vocab_array) is not True:
                logging.info('{} txt phone {} not all in tokens, continue'.format(txt, py))
                continue
            if self.check_valid(py, self.token3_featurizer.vocab_array) is not True:
                logging.info('{} txt pinyin {} not all in tokens, continue'.format(txt, py))
                continue
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            if in_len < len(word_text_feature):
                continue
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_input = max(max_input, len(speech_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))
            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))
            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        speech_features = np.array(speech_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        return speech_features, input_length, words_label, words_label_length, \
               phone_label, phone_label_length, py_label, py_label_length

    def pad(self, words_label, max_label_words):
        for i in range(len(words_label)):
            if words_label[i].shape[0] < max_label_words:
                pad = np.ones(max_label_words - words_label[i].shape[0]) * self.token1_featurizer.pad
                words_label[i] = np.hstack((words_label[i], pad))
        return words_label

    def GuidedAttention(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        sample = []
        speech_features = []
        input_length = []
        words_label = []
        words_label_length = []
        phone_label = []
        phone_label_length = []
        py_label = []
        py_label_length = []
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        if train:
            batch = self.batch // 2 if self.augment.available() else self.batch
        else:
            batch = self.batch
        # Scan up to batch * 10 lines so that skipped items do not starve the batch.
        for i in range(batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            wp, txt = line.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}), skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            py, phone, word = self.map(txt)
            if len(py) == 0:
                logging.info('py length {}, skip'.format(len(py)))
                continue
            if self.check_valid(word, self.token1_featurizer.vocab_array) is not True:
                logging.info('{} txt word {} not all in tokens, continue'.format(
                    txt, self.check_valid(word, self.token1_featurizer.vocab_array)))
                continue
            if self.check_valid(phone, self.token2_featurizer.vocab_array) is not True:
                logging.info('{} txt phone {} not all in tokens, continue'.format(
                    txt, self.check_valid(phone, self.token2_featurizer.vocab_array)))
                continue
            if self.check_valid(py, self.token3_featurizer.vocab_array) is not True:
                logging.info('{} txt py {} not all in tokens, continue'.format(
                    txt, self.check_valid(py, self.token3_featurizer.vocab_array)))
                continue
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            if in_len < len(word_text_feature):
                continue
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_input = max(max_input, len(speech_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))
            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))
            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
            sample.append(line)
            if len(sample) == batch:
                break
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except:
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                py, phone, word = self.map(txt)
                if len(py) == 0:
                    continue
                word_text_feature = self.token1_featurizer.extract(word)
                phone_text_feature = self.token2_featurizer.extract(phone)
                py_text_feature = self.token3_featurizer.extract(py)
                if in_len < len(word_text_feature):
                    continue
                max_label_words = max(max_label_words, len(word_text_feature))
                max_label_phone = max(max_label_phone, len(phone_text_feature))
                max_label_py = max(max_label_py, len(py_text_feature))
                max_input = max(max_input, len(speech_feature))
                speech_features.append(speech_feature)
                input_length.append(in_len)
                words_label.append(np.array(word_text_feature))
                words_label_length.append(len(word_text_feature))
                phone_label.append(np.array(phone_text_feature))
                phone_label_length.append(len(phone_text_feature))
                py_label.append(np.array(py_text_feature))
                py_label_length.append(len(py_text_feature))
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        speech_features = np.array(speech_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        return speech_features, input_length, words_label, words_label_length, \
               phone_label, phone_label_length, py_label, py_label_length

    def generator(self, train=True):
        while 1:
            speech_features, input_length, words_label, words_label_length, \
                phone_label, phone_label_length, py_label, py_label_length = self.generate(train)
            yield speech_features, input_length, words_label, words_label_length, \
                  phone_label, phone_label_length, py_label, py_label_length
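
# Sketch of the phone-map file format make_maps() above expects: one
# '<pinyin>\t<space-separated phones>' entry per line. The two entries below
# are illustrative assumptions, not taken from a real mapping file.
def _phone_map_example():
    lines = ['zhong1\tzh ong1', 'guo2\tg uo2']
    phone_map = {}
    for line in lines:
        key, phone = line.strip().split('\t')
        phone_map[key] = phone.split(' ')
    # With this map, map('中国') would yield pys=['zhong1', 'guo2'],
    # phones=['zh', 'ong1', 'g', 'uo2'] and words=list('zhong1guo2').
    return phone_map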
class AM_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.streaming = self.speech_config['streaming']
        self.chunk = self.speech_config['sample_rate'] * self.speech_config['streaming_bucket']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],
                            training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('history train list not equal new load train list, data loader use init state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file, init state')
        except:
            logging.info('load state failed, use init state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        if self.LAS:
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        else:
            return (tf.float32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.LAS:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None, None]),
            )
        else:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, ]),
            )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def init_text_to_vocab(self):
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
        })

        def text_to_vocab_func(txt):
            pins = pypinyin.pinyin(txt)
            pins = [i[0] for i in pins]
            return pins

        self.text_to_vocab = text_to_vocab_func

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('load train list {} test list {}'.format(len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0

    def only_chinese(self, word):
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch
        return txt

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            txt = txt.replace(' ', '')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                logging.info('{} wav too short < 25ms, skip'.format(wp))
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}), skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                if not self.streaming:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    # Streaming: count whole chunks (rounding up), then frames per chunk.
                    speech_feature = data
                    speech_feature = np.expand_dims(speech_feature, -1)
                    reduce = self.speech_config['reduction_factor'] * \
                             (self.speech_featurizer.sample_rate / 1000) * \
                             self.speech_config['stride_ms']
                    in_len = len(speech_feature) // self.chunk
                    if len(speech_feature) % self.chunk != 0:
                        in_len += 1
                    chunk_times = self.chunk // reduce
                    if self.chunk % reduce != 0:
                        chunk_times += 1
                    in_len *= chunk_times
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            max_input = max(max_input, speech_feature.shape[0])
            py = self.text_to_vocab(txt)
            if self.check_valid(py, self.text_featurizer.vocab_array) is not True:
                logging.info('{} txt pinyin {} not all in tokens, skip'.format(
                    txt, self.check_valid(py, self.text_featurizer.vocab_array)))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                logging.info('{} feature length < pinyin length, skip'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer']:
            if self.streaming:
                max_input = max_input // self.chunk * self.chunk + self.chunk
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def check_valid(self, txt, vocab_list):
        # Returns True when every token is known, the first unknown token otherwise.
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        sample = []
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        if train:
            batch = self.batch // 2 if self.augment.available() else self.batch
        else:
            batch = self.batch
        # Scan up to batch * 10 lines so that skipped items do not starve the batch.
        for i in range(batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            wp, txt = line.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}), skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                if not self.streaming:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    speech_feature = data
                    speech_feature = np.expand_dims(speech_feature, -1)
                    reduce = self.speech_config['reduction_factor'] * \
                             (self.speech_featurizer.sample_rate / 1000) * \
                             self.speech_config['stride_ms']
                    in_len = len(speech_feature) // self.chunk
                    if len(speech_feature) % self.chunk != 0:
                        in_len += 1
                    chunk_times = self.chunk // reduce
                    if self.chunk % reduce != 0:
                        chunk_times += 1
                    in_len *= chunk_times
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            py = self.text_to_vocab(txt)
            if self.check_valid(py, self.text_featurizer.vocab_array) is not True:
                logging.info('{} txt pinyin {} not all in tokens, continue'.format(
                    txt, self.check_valid(py, self.text_featurizer.vocab_array)))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                logging.info('{} feature length < pinyin length, continue'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
            sample.append(line)
            if len(sample) == batch:
                break
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except:
                    continue
                if len(data) < 400:
                    logging.info('{} wav too short < 25ms, skip'.format(wp))
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    if not self.streaming:
                        speech_feature = data / np.abs(data).max()
                        speech_feature = np.expand_dims(speech_feature, -1)
                        in_len = len(speech_feature) // (
                                self.speech_config['reduction_factor'] *
                                (self.speech_featurizer.sample_rate / 1000) *
                                self.speech_config['stride_ms'])
                    else:
                        speech_feature = data
                        speech_feature = np.expand_dims(speech_feature, -1)
                        reduce = self.speech_config['reduction_factor'] * \
                                 (self.speech_featurizer.sample_rate / 1000) * \
                                 self.speech_config['stride_ms']
                        in_len = len(speech_feature) // self.chunk
                        if len(speech_feature) % self.chunk != 0:
                            in_len += 1
                        chunk_times = self.chunk // reduce
                        if self.chunk % reduce != 0:
                            chunk_times += 1
                        in_len *= chunk_times
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                py = self.text_to_vocab(txt)
                if self.check_valid(py, self.text_featurizer.vocab_array) is not True:
                    continue
                text_feature = self.text_featurizer.extract(py)
                if in_len < len(text_feature):
                    continue
                max_input = max(max_input, len(speech_feature))
                max_label1 = max(max_label1, len(text_feature))
                speech_features.append(speech_feature)
                input_length.append(in_len)
                y1.append(np.array(text_feature))
                label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer']:
            if self.streaming:
                reduce = self.speech_config['reduction_factor'] * \
                         (self.speech_featurizer.sample_rate / 1000) * \
                         self.speech_config['stride_ms']
                max_input = max_input // self.chunk * self.chunk + self.chunk
                max_in_len = max_input // self.chunk
                chunk_times = self.chunk // reduce
                if self.chunk % reduce != 0:
                    chunk_times += 1
                max_in_len *= chunk_times
                input_length = np.clip(input_length, 0, max_in_len)
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def generator(self, train=True):
        while 1:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                logging.info('load data length zero, continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length, label_length,
                                                     np.max(input_length), label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
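
# Worked sketch of the streaming input-length arithmetic used in the class
# above, with assumed values (16 kHz audio, a 0.5 s streaming bucket,
# reduction_factor=4, stride_ms=10); the real values come from speech_config.
def _streaming_length_example():
    sample_rate, streaming_bucket = 16000, 0.5
    reduction_factor, stride_ms = 4, 10
    chunk = int(sample_rate * streaming_bucket)                   # 8000 samples per chunk
    reduce = reduction_factor * (sample_rate / 1000) * stride_ms  # 640 samples per output frame
    samples = 20000                                               # a 1.25 s waveform
    in_len = samples // chunk                                     # 2 full chunks
    if samples % chunk != 0:
        in_len += 1                                               # partial chunk -> 3
    chunk_times = chunk // reduce                                 # 12 frames per chunk
    if chunk % reduce != 0:
        chunk_times += 1                                          # remainder -> 13
    return int(in_len * chunk_times)                              # 39 output frames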