Code Example #1
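# Assumed imports for this example; the project-specific module paths are not
# shown in the original source:
#   import os, random
#   import numpy as np
#   import tensorflow as tf
#   import pypinyin
#   from <project> import SpeechFeaturizer, TextFeaturizer, Augmentation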
class AM_DataLoader():

    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'], training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0
    def load_state(self, outdir):
        try:
            self.pick_index = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
            self.epochs = 1 + int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('state file not found')
        except Exception:
            print('load state failed, using init state')

    def save_state(self, outdir):
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.pick_index))

    def return_data_types(self):
        if self.LAS:
            # matches generator(): x, input_length, labels, label_length, guide_matrix
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        else:
            return (tf.float32, tf.int32, tf.int32, tf.int32)
    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.LAS:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None, None])
            )
        else:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None])
            )
    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch
    def init_text_to_vocab(self):
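        # Register phrase-level pinyin overrides for polyphonic characters (e.g. 传 read as zhuàn in book titles).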
        pypinyin.load_phrases_dict({'调大': [['tiáo'], ['dà']],
                                    '调小': [['tiáo'], ['xiǎo']],
                                    '调亮': [['tiáo'], ['liàng']],
                                    '调暗': [['tiáo'], ['àn']],
                                    '肖': [['xiāo']],
                                    '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
                                    '新传': [['xīn'], ['zhuàn']],
                                    '外传': [['wài'], ['zhuàn']],
                                    '正传': [['zhèng'], ['zhuàn']], '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
                                    })

        def text_to_vocab_func(txt):
            pins=pypinyin.pinyin(txt)
            pins=[i[0] for i in pins]
            return pins

        self.text_to_vocab = text_to_vocab_func

    def augment_data(self, wavs, label, label_length):
        if not self.augment.available():
            return None
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):

            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            if speech_feature.shape[0] // self.speech_config['reduction_factor'] < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])

            max_wav = max(max_wav, len(data))

            wavs_.append(data)

            mels.append(speech_feature)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            label_.append(label[idx])
            label_length_.append(label_length[idx])

        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1],mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))

        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)

        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')

        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')

        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')

        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list,training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list=data
            self.offset=0
    def only_chinese(self, word):
        # Keep only CJK unified ideographs; drop every other character.
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch

        return txt
    def eval_data_generator(self):
        sample=self.test_list[self.offset:self.offset+self.batch]
        self.offset+=self.batch
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            txt=txt.replace(' ','')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                print('{} load data failed'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                print('{} duration out of wav_max_duration({})'.format(wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt= self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])

            py = self.text_to_vocab(txt)
            if not self.check_valid(py, self.text_featurizer.vocab_array):
                print('{} txt pinyin {} not all in tokens, skipping'.format(txt, py))
                continue
            text_feature = self.text_featurizer.extract(py)

            if in_len < len(text_feature):
                print('{} feature length < pinyin length, skipping'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))

        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)

        else:
            for i in range(len(speech_features)):

                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0], speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))

        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))

        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')

        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')

        return x, input_length, y1, label_length1
    def check_valid(self, txt, vocab_list):
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return False
        return True
    def GuidedAttentionMatrix(self, N, T, g=0.2):
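        # Guided-attention mask (Tachibana et al., 2017): near 0 on the diagonal, rising toward 1 off-diagonal.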
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)

        return att_targets.astype('float32')
    def generate(self, train=True):

        if train:
            batch = self.batch if self.augment.available() else self.batch * 2
            indexs = np.argsort(self.pick_index)[:batch]
            indexs = random.sample(indexs.tolist(), batch // 2)
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, self.batch)
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []

        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                print('{} load data failed'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                print('{} duration out of wav_max_duration({})'.format(wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt= self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])


            py = self.text_to_vocab(txt)
            if not self.check_valid(py,self.text_featurizer.vocab_array):
                print('{} txt pinyin {} not all in tokens, skipping'.format(txt, py))
                continue
            text_feature = self.text_featurizer.extract(py)

            if in_len < len(text_feature):
                print('{} feature length < pinyin length, skipping'.format(wp))
                continue
            max_input = max(max_input,len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except Exception:
                    print('load data failed')
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])

                py = self.text_to_vocab(txt)
                if not self.check_valid(py, self.text_featurizer.vocab_array):
                    continue

                text_feature = self.text_featurizer.extract(py)


                if in_len < len(text_feature):
                    continue
                max_input = max(max_input, len(speech_feature))
                max_label1 = max(max_label1, len(text_feature))
                speech_features.append(speech_feature)

                input_length.append(in_len)
                y1.append(np.array(text_feature))
                label_length1.append(len(text_feature))

        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)

        else:
            for i in range(len(speech_features)):

                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0], speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))

        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0])*self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))

        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')

        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')

        return x, input_length, y1, label_length1
    def generator(self, train=True):
        while True:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                print('loaded batch is empty, retrying')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length, label_length, np.max(input_length),
                                                     label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
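
Usage note: a minimal sketch of wiring this loader into tf.data. This is an assumption-based illustration, not code from the project; the config file name 'am_data.yml' and its YAML layout are hypothetical.

# Hypothetical usage sketch (config path and layout assumed).
import yaml
import tensorflow as tf

with open('am_data.yml', encoding='utf-8') as f:
    config_dict = yaml.safe_load(f)

loader = AM_DataLoader(config_dict, training=True)
dataset = tf.data.Dataset.from_generator(
    lambda: loader.generator(train=True),
    output_types=loader.return_data_types(),
    output_shapes=loader.return_data_shape(),
)
x, input_length, labels, label_length = next(iter(dataset))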
Code Example #2
File: model.py  Project: X-CCS/TensorflowTTS-1
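# Assumed imports; project module paths are not shown in the original:
#   import os, logging
#   import numpy as np
#   import tensorflow as tf
#   from <project> import (SpeechFeaturizer, TextFeaturizer, Tacotron2Config, TFTacotron2,
#       FastSpeechConfig, TFFastSpeech, MelGANGeneratorConfig, TFMelGANGenerator,
#       MultiGeneratorConfig, TFMultiWindowGenerator, MelGANDiscriminatorConfig,
#       TFMelGANMultiScaleDiscriminator)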
class TTSmodel:
    def __init__(self, config=None, vocoder_config=None):
        assert config is not None or vocoder_config is not None, 'at least one config is required'
        if config is not None:
            self.config = config
            self.acoustic = config['model_name']
        else:
            self.config = None
        self.vocoder_config = vocoder_config

        if vocoder_config is not None:
            self.GL = SpeechFeaturizer(vocoder_config).inv_mel_spectrogram
            self.vocoder_type = vocoder_config['vocoder_model']
        else:
            self.GL = SpeechFeaturizer(config).inv_mel_spectrogram
            self.vocoder_type = None

        if self.config is not None:
            self.text_featurizer = TextFeaturizer(config)

    def load_model(self, training=True):
        if self.config is not None:
            if self.acoustic == 'Tacotron2':
                self.config['vocab_size'] = self.text_featurizer.num_classes
                tac_config = Tacotron2Config(**self.config)
                self.acoustic_model = TFTacotron2(tac_config, training)
            elif self.acoustic == 'FastSpeech':
                self.config['vocab_size'] = self.text_featurizer.num_classes
                fast_config = FastSpeechConfig(**self.config)
                self.acoustic_model = TFFastSpeech(fast_config)
        if self.vocoder_config is not None:
            if self.vocoder_type == 'MelGan':
                melgan_config = MelGANGeneratorConfig(**self.vocoder_config)
                self.vocoder = TFMelGANGenerator(melgan_config)
            elif self.vocoder_type == 'MultiGen':
                multi_config = MultiGeneratorConfig(**self.vocoder_config)
                self.vocoder = TFMultiWindowGenerator(multi_config)
            else:
                raise ValueError('vocoder type not supported.')
        if training and self.vocoder_type is not None:
            if self.vocoder_config['use_gan']:
                self.discriminator = TFMelGANMultiScaleDiscriminator(MelGANDiscriminatorConfig(**self.vocoder_config))

        if not training:
            assert self.config is not None
            self.acoustic_model._build()
            if self.vocoder_config is not None:
                self.vocoder._build()
            self.load_checkpoint()

    def load_checkpoint(self):
        """Load the newest checkpoint from the output directory."""
        self.checkpoint_dir = os.path.join(self.config["outdir"], "checkpoints")
        files = os.listdir(self.checkpoint_dir)
        files.sort(key=lambda x: int(x.split('_')[-1].replace('.h5', '')))
        self.acoustic_model.load_weights(os.path.join(self.checkpoint_dir, files[-1]))
        logging.info('acoustic load model at {}'.format(os.path.join(self.checkpoint_dir, files[-1])))
        if self.vocoder_config is not None:
            self.checkpoint_dir = os.path.join(self.vocoder_config["outdir"], "checkpoints")
            files = os.listdir(self.checkpoint_dir)
            files = [i for i in files if 'g' in i]  # keep generator checkpoints only
            files.sort(key=lambda x: int(x.split('_')[-1].replace('.h5', '')))
            self.vocoder.load_weights(os.path.join(self.checkpoint_dir, files[-1]))
            logging.info('vocoder load model at {}'.format(os.path.join(self.checkpoint_dir, files[-1])))
    def synthesize(self, text, spk):
        if self.config['model_name'] == 'Tacotron2':
            inp = self.text_featurizer.extract(text)
            input_length = len(inp)
            spk_id = self.text_featurizer.spker_map[spk]
            inp = np.array(inp, 'int32').reshape([1, -1])
            input_length = np.array(input_length, 'int32').reshape([1])
            spk_id = np.array([spk_id, 0], 'int32').reshape([1, -1])
            decoder_output, mel_outputs, stop_token_prediction, alignment_history = self.acoustic_model.inference(
                input_ids=inp,
                input_lengths=input_length,
                speaker_ids=spk_id,
                use_window_mask=False,
                win_front=5,
                win_back=5,
                maximum_iterations=100,
            )
        else:
            inp = self.text_featurizer.extract(text)
            spk_id = self.text_featurizer.spker_map[spk]
            inp = np.array(inp, 'int32').reshape([1, -1])
            spk_id = np.array(spk_id, 'int32').reshape([1, 1])
            decoder_output, mel_outputs, duration_pred = self.acoustic_model.inference(
                inp, tf.math.not_equal(inp, 0), spk_id)
        if self.vocoder_config is not None:
            wav = self.vocoder(mel_outputs)
            wav = wav[0].numpy().flatten()
        else:
            wav = self.GL(mel_outputs[0].numpy().T)
        return wav
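
Usage note: a hedged synthesis sketch. The YAML paths and the speaker key 'speaker0' are assumptions for illustration, not values from the project.

# Hypothetical end-to-end synthesis call.
import yaml  # assumption: configs are stored as YAML

with open('tacotron2.yml', encoding='utf-8') as f:   # hypothetical path
    tts_config = yaml.safe_load(f)
with open('melgan.yml', encoding='utf-8') as f:      # hypothetical path
    melgan_config = yaml.safe_load(f)

model = TTSmodel(config=tts_config, vocoder_config=melgan_config)
model.load_model(training=False)       # builds the models and loads the newest .h5 checkpoints
wav = model.synthesize('你好', 'speaker0')  # 1-D float waveform (vocoder or Griffin-Lim output)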
Code Example #3
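# Assumed imports; project module paths are not shown in the original:
#   import os, random
#   import numpy as np
#   import tensorflow as tf
#   from keras_bert import load_trained_model_from_checkpoint, load_vocabulary, Tokenizer  # assumption
#   from jieba.posseg import lcut  # assumption: a segmenter whose items expose a .word attribute
#   from <project> import SpeechFeaturizer, TextFeaturizer, Augmentation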
class MultiTask_DataLoader:

    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.text4_config = config_dict['decoder4_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.token4_featurizer = TextFeaturizer(self.text4_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.LAS = True
        self.steps = 0

        self.init_bert(config_dict)

    def load_state(self, outdir):
        try:
            self.pick_index = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
            self.epochs = 1 + int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('state file not found')
        except Exception:
            print('load state failed, using init state')

    def save_state(self, outdir):
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.pick_index))
    def load_bert(self, config, checkpoint):
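        # Load a frozen pretrained BERT encoder (keras-bert style loader); it is used only for feature extraction.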
        model = load_trained_model_from_checkpoint(config, checkpoint, trainable=False, seq_len=None)
        return model

    def init_bert(self,config):
        bert_config = config['bert']['config_json']
        bert_checkpoint = config['bert']['bert_ckpt']
        bert_vocab = config['bert']['bert_vocab']
        bert_vocabs = load_vocabulary(bert_vocab)
        self.bert_token = Tokenizer(bert_vocabs)
        self.bert = self.load_bert(bert_config, bert_checkpoint)

    def bert_decode(self, x):
        tokens, segs = [], []

        for i in x:
            t, s = self.bert_token.encode(''.join(i))
            tokens.append(t)
            segs.append(s)
        return tokens, segs
    def get_bert_feature(self, bert_t, bert_s):
        f = []
        for t, s in zip(bert_t, bert_s):
            t = np.expand_dims(np.array(t), 0)
            s = np.expand_dims(np.array(s), 0)
            feature = self.bert.predict([t, s])
            f.append(feature[0])
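        # Return the per-token features of the first (only) sentence, dropping the leading [CLS] vector.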
        return f[0][1:]
    def return_data_types(self):
        # x, wavs, bert_features, input_length, four (label, label_length) pairs, guide_matrix
        return (tf.float32, tf.float32, tf.float32, tf.int32, tf.int32, tf.int32, tf.int32,
                tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()

        return (
            tf.TensorShape([None, None, f, c]),   # mel features
            tf.TensorShape([None, None, 1]),      # raw wavs
            tf.TensorShape([None, None, 768]),    # bert features
            tf.TensorShape([None]),               # input_length
            tf.TensorShape([None, None]),         # words_label
            tf.TensorShape([None]),
            tf.TensorShape([None, None]),         # phone_label
            tf.TensorShape([None]),
            tf.TensorShape([None, None]),         # py_label
            tf.TensorShape([None]),
            tf.TensorShape([None, None]),         # txt_label
            tf.TensorShape([None]),
            tf.TensorShape([None, None, None])    # guide matrix
        )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch
    def make_maps(self, config):
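        # Build word/char -> pinyin and word -> phone lookup tables from tab-separated map files.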
        with open(config['map_path']['pinyin'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        self.py_map = {}
        for line in data:
            key, py = line.strip().split('\t')
            self.py_map[key] = py
            if len(py.split(' ')) > 1:
                for i, j in zip(list(key), py.split(' ')):
                    self.py_map[i] = j
        with open(config['map_path']['phone'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        self.phone_map = {}
        phone_map = {}
        for line in data:
            key, py = line.strip().split('\t')
            phone_map[key] = py
        for key in self.py_map.keys():
            key_py = self.py_map[key]
            if len(key) > 1:
                phone = []
                for n in key_py.split(' '):
                    phone += [phone_map[n]]
                self.phone_map[key] = ' '.join(phone)
            else:
                self.phone_map[key] = phone_map[self.py_map[key]]
    def map(self, txt):
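        # Segment txt with lcut (a jieba-style segmenter whose items expose .word), then map each word to pinyin / phone / letter sequences.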
        cut = lcut(txt)
        pys = []
        phones = []
        words = []
        for i in cut:
            word = i.word
            if word in self.py_map.keys():
                py = self.py_map[word]
                phone = self.phone_map[word]
                pys += py.split(' ')
                phones += phone.split(' ')
                words += list(''.join(py.split(' ')))
            else:
                for j in word:
                    pys += [self.py_map[j]]
                    phones += self.phone_map[j].split(' ')
                    words += list(''.join(self.py_map[j]))
        return pys, phones, words

    def augment_data(self, wavs, label, label_length):
        if not self.augment.available():
            return None
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):

            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            if speech_feature.shape[0] // self.speech_config['reduction_factor'] < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])

            max_wav = max(max_wav, len(data))

            wavs_.append(data)

            mels.append(speech_feature)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            label_.append(label[idx])
            label_length_.append(label_length[idx])

        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1],mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))

        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)

        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')

        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')

        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')

        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list,training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list=data
            self.offset=0
    def only_chinese(self, word):
        # True only if every character is a CJK unified ideograph.
        for ch in word:
            if not ('\u4e00' <= ch <= '\u9fff'):
                return False

        return True
    def eval_data_generator(self):
        sample=self.test_list[self.offset:self.offset+self.batch]
        self.offset+=self.batch
        mels = []
        input_length = []

        words_label = []
        words_label_length = []

        phone_label = []
        phone_label_length = []

        py_label = []
        py_label_length = []

        txt_label = []
        txt_label_length = []
        
        bert_features=[]
        wavs = []

        max_wav = 0
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        max_label_txt = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                print('{} load data failed'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * 7:
                continue

            if not self.only_chinese(txt):
                continue

            speech_feature = self.speech_featurizer.extract(data)

            py,phone,word = self.map(txt)
            if len(py) == 0:
                continue
            e_bert_t, e_bert_s = self.bert_decode([txt])
            bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)

            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            txt_text_feature = self.token4_featurizer.extract(list(txt))
            if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature):
                continue
            # update padding targets only for samples that pass every check
            max_input = max(max_input, speech_feature.shape[0])
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_label_txt = max(max_label_txt, len(txt_text_feature))
            max_wav = max(max_wav, len(data))
            mels.append(speech_feature)
            wavs.append(data)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))

            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))

            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))

            txt_label.append(np.array(txt_text_feature))
            txt_label_length.append(len(txt_text_feature))
            bert_features.append(bert_feature)

        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))

        for i in range(len(bert_features)):
            if bert_features[i].shape[0] < max_label_txt:
                padding = np.ones([max_label_txt - len(bert_features[i]), 768]) * -10.
                bert_features[i] = np.vstack((bert_features[i], padding))


        wavs = self.speech_featurizer.pad_signal(wavs, max_wav)
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        txt_label = self.pad(txt_label, max_label_txt)

        x = np.array(mels, 'float32')
        bert_features = np.array(bert_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        txt_label = np.array(txt_label, 'int32')

        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        txt_label_length = np.array(txt_label_length, 'int32')

        wavs = np.array(np.expand_dims(wavs, -1), 'float32')

        return x, wavs, bert_features,input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length, txt_label, txt_label_length
    def pad(self,words_label,max_label_words):
        for i in range(len(words_label)):
            if words_label[i].shape[0] < max_label_words:
                pad = np.ones(max_label_words - words_label[i].shape[0]) * self.token1_featurizer.pad
                words_label[i] = np.hstack((words_label[i], pad))
        return words_label
    def GuidedAttention(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)

        return att_targets.astype('float32')
    def generate(self, train=True):

        if train:
            batch = self.batch if self.augment.available() else self.batch * 2
            indexs = np.argsort(self.pick_index)[:batch]
            indexs = random.sample(indexs.tolist(), batch // 2)
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, self.batch)

        mels = []
        input_length = []

        words_label = []
        words_label_length = []

        phone_label = []
        phone_label_length = []

        py_label = []
        py_label_length = []

        txt_label = []
        txt_label_length = []

        bert_features = []
        wavs = []

        max_wav = 0
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        max_label_txt = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                print('{} load data failed'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * 7:
                continue

            if not self.only_chinese(txt):
                continue

            speech_feature = self.speech_featurizer.extract(data)


            py, phone, word = self.map(txt)
            if len(py) == 0 or len(phone) == 0 or len(word) == 0:
                continue
            e_bert_t, e_bert_s = self.bert_decode([txt])
            bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)

            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            txt_text_feature = self.token4_featurizer.extract(list(txt))

            if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature) or \
                    speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(word_text_feature) or \
                    speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(phone_text_feature):
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_label_txt = max(max_label_txt, len(txt_text_feature))

            max_wav = max(max_wav, len(data))
            mels.append(speech_feature)
            wavs.append(data)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))

            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))

            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))

            txt_label.append(np.array(txt_text_feature))
            txt_label_length.append(len(txt_text_feature))
            bert_features.append(bert_feature)


        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except Exception:
                    print('load data failed')
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * 7:
                    continue

                if not self.only_chinese(txt):
                    continue
                data = self.augment.process(data)
                speech_feature = self.speech_featurizer.extract(data)


                py, phone, word = self.map(txt)
                if len(py) == 0 or len(phone) == 0 or len(word) == 0:
                    continue
                e_bert_t, e_bert_s = self.bert_decode([txt])
                bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)

                word_text_feature = self.token1_featurizer.extract(word)
                phone_text_feature = self.token2_featurizer.extract(phone)
                py_text_feature = self.token3_featurizer.extract(py)
                txt_text_feature = self.token4_featurizer.extract(list(txt))



                if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature) or \
                        speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(word_text_feature) or \
                        speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(phone_text_feature):
                    continue
                max_input = max(max_input, speech_feature.shape[0])
                max_wav = max(max_wav, len(data))
                max_label_words = max(max_label_words, len(word_text_feature))
                max_label_phone = max(max_label_phone, len(phone_text_feature))
                max_label_py = max(max_label_py, len(py_text_feature))
                max_label_txt = max(max_label_txt, len(txt_text_feature))
                mels.append(speech_feature)
                wavs.append(data)
                input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                words_label.append(np.array(word_text_feature))
                words_label_length.append(len(word_text_feature))

                phone_label.append(np.array(phone_text_feature))
                phone_label_length.append(len(phone_text_feature))

                py_label.append(np.array(py_text_feature))
                py_label_length.append(len(py_text_feature))

                txt_label.append(np.array(txt_text_feature))
                txt_label_length.append(len(txt_text_feature))
                bert_features.append(bert_feature)

        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        for i in range(len(bert_features)):
            if bert_features[i].shape[0] < max_label_txt:
                padding = np.ones([max_label_txt - len(bert_features[i]), 768]) * -10.
                bert_features[i] = np.vstack((bert_features[i], padding))

        wavs = self.speech_featurizer.pad_signal(wavs, max_wav)
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        txt_label = self.pad(txt_label, max_label_txt)

        x = np.array(mels, 'float32')
        bert_features = np.array(bert_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        txt_label = np.array(txt_label, 'int32')

        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        txt_label_length = np.array(txt_label_length, 'int32')

        wavs = np.array(np.expand_dims(wavs, -1), 'float32')

        return x, wavs, bert_features,input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length, txt_label, txt_label_length
    def generator(self, train=True):
        while True:
            (x, wavs, bert_feature, input_length, words_label, words_label_length, phone_label,
             phone_label_length, py_label, py_label_length, txt_label, txt_label_length) = self.generate(train)

            guide_matrix = self.guided_attention(input_length, txt_label_length, np.max(input_length),
                                                 txt_label_length.max())
            yield (x, wavs, bert_feature, input_length, words_label, words_label_length, phone_label,
                   phone_label_length, py_label, py_label_length, txt_label, txt_label_length, guide_matrix)
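
For intuition, the guided-attention target used by guided_attention() above can be reproduced standalone. Below is a self-contained numpy sketch of the same formula; the function name guided_attention_matrix is ours, not the project's:

# Vectorized illustration of the guided-attention mask formula used above.
import numpy as np

def guided_attention_matrix(N, T, g=0.2):
    # W[n, t] is near 0 on the diagonal n/N == t/T and approaches 1 away from it,
    # so it can penalize attention weights that stray from a monotonic alignment.
    n = np.arange(N).reshape(-1, 1) / float(N)
    t = np.arange(T).reshape(1, -1) / float(T)
    return 1.0 - np.exp(-((t - n) ** 2) / (2 * g * g))

print(guided_attention_matrix(4, 6).round(2))  # small near the diagonal, ~1 in the corners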
Code Example #4
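# Assumed imports; project module paths are not shown in the original:
#   import os
#   import logging
#   import numpy as np
#   import tensorflow as tf
#   import pypinyin
#   from <project> import SpeechFeaturizer, TextFeaturizer, Augmentation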
class MultiTask_DataLoader:
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config'][
            'batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.make_file_list(
            self.speech_config['train_list']
            if training else self.speech_config['eval_list'], training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.steps = 0

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))

            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'saved train list does not match current train list, data loader uses init state'
                )
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('state file not found, using init state')
        except Exception:
            logging.info('load state failed, using init state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):

        return (tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32,
                tf.int32, tf.int32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()

        return (
            tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
            else tf.TensorShape([None, None, f, c]),
            tf.TensorShape([None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None]),
        )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def make_maps(self, config):
        with open(config['map_path']['phone'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        phone_map = {}
        for line in data:
            try:
                key, phone = line.strip().split('\t')
            except ValueError:
                # skip malformed lines without a tab separator
                continue
            phone_map[key] = phone.split(' ')
        self.phone_map = phone_map

    def map(self, txt):
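        # pypinyin style 8 corresponds to Style.TONE3 (tone digit appended, e.g. 'zhong1');
        # neutral_tone_with_five=True renders the neutral tone as tone 5.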
        pys = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True)

        pys = [i[0] for i in pys]
        phones = []

        for i in pys:
            phones += self.phone_map[i]
        words = ''.join(pys)
        words = list(words)
        return pys, phones, words

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('train list : {} test list:{}'.format(
                len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0
            logging.info('eval list: {}'.format(len(self.test_list)))

    def only_chinese(self, word):
        # Keep only CJK unified ideographs; drop every other character.
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch

        return txt

    def check_valid(self, txt, vocab_list):
        # Returns True when every token is in vocab, False for empty input,
        # otherwise the first out-of-vocabulary token (for logging).
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []

        words_label = []
        words_label_length = []

        phone_label = []
        phone_label_length = []

        py_label = []
        py_label_length = []

        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0

        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                logging.info('{} load data failed,skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}),skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                    self.speech_config['reduction_factor'] *
                    (self.speech_featurizer.sample_rate / 1000) *
                    self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] //
                             self.speech_config['reduction_factor'])

            py, phone, word = self.map(txt)
            if len(py) == 0:
                continue

            if self.check_valid(word, self.token1_featurizer.vocab_array) is not True:
                logging.info(' {} txt word {} not all in tokens,skip'.format(txt, word))
                continue

            if self.check_valid(phone, self.token2_featurizer.vocab_array) is not True:
                logging.info(' {} txt phone {} not all in tokens,skip'.format(txt, phone))
                continue

            if self.check_valid(py, self.token3_featurizer.vocab_array) is not True:
                logging.info(' {} txt pinyin {} not all in tokens,skip'.format(txt, py))
                continue
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)

            if in_len < len(word_text_feature):
                continue

            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_input = max(max_input, len(speech_feature))

            speech_features.append(speech_feature)
            input_length.append(in_len)
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))

            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))

            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))

        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(
                speech_features, max_input)

        else:
            for i in range(len(speech_features)):

                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([
                        max_input - speech_features[i].shape[0],
                        speech_features[i].shape[1],
                        speech_features[i].shape[2]
                    ]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))

        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        speech_features = np.array(speech_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')

        return speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length

    def pad(self, words_label, max_label_words):
        for i in range(len(words_label)):
            if words_label[i].shape[0] < max_label_words:
                pad = np.ones(max_label_words - words_label[i].shape[0]) * self.token1_featurizer.pad
                words_label[i] = np.hstack((words_label[i], pad))
        return words_label

    def GuidedAttention(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N))**2 /
                                     (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape,
                         mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)

        return att_targets.astype('float32')

    def generate(self, train=True):
        sample = []
        speech_features = []
        input_length = []

        words_label = []
        words_label_length = []

        phone_label = []
        phone_label_length = []

        py_label = []
        py_label_length = []

        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        if train:
            batch = self.batch // 2 if self.augment.available() else self.batch
        else:
            batch = self.batch

        # scan up to 10x the batch size so that samples filtered out below
        # (bad audio, OOV tokens, length mismatches) still leave a full batch
        for i in range(batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0

            wp, txt = line.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                logging.info('{} load data failed,skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}),skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                    self.speech_config['reduction_factor'] *
                    (self.speech_featurizer.sample_rate / 1000) *
                    self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] //
                             self.speech_config['reduction_factor'])

            py, phone, word = self.map(txt)
            if len(py) == 0:
                logging.info('py length {}, skip'.format(len(py)))
                continue

            bad = self.check_valid(word, self.token1_featurizer.vocab_array)
            if bad is not True:
                logging.info('{} txt word {} not all in tokens, skip'.format(
                    txt, bad))
                continue
            bad = self.check_valid(phone, self.token2_featurizer.vocab_array)
            if bad is not True:
                logging.info('{} txt phone {} not all in tokens, skip'.format(
                    txt, bad))
                continue
            bad = self.check_valid(py, self.token3_featurizer.vocab_array)
            if bad is not True:
                logging.info('{} txt py {} not all in tokens, skip'.format(
                    txt, bad))
                continue
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)

            if in_len < len(word_text_feature):
                continue

            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_input = max(max_input, len(speech_feature))

            speech_features.append(speech_feature)
            input_length.append(in_len)
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))

            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))

            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
            sample.append(line)
            if len(sample) == batch:
                break
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except Exception:
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * \
                        self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] //
                                 self.speech_config['reduction_factor'])

                py, phone, word = self.map(txt)
                if len(py) == 0:
                    continue

                word_text_feature = self.token1_featurizer.extract(word)
                phone_text_feature = self.token2_featurizer.extract(phone)
                py_text_feature = self.token3_featurizer.extract(py)

                if in_len < len(word_text_feature):
                    continue

                max_label_words = max(max_label_words, len(word_text_feature))
                max_label_phone = max(max_label_phone, len(phone_text_feature))
                max_label_py = max(max_label_py, len(py_text_feature))
                max_input = max(max_input, len(speech_feature))

                speech_features.append(speech_feature)
                input_length.append(in_len)
                words_label.append(np.array(word_text_feature))
                words_label_length.append(len(word_text_feature))

                phone_label.append(np.array(phone_text_feature))
                phone_label_length.append(len(phone_text_feature))

                py_label.append(np.array(py_text_feature))
                py_label_length.append(len(py_text_feature))

        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(
                speech_features, max_input)

        else:
            for i in range(len(speech_features)):

                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([
                        max_input - speech_features[i].shape[0],
                        speech_features[i].shape[1],
                        speech_features[i].shape[2]
                    ]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))

        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        speech_features = np.array(speech_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')

        return speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length

    def generator(self, train=True):
        while True:
            speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length = self.generate(
                train)

            yield speech_features, input_length, words_label, words_label_length, phone_label, phone_label_length, py_label, py_label_length
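A minimal usage sketch (an addition, not part of the original example): the generator's 8-tuple can be fed to tf.data via from_generator. The instance name dg is hypothetical and stands for a configured instance of the loader class above:

import tensorflow as tf

# Dtypes mirror the yielded tuple: features are float32, all lengths and
# label ids are int32.
dataset = tf.data.Dataset.from_generator(
    lambda: dg.generator(train=True),
    output_types=(tf.float32, tf.int32, tf.int32, tf.int32,
                  tf.int32, tf.int32, tf.int32, tf.int32))
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)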
Code Example #5
class TacotronDataLoader():
    def __init__(self, config, training=True):
        self.speech_featurizer = SpeechFeaturizer(config)
        self.text_featurizer = TextFeaturizer(config)
        self.config = config
        self.batch = config['batch_size']
        self.make_file_list(self.config['train_list']
                            if training else self.config['eval_list'],
                            training=training)
        self.min_value = -self.config['max_abs_value']
        self._target_pad = -(self.config['max_abs_value'] + 0.1)
        self._token_pad = 1.
        self.epochs = 1
        self.steps = 0

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.95)]
            self.test_list = data[int(num * 0.95):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('load train list {} test list {}'.format(
                len(self.train_list), len(self.test_list)))
            if self.config['balance_spk_utts']:
                spk_utt = {}
                for line in self.train_list:
                    a, b, c = line.strip().split('\t')
                    if c in spk_utt:
                        spk_utt[c].append(line)
                    else:
                        spk_utt[c] = [line]
                maxlen = max([len(spk_utt[i]) for i in spk_utt])
                self.train_list = []
                for key in spk_utt:
                    datas = spk_utt[key]
                    if len(datas) < maxlen:
                        factor = int(np.rint(maxlen / len(datas)))
                    else:
                        factor = 1
                    datas *= factor
                    self.train_list += datas
                np.random.shuffle(self.train_list)
                logging.info('balance spk utts: train list {}'.format(
                    len(self.train_list)))
        else:
            self.test_list = data
            self.offset = 0

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def return_data_types(self):
        # charactor, char_length, mel, mel_length, stop_gts, speaker, guided_attention
        return (tf.int32, tf.int32, tf.float32, tf.int32, tf.float32, tf.int32,
                tf.float32)

    def return_data_shape(self):
        # charactor, char_length, mel, mel_length, stop_gts, speaker, guided_attention
        return (
            tf.TensorShape([None, None]),
            tf.TensorShape([None]),
            tf.TensorShape([None, None, self.config['num_mels']]),
            tf.TensorShape([None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None, None]),
        )

    def GuidedAttention(self, N, T, g=0.5):
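        # Same guided-attention weight construction as in the loader above;
        # note the default here is g=0.5, while make_Att_targets below calls
        # it with g=0.1.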
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N))**2 /
                                     (2 * g * g))
        return W

    def make_Att_targets(self, input_length, targets_length, inputs_shape,
                         mel_target_shape):
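        # Builds one guided-attention target per utterance plus a 0/1 mask
        # marking the valid (non-padded) steps; mel lengths are first reduced
        # by outputs_per_step to match the decoder's frame rate.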
        att_targets = []
        att_mask = []
        mel_target_shape //= self.config['outputs_per_step']
        for i, j in zip(input_length, targets_length):
            step = int(j / self.config['outputs_per_step'])
            pad = np.zeros([inputs_shape, mel_target_shape])
            pad[i:, :step] = 1
            maskpad = np.zeros([inputs_shape, mel_target_shape])
            maskpad[:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.1)

            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
            att_mask.append(maskpad)
        att_targets = np.array(att_targets)
        att_mask = np.array(att_mask)
        return att_targets.astype('float32'), att_mask.astype('float32')

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'saved train list does not match the newly loaded train '
                    'list, data loader uses initial state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('state file not found, using initial state')
        except Exception:
            logging.info('load state failed, using initial state')

    def save_state(self, outdir):

        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def extractor(self):
        data = self.train_list + self.test_list
        while self.train_offset < len(data):
            charactor, char_length, mel, mel_length, speaker = [], [], [], [], []
            audios = []
            names = []
            for i in range(self.batch * 10):

                line = data[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(data) - 1:
                    break

                wav_path, text, spkid = line.strip().split('\t')
                try:
                    wav = self.speech_featurizer.load_wav(wav_path)
                    target_mel = self.speech_featurizer.melspectrogram(wav)
                except Exception:
                    logging.info('{} load data failed, skip'.format(wav_path))
                    continue
                try:
                    text_tokens = self.text_featurizer.extract(text)
                except Exception:
                    logging.info('{} to token failed, skip'.format(text))
                    continue
                try:
                    speaker_id = self.text_featurizer.spker_map[spkid]
                except KeyError:
                    logging.info('{} not in spk map, skip'.format(spkid))
                    continue
                audios.append(wav)
                names.append(os.path.split(wav_path)[-1].replace('.wav', ''))

                charactor.append(np.array(text_tokens))
                char_length.append(len(text_tokens))
                mel.append(target_mel)
                mel_length.append(len(target_mel))

                speaker.append([speaker_id])
                if len(charactor) == self.batch:
                    break
            output_per_step = self.config['outputs_per_step']
            charactor = self._prepare_inputs(charactor)
            char_length = np.array(char_length, 'int32')
            mel = self._prepare_targets(mel, output_per_step)
            mel_length = np.array(mel_length, 'int32')

            speaker = np.array(speaker, 'int32')
            T = mel.shape[1] * self.speech_featurizer.hop_size
            audios = tf.keras.preprocessing.sequence.pad_sequences(
                audios, T, 'float32', 'post', 'post')
            yield charactor, char_length, mel, mel_length, speaker, audios, names

    def generate(self, train=True):
        charactor, char_length, mel, mel_length, stop_gts, speaker = [], [], [], [], [], []

        for i in range(self.batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            wav_path, text, spkid = line.strip().split('\t')
            try:
                wav = self.speech_featurizer.load_wav(wav_path)
                target_mel = self.speech_featurizer.melspectrogram(wav)
            except Exception:
                logging.info('{} load data failed, skip'.format(wav_path))
                continue
            try:
                text_tokens = self.text_featurizer.extract(text)
            except Exception:
                logging.info('{} to token failed, skip'.format(text))
                continue
            try:
                speaker_id = self.text_featurizer.spker_map[spkid]
            except KeyError:
                logging.info('{} not in spk map, skip'.format(spkid))
                continue
            token_target = np.asarray([0.] * (len(target_mel) - 1))
            charactor.append(np.array(text_tokens))
            char_length.append(len(text_tokens))
            mel.append(target_mel)
            mel_length.append(len(target_mel))
            stop_gts.append(token_target)
            speaker.append([speaker_id])
            if len(charactor) == self.batch:
                break
        output_per_step = self.config['outputs_per_step']
        charactor = self._prepare_inputs(charactor)
        char_length = np.array(char_length, 'int32')
        mel = self._prepare_targets(mel, output_per_step)

        mel_length = np.array(mel_length, 'int32')
        stop_gts = self._prepare_token_targets(stop_gts, output_per_step)
        speaker = np.array(speaker, 'int32')

        return charactor, char_length, mel, mel_length, stop_gts, speaker

    def _prepare_inputs(self, inputs):
        max_len = max([len(x) for x in inputs])
        return np.stack([self._pad_input(x, max_len) for x in inputs])

    def _prepare_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets])
        return np.stack([
            self._pad_target(t, self._round_up(max_len, alignment))
            for t in targets
        ])

    def _prepare_token_targets(self, targets, alignment):
        max_len = max([len(t) for t in targets]) + 1
        return np.stack([
            self._pad_token_target(t, self._round_up(max_len, alignment))
            for t in targets
        ])

    def _pad_input(self, x, length):
        return np.pad(x, (0, length - x.shape[0]),
                      mode='constant',
                      constant_values=self.text_featurizer.pad)

    def _pad_target(self, t, length):
        return np.pad(t, [(0, length - t.shape[0]), (0, 0)],
                      mode='constant',
                      constant_values=self._target_pad)

    def _pad_token_target(self, t, length):
        return np.pad(t, (0, length - t.shape[0]),
                      mode='constant',
                      constant_values=self._token_pad)

    def _round_down(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x - remainder

    def _round_up(self, x, multiple):
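        # Round x up to the nearest multiple, e.g. _round_up(10, 4) -> 12;
        # used to pad mel/stop-token lengths to a multiple of outputs_per_step.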
        remainder = x % multiple
        return x if remainder == 0 else x + multiple - remainder

    def generator(self, train=True):
        while True:
            charactor, char_length, mel, mel_length, stop_gts, speaker = self.generate(
                train)
            if charactor.shape[0] == 0:
                logging.info('load data length zero, continue')
                continue
            guide_matrix, _ = self.make_Att_targets(char_length, mel_length,
                                                    np.max(char_length),
                                                    np.max(mel_length))
            yield (charactor.astype('int32'), char_length.astype('int32'),
                   mel.astype('float32'), mel_length.astype('int32'),
                   stop_gts.astype('float32'), speaker.astype('int32'),
                   guide_matrix.astype('float32'))
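A minimal usage sketch (an addition): TacotronDataLoader exposes matching dtype and shape helpers, so the generator wraps directly into tf.data; the config dict here is assumed to carry the keys referenced in __init__:

import tensorflow as tf

loader = TacotronDataLoader(config, training=True)  # `config` is an assumption
dataset = tf.data.Dataset.from_generator(
    lambda: loader.generator(train=True),
    output_types=loader.return_data_types(),
    output_shapes=loader.return_data_shape())
steps_per_epoch = loader.get_per_epoch_steps()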
Code Example #6
class AM_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']

        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.streaming = self.speech_config['streaming']
        self.chunk = self.speech_config['sample_rate'] * self.speech_config[
            'streaming_bucket']
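        # self.chunk = raw samples per streaming segment
        # (streaming_bucket seconds of audio at sample_rate).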
        self.batch = config_dict['learning_config']['running_config'][
            'batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(
            self.speech_config['train_list']
            if training else self.speech_config['eval_list'], training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info(
                    'saved train list does not match the newly loaded train '
                    'list, data loader uses initial state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('state file not found, using initial state')
        except Exception:
            logging.info('load state failed, using initial state')

    def save_state(self, outdir):

        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        if self.LAS:
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        else:
            return (tf.float32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.LAS:
            return (
                tf.TensorShape([None, None, 1])
                if self.speech_config['use_mel_layer'] else
                tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None, None]),
            )
        else:
            return (
                tf.TensorShape([None, None, 1])
                if self.speech_config['use_mel_layer'] else
                tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None]),
            )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def init_text_to_vocab(self):
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
        })

        def text_to_vocab_func(txt):
            pins = pypinyin.pinyin(txt)
            pins = [i[0] for i in pins]
            return pins

        self.text_to_vocab = text_to_vocab_func

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('load train list {} test list {}'.format(
                len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0

    def only_chinese(self, word):
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch
        return txt

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            txt = txt.replace(' ', '')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                logging.info('{} wav too short < 25ms, skip'.format(wp))
                continue
            elif len(data) > self.speech_featurizer.sample_rate * \
                    self.speech_config['wav_max_duration']:
                logging.info(
                    '{} duration out of wav_max_duration({}), skip'.format(
                        wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                if not self.streaming:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
                else:
                    speech_feature = data
                    speech_feature = np.expand_dims(speech_feature, -1)
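                    # Streaming path: keep raw samples and count model frames
                    # as (chunks in the signal) * (frames per chunk), with
                    # both divisions rounded up.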
                    reduce = self.speech_config['reduction_factor'] * (
                        self.speech_featurizer.sample_rate /
                        1000) * self.speech_config['stride_ms']
                    in_len = len(speech_feature) // self.chunk
                    if len(speech_feature) % self.chunk != 0:
                        in_len += 1
                    chunk_times = self.chunk // reduce
                    if self.chunk % reduce != 0:
                        chunk_times += 1
                    in_len *= chunk_times

            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] //
                             self.speech_config['reduction_factor'])

            py = self.text_to_vocab(txt)
            bad = self.check_valid(py, self.text_featurizer.vocab_array)
            if bad is not True:
                logging.info('{} txt pinyin {} not all in tokens, skip'.format(
                    txt, bad))
                continue
            text_feature = self.text_featurizer.extract(py)

            if in_len < len(text_feature):
                logging.info(
                    '{} feature length < pinyin length, skip'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))

        if self.speech_config['use_mel_layer']:
            if self.streaming:
                max_input = max_input // self.chunk * self.chunk + self.chunk
            speech_features = self.speech_featurizer.pad_signal(
                speech_features, max_input)

        else:
            for i in range(len(speech_features)):

                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([
                        max_input - speech_features[i].shape[0],
                        speech_features[i].shape[1],
                        speech_features[i].shape[2]
                    ]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))

        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 -
                              y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))

        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')

        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')

        return x, input_length, y1, label_length1

    def check_valid(self, txt, vocab_list):
        # Returns True if every token is in vocab_list, False for empty
        # input, otherwise the first out-of-vocabulary token.
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N))**2 /
                                     (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape,
                         mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)

        return att_targets.astype('float32')

    def generate(self, train=True):

        sample = []
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []

        max_input = 0
        max_label1 = 0
        if train:
            batch = self.batch // 2 if self.augment.available() else self.batch
        else:
            batch = self.batch

        for i in range(batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            wp, txt = line.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except Exception:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * \
                    self.speech_config['wav_max_duration']:
                logging.info(
                    '{} duration out of wav_max_duration({}), skip'.format(
                        wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                if not self.streaming:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
                else:
                    speech_feature = data
                    speech_feature = np.expand_dims(speech_feature, -1)
                    reduce = self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) * \
                             self.speech_config['stride_ms']
                    in_len = len(speech_feature) // self.chunk
                    if len(speech_feature) % self.chunk != 0:
                        in_len += 1
                    chunk_times = self.chunk // reduce
                    if self.chunk % reduce != 0:
                        chunk_times += 1
                    in_len *= chunk_times
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] //
                             self.speech_config['reduction_factor'])

            py = self.text_to_vocab(txt)
            bad = self.check_valid(py, self.text_featurizer.vocab_array)
            if bad is not True:
                logging.info(
                    '{} txt pinyin {} not all in tokens, skip'.format(
                        txt, bad))
                continue
            text_feature = self.text_featurizer.extract(py)

            if in_len < len(text_feature):
                logging.info(
                    '{} feature length < pinyin length, skip'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
            sample.append(line)
            if len(sample) == batch:
                break
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except Exception:
                    continue
                if len(data) < 400:
                    logging.info('{} wav too short < 25ms, skip'.format(wp))
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * \
                        self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    if not self.streaming:
                        speech_feature = data / np.abs(data).max()
                        speech_feature = np.expand_dims(speech_feature, -1)
                        in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                    else:
                        speech_feature = data
                        speech_feature = np.expand_dims(speech_feature, -1)
                        reduce = self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) * \
                                 self.speech_config['stride_ms']
                        in_len = len(speech_feature) // self.chunk
                        if len(speech_feature) % self.chunk != 0:
                            in_len += 1
                        chunk_times = self.chunk // reduce
                        if self.chunk % reduce != 0:
                            chunk_times += 1
                        in_len *= chunk_times
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] //
                                 self.speech_config['reduction_factor'])

                py = self.text_to_vocab(txt)
                # check_valid returns the offending token (truthy) on failure,
                # so compare against True instead of negating the result.
                if self.check_valid(
                        py, self.text_featurizer.vocab_array) is not True:
                    continue

                text_feature = self.text_featurizer.extract(py)

                if in_len < len(text_feature):
                    continue
                max_input = max(max_input, len(speech_feature))
                max_label1 = max(max_label1, len(text_feature))
                speech_features.append(speech_feature)

                input_length.append(in_len)
                y1.append(np.array(text_feature))
                label_length1.append(len(text_feature))

        if self.speech_config['use_mel_layer']:
            if self.streaming:
                reduce = self.speech_config['reduction_factor'] * (self.speech_featurizer.sample_rate / 1000) * \
                         self.speech_config['stride_ms']
                max_input = max_input // self.chunk * self.chunk + self.chunk
                max_in_len = max_input // self.chunk
                chunk_times = self.chunk // reduce
                if self.chunk % reduce != 0:
                    chunk_times += 1
                max_in_len *= chunk_times
                input_length = np.clip(input_length, 0, max_in_len)
            speech_features = self.speech_featurizer.pad_signal(
                speech_features, max_input)

        else:
            for i in range(len(speech_features)):

                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([
                        max_input - speech_features[i].shape[0],
                        speech_features[i].shape[1],
                        speech_features[i].shape[2]
                    ]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))

        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 -
                              y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))

        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')

        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')

        return x, input_length, y1, label_length1

    def generator(self, train=True):
        while True:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                logging.info('load data length zero, continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length,
                                                     label_length,
                                                     np.max(input_length),
                                                     label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
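A minimal usage sketch (an addition): with self.LAS left False, generate() returns the CTC-style 4-tuple (features, input lengths, labels, label lengths), and the class's own helpers provide matching dtypes and shapes; config_dict is assumed to follow the structure used in __init__:

import tensorflow as tf

dg = AM_DataLoader(config_dict, training=True)  # `config_dict` is an assumption
dataset = tf.data.Dataset.from_generator(
    lambda: dg.generator(train=True),
    output_types=dg.return_data_types(),
    output_shapes=dg.return_data_shape())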