import logging
import os
import random

import numpy as np
import pypinyin
import tensorflow as tf
# `lcut` is used below as `i.word`, which matches jieba's POS-tagging API.
from jieba.posseg import lcut
from keras_bert import Tokenizer, load_trained_model_from_checkpoint, load_vocabulary

# NOTE: the three project-local import paths below are assumptions; adjust
# them to this repository's actual module layout.
from utils.speech_featurizers import SpeechFeaturizer
from utils.text_featurizers import TextFeaturizer
from augmentations.augments import Augmentation
class AM_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],
                            training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0

    def load_state(self, outdir):
        try:
            self.pick_index = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
            self.epochs = 1 + int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('not found state file')
        except:
            print('load state failed, use init state')

    def save_state(self, outdir):
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.pick_index))

    def return_data_types(self):
        # The LAS generator yields (x, input_length, labels, label_length, guide_matrix).
        if self.LAS:
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        else:
            return (tf.float32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.LAS:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None, None]),
            )
        else:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, ]),
            )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def init_text_to_vocab(self):
        # Pin down pinyin readings for phrases pypinyin gets wrong by default.
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']],
        })

        def text_to_vocab_func(txt):
            pins = pypinyin.pinyin(txt)
            pins = [i[0] for i in pins]
            return pins

        self.text_to_vocab = text_to_vocab_func

    def augment_data(self, wavs, label, label_length):
        if not self.augment.available():
            return None
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):
            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            if speech_feature.shape[0] // self.speech_config['reduction_factor'] < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_wav = max(max_wav, len(data))
            wavs_.append(data)
            mels.append(speech_feature)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            label_.append(label[idx])
            label_length_.append(label_length[idx])
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)
        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')
        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')
        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list = data
            self.offset = 0

    def only_chinese(self, word):
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch
        return txt

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            txt = txt.replace(' ', '')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                print('{} load data failed'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                print('{} duration out of wav_max_duration({})'.format(wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            max_input = max(max_input, speech_feature.shape[0])
            py = self.text_to_vocab(txt)
            if not self.check_valid(py, self.text_featurizer.vocab_array):
                print('{} txt pinyin {} not all in tokens, continue'.format(txt, py))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                print('{} feature length < pinyin length, continue'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def check_valid(self, txt, vocab_list):
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return False
        return True

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        if train:
            # Pick the least-used lines first (pick_index counts how often each
            # training line has been sampled), then draw half of them at random.
            batch = self.batch if self.augment.available() else self.batch * 2
            indexs = np.argsort(self.pick_index)[:batch]
            indexs = random.sample(indexs.tolist(), batch // 2)
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, self.batch)
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                print('{} load data failed'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                print('{} duration out of wav_max_duration({})'.format(wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            py = self.text_to_vocab(txt)
            if not self.check_valid(py, self.text_featurizer.vocab_array):
                print('{} txt pinyin {} not all in tokens, continue'.format(txt, py))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                print('{} feature length < pinyin length, continue'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
        if train and self.augment.available():
            # Second pass over the same sample with waveform augmentation applied.
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except:
                    print('load data failed')
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                py = self.text_to_vocab(txt)
                if not self.check_valid(py, self.text_featurizer.vocab_array):
                    continue
                text_feature = self.text_featurizer.extract(py)
                if in_len < len(text_feature):
                    continue
                max_input = max(max_input, len(speech_feature))
                max_label1 = max(max_label1, len(text_feature))
                speech_features.append(speech_feature)
                input_length.append(in_len)
                y1.append(np.array(text_feature))
                label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def generator(self, train=True):
        while 1:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                print('load data length zero, continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length, label_length,
                                                     np.max(input_length), label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
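
# Usage sketch, not part of the original loader: how a loader like the one
# above can be wired into tf.data via return_data_types()/return_data_shape().
# `config_dict` must have the nested structure read in __init__; the function
# name and the way the dataset is consumed are illustrative assumptions.
def _am_dataset_example(config_dict):
    loader = AM_DataLoader(config_dict, training=True)
    dataset = tf.data.Dataset.from_generator(
        loader.generator,
        args=(True,),  # forwarded to generator(train=True)
        output_types=loader.return_data_types(),
        output_shapes=loader.return_data_shape(),
    )
    # Each element is (x, input_length, labels, label_length),
    # plus a guide matrix when loader.LAS is True.
    return dataset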
class MultiTask_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.text4_config = config_dict['decoder4_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.token4_featurizer = TextFeaturizer(self.text4_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],
                            training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.LAS = True
        self.steps = 0
        self.init_bert(config_dict)

    def load_state(self, outdir):
        try:
            self.pick_index = np.load(os.path.join(outdir, 'dg_state.npy')).flatten().tolist()
            self.epochs = 1 + int(np.mean(self.pick_index))
        except FileNotFoundError:
            print('not found state file')
        except:
            print('load state failed, use init state')

    def save_state(self, outdir):
        np.save(os.path.join(outdir, 'dg_state.npy'), np.array(self.pick_index))

    def load_bert(self, config, checkpoint):
        model = load_trained_model_from_checkpoint(config, checkpoint, trainable=False, seq_len=None)
        return model

    def init_bert(self, config):
        bert_config = config['bert']['config_json']
        bert_checkpoint = config['bert']['bert_ckpt']
        bert_vocab = config['bert']['bert_vocab']
        bert_vocabs = load_vocabulary(bert_vocab)
        self.bert_token = Tokenizer(bert_vocabs)
        self.bert = self.load_bert(bert_config, bert_checkpoint)

    def bert_decode(self, x):
        tokens, segs = [], []
        for i in x:
            t, s = self.bert_token.encode(''.join(i))
            tokens.append(t)
            segs.append(s)
        return tokens, segs

    def get_bert_feature(self, bert_t, bert_s):
        f = []
        for t, s in zip(bert_t, bert_s):
            t = np.expand_dims(np.array(t), 0)
            s = np.expand_dims(np.array(s), 0)
            feature = self.bert.predict([t, s])
            f.append(feature[0])
        # Drop the leading [CLS] position.
        return f[0][1:]

    def return_data_types(self):
        return (tf.float32, tf.float32, tf.float32, tf.int32, tf.int32, tf.int32, tf.int32,
                tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.float32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        return (
            tf.TensorShape([None, None, f, c]),
            tf.TensorShape([None, None, 1]),
            tf.TensorShape([None, None, 768]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None, None]),
        )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def make_maps(self, config):
        with open(config['map_path']['pinyin'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        self.py_map = {}
        for line in data:
            key, py = line.strip().split('\t')
            self.py_map[key] = py
            if len(py.split(' ')) > 1:
                for i, j in zip(list(key), py.split(' ')):
                    self.py_map[i] = j
        with open(config['map_path']['phone'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        self.phone_map = {}
        phone_map = {}
        for line in data:
            key, py = line.strip().split('\t')
            phone_map[key] = py
        for key in self.py_map.keys():
            key_py = self.py_map[key]
            if len(key) > 1:
                phone = []
                for n in key_py.split(' '):
                    phone += [phone_map[n]]
                self.phone_map[key] = ' '.join(phone)
            else:
                self.phone_map[key] = phone_map[self.py_map[key]]

    def map(self, txt):
        cut = lcut(txt)
        pys = []
        phones = []
        words = []
        for i in cut:
            word = i.word
            if word in self.py_map.keys():
                py = self.py_map[word]
                phone = self.phone_map[word]
                pys += py.split(' ')
                phones += phone.split(' ')
                words += list(''.join(py.split(' ')))
            else:
                for j in word:
                    pys += [self.py_map[j]]
                    phones += self.phone_map[j].split(' ')
                    words += list(''.join(self.py_map[j]))
        return pys, phones, words

    def augment_data(self, wavs, label, label_length):
        if not self.augment.available():
            return None
        mels = []
        input_length = []
        label_ = []
        label_length_ = []
        wavs_ = []
        max_input = 0
        max_wav = 0
        for idx, wav in enumerate(wavs):
            data = self.augment.process(wav.flatten())
            speech_feature = self.speech_featurizer.extract(data)
            if speech_feature.shape[0] // self.speech_config['reduction_factor'] < label_length[idx]:
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_wav = max(max_wav, len(data))
            wavs_.append(data)
            mels.append(speech_feature)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            label_.append(label[idx])
            label_length_.append(label_length[idx])
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        wavs_ = self.speech_featurizer.pad_signal(wavs_, max_wav)
        x = np.array(mels, 'float32')
        label_ = np.array(label_, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length_ = np.array(label_length_, 'int32')
        wavs_ = np.array(np.expand_dims(wavs_, -1), 'float32')
        return x, wavs_, input_length, label_, label_length_

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.pick_index = [0.] * len(self.train_list)
        else:
            self.test_list = data
            self.offset = 0

    def only_chinese(self, word):
        for ch in word:
            if not ('\u4e00' <= ch <= '\u9fff'):
                return False
        return True

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        mels = []
        input_length = []
        words_label = []
        words_label_length = []
        phone_label = []
        phone_label_length = []
        py_label = []
        py_label_length = []
        txt_label = []
        txt_label_length = []
        bert_features = []
        wavs = []
        max_wav = 0
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        max_label_txt = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                print('load data failed')
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * 7:
                continue
            if not self.only_chinese(txt):
                continue
            speech_feature = self.speech_featurizer.extract(data)
            max_input = max(max_input, speech_feature.shape[0])
            py, phone, word = self.map(txt)
            if len(py) == 0:
                continue
            e_bert_t, e_bert_s = self.bert_decode([txt])
            bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            txt_text_feature = self.token4_featurizer.extract(list(txt))
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_label_txt = max(max_label_txt, len(txt_text_feature))
            max_wav = max(max_wav, len(data))
            if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature):
                continue
            mels.append(speech_feature)
            wavs.append(data)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))
            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))
            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
            txt_label.append(np.array(txt_text_feature))
            txt_label_length.append(len(txt_text_feature))
            bert_features.append(bert_feature)
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        for i in range(len(bert_features)):
            if bert_features[i].shape[0] < max_label_txt:
                padding = np.ones([max_label_txt - len(bert_features[i]), 768]) * -10.
                bert_features[i] = np.vstack((bert_features[i], padding))
        wavs = self.speech_featurizer.pad_signal(wavs, max_wav)
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        txt_label = self.pad(txt_label, max_label_txt)
        x = np.array(mels, 'float32')
        bert_features = np.array(bert_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        txt_label = np.array(txt_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        txt_label_length = np.array(txt_label_length, 'int32')
        wavs = np.array(np.expand_dims(wavs, -1), 'float32')
        return x, wavs, bert_features, input_length, words_label, words_label_length, phone_label, \
               phone_label_length, py_label, py_label_length, txt_label, txt_label_length

    def pad(self, words_label, max_label_words):
        for i in range(len(words_label)):
            if words_label[i].shape[0] < max_label_words:
                pad = np.ones(max_label_words - words_label[i].shape[0]) * self.token1_featurizer.pad
                words_label[i] = np.hstack((words_label[i], pad))
        return words_label

    def GuidedAttention(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        if train:
            batch = self.batch if self.augment.available() else self.batch * 2
            indexs = np.argsort(self.pick_index)[:batch]
            indexs = random.sample(indexs.tolist(), batch // 2)
            sample = [self.train_list[i] for i in indexs]
            for i in indexs:
                self.pick_index[int(i)] += 1
            self.epochs = 1 + int(np.mean(self.pick_index))
        else:
            sample = random.sample(self.test_list, self.batch)
        mels = []
        input_length = []
        words_label = []
        words_label_length = []
        phone_label = []
        phone_label_length = []
        py_label = []
        py_label_length = []
        txt_label = []
        txt_label_length = []
        bert_features = []
        wavs = []
        max_wav = 0
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        max_label_txt = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                print('load data failed')
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * 7:
                continue
            if not self.only_chinese(txt):
                continue
            speech_feature = self.speech_featurizer.extract(data)
            py, phone, word = self.map(txt)
            if len(py) == 0 or len(phone) == 0 or len(word) == 0:
                continue
            e_bert_t, e_bert_s = self.bert_decode([txt])
            bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            txt_text_feature = self.token4_featurizer.extract(list(txt))
            if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature) or \
                    speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(word_text_feature) or \
                    speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(phone_text_feature):
                continue
            max_input = max(max_input, speech_feature.shape[0])
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_label_txt = max(max_label_txt, len(txt_text_feature))
            max_wav = max(max_wav, len(data))
            mels.append(speech_feature)
            wavs.append(data)
            input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))
            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))
            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
            txt_label.append(np.array(txt_text_feature))
            txt_label_length.append(len(txt_text_feature))
            bert_features.append(bert_feature)
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except:
                    print('load data failed')
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * 7:
                    continue
                if not self.only_chinese(txt):
                    continue
                data = self.augment.process(data)
                speech_feature = self.speech_featurizer.extract(data)
                py, phone, word = self.map(txt)
                if len(py) == 0 or len(phone) == 0 or len(word) == 0:
                    continue
                e_bert_t, e_bert_s = self.bert_decode([txt])
                bert_feature = self.get_bert_feature(e_bert_t, e_bert_s)
                word_text_feature = self.token1_featurizer.extract(word)
                phone_text_feature = self.token2_featurizer.extract(phone)
                py_text_feature = self.token3_featurizer.extract(py)
                txt_text_feature = self.token4_featurizer.extract(list(txt))
                if speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(py_text_feature) or \
                        speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(word_text_feature) or \
                        speech_feature.shape[0] / self.speech_config['reduction_factor'] < len(phone_text_feature):
                    continue
                max_input = max(max_input, speech_feature.shape[0])
                max_wav = max(max_wav, len(data))
                max_label_words = max(max_label_words, len(word_text_feature))
                max_label_phone = max(max_label_phone, len(phone_text_feature))
                max_label_py = max(max_label_py, len(py_text_feature))
                max_label_txt = max(max_label_txt, len(txt_text_feature))
                mels.append(speech_feature)
                wavs.append(data)
                input_length.append(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                words_label.append(np.array(word_text_feature))
                words_label_length.append(len(word_text_feature))
                phone_label.append(np.array(phone_text_feature))
                phone_label_length.append(len(phone_text_feature))
                py_label.append(np.array(py_text_feature))
                py_label_length.append(len(py_text_feature))
                txt_label.append(np.array(txt_text_feature))
                txt_label_length.append(len(txt_text_feature))
                bert_features.append(bert_feature)
        for i in range(len(mels)):
            if mels[i].shape[0] < max_input:
                pad = np.ones([max_input - mels[i].shape[0], mels[i].shape[1], mels[i].shape[2]]) * mels[i].min()
                mels[i] = np.vstack((mels[i], pad))
        for i in range(len(bert_features)):
            if bert_features[i].shape[0] < max_label_txt:
                padding = np.ones([max_label_txt - len(bert_features[i]), 768]) * -10.
                bert_features[i] = np.vstack((bert_features[i], padding))
        wavs = self.speech_featurizer.pad_signal(wavs, max_wav)
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        txt_label = self.pad(txt_label, max_label_txt)
        x = np.array(mels, 'float32')
        bert_features = np.array(bert_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        txt_label = np.array(txt_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        txt_label_length = np.array(txt_label_length, 'int32')
        wavs = np.array(np.expand_dims(wavs, -1), 'float32')
        return x, wavs, bert_features, input_length, words_label, words_label_length, phone_label, \
               phone_label_length, py_label, py_label_length, txt_label, txt_label_length

    def generator(self, train=True):
        while 1:
            x, wavs, bert_feature, input_length, words_label, words_label_length, phone_label, \
                phone_label_length, py_label, py_label_length, txt_label, txt_label_length = self.generate(train)
            guide_matrix = self.guided_attention(input_length, txt_label_length,
                                                 np.max(input_length), txt_label_length.max())
            yield x, wavs, bert_feature, input_length, words_label, words_label_length, phone_label, \
                  phone_label_length, py_label, py_label_length, txt_label, txt_label_length, guide_matrix
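
# Minimal sketch of the guided-attention target built by GuidedAttention above,
# with illustrative sizes (N=4 decoder steps, T=6 encoder frames): W[n, t] is 0
# on the scaled diagonal t/T == n/N and approaches 1 away from it, so a loss
# weighted by W pushes attention alignments toward the diagonal.
def _guided_attention_example():
    N, T, g = 4, 6, 0.2
    W = np.zeros((N, T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
    return W  # W[0, 0] == 0.0, while W[0, T - 1] is close to 1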
class MultiTask_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text1_config = config_dict['decoder1_config']
        self.text2_config = config_dict['decoder2_config']
        self.text3_config = config_dict['decoder3_config']
        self.augment_config = config_dict['augments_config']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.token1_featurizer = TextFeaturizer(self.text1_config)
        self.token2_featurizer = TextFeaturizer(self.text2_config)
        self.token3_featurizer = TextFeaturizer(self.text3_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],
                            training)
        self.make_maps(config_dict)
        self.augment = Augmentation(self.augment_config)
        self.epochs = 1
        self.steps = 0

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('history train list not equal train list, data loader use init state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file, init state')
        except:
            logging.info('load state failed, use init state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        return (tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        return (
            tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
            else tf.TensorShape([None, None, f, c]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, ]),
        )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def make_maps(self, config):
        with open(config['map_path']['phone'], encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        self.phone_map = {}
        phone_map = {}
        for line in data:
            try:
                key, phone = line.strip().split('\t')
            except:
                continue
            phone_map[key] = phone.split(' ')
        self.phone_map = phone_map

    def map(self, txt):
        # Style 8 is pypinyin's TONE3 (tone digit appended, e.g. 'zhong1').
        pys = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True)
        pys = [i[0] for i in pys]
        phones = []
        for i in pys:
            phones += self.phone_map[i]
        words = ''.join(pys)
        words = list(words)
        return pys, phones, words

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('train list: {} test list: {}'.format(len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0
            logging.info('eval list: {}'.format(len(self.test_list)))

    def only_chinese(self, word):
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch
        return txt

    def check_valid(self, txt, vocab_list):
        # Returns True when every token is known, the first unknown token otherwise.
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        words_label = []
        words_label_length = []
        phone_label = []
        phone_label_length = []
        py_label = []
        py_label_length = []
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}), skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            py, phone, word = self.map(txt)
            if len(py) == 0:
                continue
            # Each label stream is validated against its own vocabulary.
            if self.check_valid(word, self.token1_featurizer.vocab_array) is not True:
                logging.info('{} txt word {} not all in tokens, continue'.format(txt, py))
                continue
            if self.check_valid(phone, self.token2_featurizer.vocab_array) is not True:
                logging.info('{} txt phone {} not all in tokens, continue'.format(txt, py))
                continue
            if self.check_valid(py, self.token3_featurizer.vocab_array) is not True:
                logging.info('{} txt pinyin {} not all in tokens, continue'.format(txt, py))
                continue
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            if in_len < len(word_text_feature):
                continue
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_input = max(max_input, len(speech_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))
            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))
            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        speech_features = np.array(speech_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        return speech_features, input_length, words_label, words_label_length, \
               phone_label, phone_label_length, py_label, py_label_length

    def pad(self, words_label, max_label_words):
        for i in range(len(words_label)):
            if words_label[i].shape[0] < max_label_words:
                pad = np.ones(max_label_words - words_label[i].shape[0]) * self.token1_featurizer.pad
                words_label[i] = np.hstack((words_label[i], pad))
        return words_label

    def GuidedAttention(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttention(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        sample = []
        speech_features = []
        input_length = []
        words_label = []
        words_label_length = []
        phone_label = []
        phone_label_length = []
        py_label = []
        py_label_length = []
        max_input = 0
        max_label_words = 0
        max_label_phone = 0
        max_label_py = 0
        if train:
            batch = self.batch // 2 if self.augment.available() else self.batch
        else:
            batch = self.batch
        # Scan up to batch * 10 lines so that skipped items do not starve the batch.
        for i in range(batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            wp, txt = line.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}), skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                speech_feature = data / np.abs(data).max()
                speech_feature = np.expand_dims(speech_feature, -1)
                in_len = len(speech_feature) // (
                        self.speech_config['reduction_factor'] *
                        (self.speech_featurizer.sample_rate / 1000) *
                        self.speech_config['stride_ms'])
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            py, phone, word = self.map(txt)
            if len(py) == 0:
                logging.info('py length {}, skip'.format(len(py)))
                continue
            if self.check_valid(word, self.token1_featurizer.vocab_array) is not True:
                logging.info('{} txt word {} not all in tokens, continue'.format(
                    txt, self.check_valid(word, self.token1_featurizer.vocab_array)))
                continue
            if self.check_valid(phone, self.token2_featurizer.vocab_array) is not True:
                logging.info('{} txt phone {} not all in tokens, continue'.format(
                    txt, self.check_valid(phone, self.token2_featurizer.vocab_array)))
                continue
            if self.check_valid(py, self.token3_featurizer.vocab_array) is not True:
                logging.info('{} txt py {} not all in tokens, continue'.format(
                    txt, self.check_valid(py, self.token3_featurizer.vocab_array)))
                continue
            word_text_feature = self.token1_featurizer.extract(word)
            phone_text_feature = self.token2_featurizer.extract(phone)
            py_text_feature = self.token3_featurizer.extract(py)
            if in_len < len(word_text_feature):
                continue
            max_label_words = max(max_label_words, len(word_text_feature))
            max_label_phone = max(max_label_phone, len(phone_text_feature))
            max_label_py = max(max_label_py, len(py_text_feature))
            max_input = max(max_input, len(speech_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            words_label.append(np.array(word_text_feature))
            words_label_length.append(len(word_text_feature))
            phone_label.append(np.array(phone_text_feature))
            phone_label_length.append(len(phone_text_feature))
            py_label.append(np.array(py_text_feature))
            py_label_length.append(len(py_text_feature))
            sample.append(line)
            if len(sample) == batch:
                break
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except:
                    continue
                if len(data) < 400:
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                py, phone, word = self.map(txt)
                if len(py) == 0:
                    continue
                word_text_feature = self.token1_featurizer.extract(word)
                phone_text_feature = self.token2_featurizer.extract(phone)
                py_text_feature = self.token3_featurizer.extract(py)
                if in_len < len(word_text_feature):
                    continue
                max_label_words = max(max_label_words, len(word_text_feature))
                max_label_phone = max(max_label_phone, len(phone_text_feature))
                max_label_py = max(max_label_py, len(py_text_feature))
                max_input = max(max_input, len(speech_feature))
                speech_features.append(speech_feature)
                input_length.append(in_len)
                words_label.append(np.array(word_text_feature))
                words_label_length.append(len(word_text_feature))
                phone_label.append(np.array(phone_text_feature))
                phone_label_length.append(len(phone_text_feature))
                py_label.append(np.array(py_text_feature))
                py_label_length.append(len(py_text_feature))
        if self.speech_config['use_mel_layer']:
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        words_label = self.pad(words_label, max_label_words)
        phone_label = self.pad(phone_label, max_label_phone)
        py_label = self.pad(py_label, max_label_py)
        speech_features = np.array(speech_features, 'float32')
        words_label = np.array(words_label, 'int32')
        phone_label = np.array(phone_label, 'int32')
        py_label = np.array(py_label, 'int32')
        input_length = np.array(input_length, 'int32')
        words_label_length = np.array(words_label_length, 'int32')
        phone_label_length = np.array(phone_label_length, 'int32')
        py_label_length = np.array(py_label_length, 'int32')
        return speech_features, input_length, words_label, words_label_length, \
               phone_label, phone_label_length, py_label, py_label_length

    def generator(self, train=True):
        while 1:
            speech_features, input_length, words_label, words_label_length, \
                phone_label, phone_label_length, py_label, py_label_length = self.generate(train)
            yield speech_features, input_length, words_label, words_label_length, \
                  phone_label, phone_label_length, py_label, py_label_length
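
# Sketch of the phone-map file format make_maps() above expects: one
# '<pinyin>\t<space-separated phones>' entry per line. The two entries below
# are illustrative assumptions, not taken from a real mapping file.
def _phone_map_example():
    lines = ['zhong1\tzh ong1', 'guo2\tg uo2']
    phone_map = {}
    for line in lines:
        key, phone = line.strip().split('\t')
        phone_map[key] = phone.split(' ')
    # With this map, map('中国') would yield pys=['zhong1', 'guo2'],
    # phones=['zh', 'ong1', 'g', 'uo2'] and words=list('zhong1guo2').
    return phone_map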
class AM_DataLoader():
    def __init__(self, config_dict, training=True):
        self.speech_config = config_dict['speech_config']
        self.text_config = config_dict['decoder_config']
        self.augment_config = config_dict['augments_config']
        self.streaming = self.speech_config['streaming']
        self.chunk = self.speech_config['sample_rate'] * self.speech_config['streaming_bucket']
        self.batch = config_dict['learning_config']['running_config']['batch_size']
        self.speech_featurizer = SpeechFeaturizer(self.speech_config)
        self.text_featurizer = TextFeaturizer(self.text_config)
        self.make_file_list(self.speech_config['train_list'] if training else self.speech_config['eval_list'],
                            training)
        self.augment = Augmentation(self.augment_config)
        self.init_text_to_vocab()
        self.epochs = 1
        self.LAS = False
        self.steps = 0

    def load_state(self, outdir):
        try:
            dg_state = np.load(os.path.join(outdir, 'dg_state.npz'))
            self.epochs = int(dg_state['epoch'])
            self.train_offset = int(dg_state['train_offset'])
            train_list = dg_state['train_list'].tolist()
            if len(train_list) != len(self.train_list):
                logging.info('history train list not equal new load train list, data loader use init state')
                self.epochs = 0
                self.train_offset = 0
        except FileNotFoundError:
            logging.info('not found state file, init state')
        except:
            logging.info('load state failed, use init state')

    def save_state(self, outdir):
        np.savez(os.path.join(outdir, 'dg_state.npz'),
                 epoch=self.epochs,
                 train_offset=self.train_offset,
                 train_list=self.train_list)

    def return_data_types(self):
        if self.LAS:
            return (tf.float32, tf.int32, tf.int32, tf.int32, tf.float32)
        else:
            return (tf.float32, tf.int32, tf.int32, tf.int32)

    def return_data_shape(self):
        f, c = self.speech_featurizer.compute_feature_dim()
        if self.LAS:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None, None]),
            )
        else:
            return (
                tf.TensorShape([None, None, 1]) if self.speech_config['use_mel_layer']
                else tf.TensorShape([None, None, f, c]),
                tf.TensorShape([None, ]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, ]),
            )

    def get_per_epoch_steps(self):
        return len(self.train_list) // self.batch

    def eval_per_epoch_steps(self):
        return len(self.test_list) // self.batch

    def init_text_to_vocab(self):
        pypinyin.load_phrases_dict({
            '调大': [['tiáo'], ['dà']],
            '调小': [['tiáo'], ['xiǎo']],
            '调亮': [['tiáo'], ['liàng']],
            '调暗': [['tiáo'], ['àn']],
            '肖': [['xiāo']],
            '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
            '新传': [['xīn'], ['zhuàn']],
            '外传': [['wài'], ['zhuàn']],
            '正传': [['zhèng'], ['zhuàn']],
            '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
        })

        def text_to_vocab_func(txt):
            pins = pypinyin.pinyin(txt)
            pins = [i[0] for i in pins]
            return pins

        self.text_to_vocab = text_to_vocab_func

    def make_file_list(self, wav_list, training=True):
        with open(wav_list, encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data if i != '']
        num = len(data)
        if training:
            self.train_list = data[:int(num * 0.99)]
            self.test_list = data[int(num * 0.99):]
            np.random.shuffle(self.train_list)
            self.train_offset = 0
            self.test_offset = 0
            logging.info('load train list {} test list {}'.format(len(self.train_list), len(self.test_list)))
        else:
            self.test_list = data
            self.offset = 0

    def only_chinese(self, word):
        txt = ''
        for ch in word:
            if '\u4e00' <= ch <= '\u9fff':
                txt += ch
        return txt

    def eval_data_generator(self):
        sample = self.test_list[self.offset:self.offset + self.batch]
        self.offset += self.batch
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        for i in sample:
            wp, txt = i.strip().split('\t')
            txt = txt.replace(' ', '')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                logging.info('{} wav too short < 25ms, skip'.format(wp))
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}), skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                if not self.streaming:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    # Streaming: count whole chunks (rounding up), then frames per chunk.
                    speech_feature = data
                    speech_feature = np.expand_dims(speech_feature, -1)
                    reduce = self.speech_config['reduction_factor'] * \
                             (self.speech_featurizer.sample_rate / 1000) * \
                             self.speech_config['stride_ms']
                    in_len = len(speech_feature) // self.chunk
                    if len(speech_feature) % self.chunk != 0:
                        in_len += 1
                    chunk_times = self.chunk // reduce
                    if self.chunk % reduce != 0:
                        chunk_times += 1
                    in_len *= chunk_times
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            max_input = max(max_input, speech_feature.shape[0])
            py = self.text_to_vocab(txt)
            if self.check_valid(py, self.text_featurizer.vocab_array) is not True:
                logging.info('{} txt pinyin {} not all in tokens, skip'.format(
                    txt, self.check_valid(py, self.text_featurizer.vocab_array)))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                logging.info('{} feature length < pinyin length, skip'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer']:
            if self.streaming:
                max_input = max_input // self.chunk * self.chunk + self.chunk
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def check_valid(self, txt, vocab_list):
        # Returns True when every token is known, the first unknown token otherwise.
        if len(txt) == 0:
            return False
        for n in txt:
            if n not in vocab_list:
                return n
        return True

    def GuidedAttentionMatrix(self, N, T, g=0.2):
        W = np.zeros((N, T), dtype=np.float32)
        for n in range(N):
            for t in range(T):
                W[n, t] = 1 - np.exp(-(t / float(T) - n / float(N)) ** 2 / (2 * g * g))
        return W

    def guided_attention(self, input_length, targets_length, inputs_shape, mel_target_shape):
        att_targets = []
        for i, j in zip(input_length, targets_length):
            i = int(i)
            step = int(j)
            pad = np.ones([inputs_shape, mel_target_shape]) * -1.
            pad[i:, :step] = 1
            att_target = self.GuidedAttentionMatrix(i, step, 0.2)
            pad[:att_target.shape[0], :att_target.shape[1]] = att_target
            att_targets.append(pad)
        att_targets = np.array(att_targets)
        return att_targets.astype('float32')

    def generate(self, train=True):
        sample = []
        speech_features = []
        input_length = []
        y1 = []
        label_length1 = []
        max_input = 0
        max_label1 = 0
        if train:
            batch = self.batch // 2 if self.augment.available() else self.batch
        else:
            batch = self.batch
        # Scan up to batch * 10 lines so that skipped items do not starve the batch.
        for i in range(batch * 10):
            if train:
                line = self.train_list[self.train_offset]
                self.train_offset += 1
                if self.train_offset > len(self.train_list) - 1:
                    self.train_offset = 0
                    np.random.shuffle(self.train_list)
                    self.epochs += 1
            else:
                line = self.test_list[self.test_offset]
                self.test_offset += 1
                if self.test_offset > len(self.test_list) - 1:
                    self.test_offset = 0
            wp, txt = line.strip().split('\t')
            try:
                data = self.speech_featurizer.load_wav(wp)
            except:
                logging.info('{} load data failed, skip'.format(wp))
                continue
            if len(data) < 400:
                continue
            elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                logging.info('{} duration out of wav_max_duration({}), skip'.format(
                    wp, self.speech_config['wav_max_duration']))
                continue
            if self.speech_config['only_chinese']:
                txt = self.only_chinese(txt)
            if self.speech_config['use_mel_layer']:
                if not self.streaming:
                    speech_feature = data / np.abs(data).max()
                    speech_feature = np.expand_dims(speech_feature, -1)
                    in_len = len(speech_feature) // (
                            self.speech_config['reduction_factor'] *
                            (self.speech_featurizer.sample_rate / 1000) *
                            self.speech_config['stride_ms'])
                else:
                    speech_feature = data
                    speech_feature = np.expand_dims(speech_feature, -1)
                    reduce = self.speech_config['reduction_factor'] * \
                             (self.speech_featurizer.sample_rate / 1000) * \
                             self.speech_config['stride_ms']
                    in_len = len(speech_feature) // self.chunk
                    if len(speech_feature) % self.chunk != 0:
                        in_len += 1
                    chunk_times = self.chunk // reduce
                    if self.chunk % reduce != 0:
                        chunk_times += 1
                    in_len *= chunk_times
            else:
                speech_feature = self.speech_featurizer.extract(data)
                in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
            py = self.text_to_vocab(txt)
            if self.check_valid(py, self.text_featurizer.vocab_array) is not True:
                logging.info('{} txt pinyin {} not all in tokens, continue'.format(
                    txt, self.check_valid(py, self.text_featurizer.vocab_array)))
                continue
            text_feature = self.text_featurizer.extract(py)
            if in_len < len(text_feature):
                logging.info('{} feature length < pinyin length, continue'.format(wp))
                continue
            max_input = max(max_input, len(speech_feature))
            max_label1 = max(max_label1, len(text_feature))
            speech_features.append(speech_feature)
            input_length.append(in_len)
            y1.append(np.array(text_feature))
            label_length1.append(len(text_feature))
            sample.append(line)
            if len(sample) == batch:
                break
        if train and self.augment.available():
            for i in sample:
                wp, txt = i.strip().split('\t')
                try:
                    data = self.speech_featurizer.load_wav(wp)
                except:
                    continue
                if len(data) < 400:
                    logging.info('{} wav too short < 25ms, skip'.format(wp))
                    continue
                elif len(data) > self.speech_featurizer.sample_rate * self.speech_config['wav_max_duration']:
                    continue
                data = self.augment.process(data)
                if self.speech_config['only_chinese']:
                    txt = self.only_chinese(txt)
                if self.speech_config['use_mel_layer']:
                    if not self.streaming:
                        speech_feature = data / np.abs(data).max()
                        speech_feature = np.expand_dims(speech_feature, -1)
                        in_len = len(speech_feature) // (
                                self.speech_config['reduction_factor'] *
                                (self.speech_featurizer.sample_rate / 1000) *
                                self.speech_config['stride_ms'])
                    else:
                        speech_feature = data
                        speech_feature = np.expand_dims(speech_feature, -1)
                        reduce = self.speech_config['reduction_factor'] * \
                                 (self.speech_featurizer.sample_rate / 1000) * \
                                 self.speech_config['stride_ms']
                        in_len = len(speech_feature) // self.chunk
                        if len(speech_feature) % self.chunk != 0:
                            in_len += 1
                        chunk_times = self.chunk // reduce
                        if self.chunk % reduce != 0:
                            chunk_times += 1
                        in_len *= chunk_times
                else:
                    speech_feature = self.speech_featurizer.extract(data)
                    in_len = int(speech_feature.shape[0] // self.speech_config['reduction_factor'])
                py = self.text_to_vocab(txt)
                if self.check_valid(py, self.text_featurizer.vocab_array) is not True:
                    continue
                text_feature = self.text_featurizer.extract(py)
                if in_len < len(text_feature):
                    continue
                max_input = max(max_input, len(speech_feature))
                max_label1 = max(max_label1, len(text_feature))
                speech_features.append(speech_feature)
                input_length.append(in_len)
                y1.append(np.array(text_feature))
                label_length1.append(len(text_feature))
        if self.speech_config['use_mel_layer']:
            if self.streaming:
                reduce = self.speech_config['reduction_factor'] * \
                         (self.speech_featurizer.sample_rate / 1000) * \
                         self.speech_config['stride_ms']
                max_input = max_input // self.chunk * self.chunk + self.chunk
                max_in_len = max_input // self.chunk
                chunk_times = self.chunk // reduce
                if self.chunk % reduce != 0:
                    chunk_times += 1
                max_in_len *= chunk_times
                input_length = np.clip(input_length, 0, max_in_len)
            speech_features = self.speech_featurizer.pad_signal(speech_features, max_input)
        else:
            for i in range(len(speech_features)):
                if speech_features[i].shape[0] < max_input:
                    pad = np.ones([max_input - speech_features[i].shape[0],
                                   speech_features[i].shape[1],
                                   speech_features[i].shape[2]]) * speech_features[i].min()
                    speech_features[i] = np.vstack((speech_features[i], pad))
        for i in range(len(y1)):
            if y1[i].shape[0] < max_label1:
                pad = np.ones(max_label1 - y1[i].shape[0]) * self.text_featurizer.pad
                y1[i] = np.hstack((y1[i], pad))
        x = np.array(speech_features, 'float32')
        y1 = np.array(y1, 'int32')
        input_length = np.array(input_length, 'int32')
        label_length1 = np.array(label_length1, 'int32')
        return x, input_length, y1, label_length1

    def generator(self, train=True):
        while 1:
            x, input_length, labels, label_length = self.generate(train)
            if x.shape[0] == 0:
                logging.info('load data length zero, continue')
                continue
            if self.LAS:
                guide_matrix = self.guided_attention(input_length, label_length,
                                                     np.max(input_length), label_length.max())
                yield x, input_length, labels, label_length, guide_matrix
            else:
                yield x, input_length, labels, label_length
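
# Worked sketch of the streaming input-length arithmetic used in the class
# above, with assumed values (16 kHz audio, a 0.5 s streaming bucket,
# reduction_factor=4, stride_ms=10); the real values come from speech_config.
def _streaming_length_example():
    sample_rate, streaming_bucket = 16000, 0.5
    reduction_factor, stride_ms = 4, 10
    chunk = int(sample_rate * streaming_bucket)                   # 8000 samples per chunk
    reduce = reduction_factor * (sample_rate / 1000) * stride_ms  # 640 samples per output frame
    samples = 20000                                               # a 1.25 s waveform
    in_len = samples // chunk                                     # 2 full chunks
    if samples % chunk != 0:
        in_len += 1                                               # partial chunk -> 3
    chunk_times = chunk // reduce                                 # 12 frames per chunk
    if chunk % reduce != 0:
        chunk_times += 1                                          # remainder -> 13
    return int(in_len * chunk_times)                              # 39 output frames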