Example #1
0
 def __call__(self, examples):
     """Collate a list of examples into batched mel and waveform arrays.

     Element 0 of every example is collected and handed to ``batch_spec``;
     element 1 is collected and handed to ``batch_wav``. Both helpers
     receive ``self.padding_value`` as ``pad_value``.

     Args:
         examples: Sized, indexable collection of examples.

     Returns:
         Tuple ``(mels, wavs, audio_starts)``. ``audio_starts`` is an
         all-zero ``int64`` array with one entry per example.
     """
     spectrograms = []
     waveforms = []
     for item in examples:
         spectrograms.append(item[0])
         waveforms.append(item[1])

     batched_mels = batch_spec(spectrograms, pad_value=self.padding_value)
     batched_wavs = batch_wav(waveforms, pad_value=self.padding_value)
     # Every example in this collate path is reported as starting at 0.
     zero_starts = np.zeros((len(examples), ), dtype=np.int64)
     return batched_mels, batched_wavs, zero_starts
    def __call__(self, examples):
        """Collate examples into batched ids, spectrograms and stop targets.

        Elements 0, 1 and 2 of every example are gathered into separate
        lists. Elements 0 and 2 go through ``batch_text_id`` with
        ``pad_id=self.padding_idx``; element 1 goes through ``batch_spec``
        with ``pad_value=self.padding_value``.

        Args:
            examples: Iterable of indexable examples with at least three
                elements each.

        Returns:
            Tuple ``(ids, mels, stop_probs)``, where ``mels`` is the
            ``batch_spec`` output with its last two axes swapped.
        """
        id_seqs, spectrograms, stop_seqs = [], [], []
        for item in examples:
            id_seqs.append(item[0])
            spectrograms.append(item[1])
            stop_seqs.append(item[2])

        batched_ids = batch_text_id(id_seqs, pad_id=self.padding_idx)
        batched_mels = batch_spec(spectrograms, pad_value=self.padding_value)
        batched_stops = batch_text_id(stop_seqs, pad_id=self.padding_idx)

        # Swap axes 1 and 2 of the batched spectrograms before returning.
        swapped_mels = np.transpose(batched_mels, [0, 2, 1])
        return batched_ids, swapped_mels, batched_stops
Example #3
0
 def __call__(self, examples):
     """Clip every example with ``self.clip`` and batch the results.

     ``self.clip`` is called once per example, in order, and must return
     a 3-tuple ``(mel, wav_clip, start)``.

     Args:
         examples: Iterable of examples accepted by ``self.clip``.

     Returns:
         Tuple ``(mels, wavs, starts)``: the ``batch_spec`` output for the
         mels, the clipped waveforms stacked with ``np.stack``, and the
         start values as an ``int64`` array.
     """
     clipped = [self.clip(item) for item in examples]

     # Unpacking into exactly three names keeps the 3-tuple requirement.
     mel_list = [mel for mel, _, _ in clipped]
     wav_list = [wav_clip for _, wav_clip, _ in clipped]
     start_list = [start for _, _, start in clipped]

     batched_mels = batch_spec(mel_list)
     stacked_wavs = np.stack(wav_list)
     start_array = np.array(start_list, dtype=np.int64)
     return batched_mels, stacked_wavs, start_array
    def __call__(self, examples):
        """Collate (text, mel) pairs, ordered by descending text length.

        For every pair the text is converted to an ``int64`` array and a
        stop-token target is built: ``mel.shape[1] - 1`` zeros followed by
        a single ``1.0``. All per-example sequences are then reordered by
        text length, longest first, with ties keeping their input order.

        Args:
            examples: Iterable of ``(text, mel)`` pairs; ``mel`` must
                expose ``shape[1]``.

        Returns:
            Tuple ``(texts, mels, text_lens, mel_lens, stop_tokens)``:

            * ``texts``: ``batch_text_id`` output, ``pad_id=self.padding_idx``.
            * ``mels``: ``batch_spec`` output
              (``pad_value=self.padding_value``) transposed with axes
              ``(0, 2, 1)``.
            * ``text_lens`` / ``mel_lens``: plain lists in sorted order.
            * ``stop_tokens``: ``batch_text_id`` output, padded with
              ``self.padding_stop_token`` and cast to the dtype of the
              batched mels.
        """
        text_arrays = []
        mel_items = []
        stop_targets = []
        for text, mel in examples:
            text_arrays.append(np.array(text, dtype=np.int64))
            mel_items.append(mel)
            frame_count = mel.shape[1]
            # Zeros for every frame except the last, which is marked 1.0.
            leading_zeros = np.zeros([frame_count - 1], dtype=np.float32)
            stop_targets.append(np.append(leading_zeros, 1.0))

        # Compute the descending-by-text-length permutation once. sorted()
        # is stable even with reverse=True, so equal-length examples keep
        # their original relative order.
        lengths = [len(arr) for arr in text_arrays]
        order = sorted(
            range(len(lengths)), key=lambda idx: lengths[idx], reverse=True)

        sorted_texts = [text_arrays[idx] for idx in order]
        sorted_mels = [mel_items[idx] for idx in order]
        sorted_stops = [stop_targets[idx] for idx in order]
        mel_lens = [mel_items[idx].shape[1] for idx in order]
        text_lens = [lengths[idx] for idx in order]

        # Pad each sequence type up to the longest entry in the batch.
        batched_texts = batch_text_id(sorted_texts, pad_id=self.padding_idx)
        batched_mels = np.transpose(
            batch_spec(sorted_mels, pad_value=self.padding_value),
            axes=(0, 2, 1))
        batched_stops = batch_text_id(sorted_stops,
                                      pad_id=self.padding_stop_token,
                                      dtype=batched_mels.dtype)

        return (batched_texts, batched_mels, text_lens, mel_lens,
                batched_stops)
Example #5
0
    def __call__(self, samples):
        """Collate samples into batched audio, mel and start-index arrays.

        When ``self.valid`` is truthy each sample must be an
        ``(audio, mel_spectrogram)`` pair; it is used as-is with a start
        of 0 and the audios are batched with ``batch_wav``. Otherwise each
        sample is passed through ``self.random_crop``, whose result is
        indexed as ``(audio, mel, start)``, and the audios are turned into
        an array directly with ``np.array``.

        Args:
            samples: Iterable of samples.

        Returns:
            Tuple ``(audios, mels, audio_starts)``. ``audios`` is
            ``float32``, ``mels`` is the ``batch_spec`` output and
            ``audio_starts`` is an ``int64`` array.
        """
        # Step 1: bring every sample into (audio, mel, start) form.
        if self.valid:
            triples = [(audio, mel_spectrogram, 0)
                       for audio, mel_spectrogram in samples]
        else:
            triples = [self.random_crop(item) for item in samples]

        # Step 2: split the triples into per-field lists.
        audio_list, mel_list, start_list = [], [], []
        for triple in triples:
            audio_list.append(triple[0])
            mel_list.append(triple[1])
            start_list.append(triple[2])

        # Step 3: batch each field.
        batched_mels = batch_spec(mel_list)
        batched_audios = (batch_wav(audio_list, dtype=np.float32)
                          if self.valid else
                          np.array(audio_list, dtype=np.float32))
        start_array = np.array(start_list, dtype=np.int64)
        return batched_audios, batched_mels, start_array
Example #6
0
 def __call__(self, examples):
     """Collate a list of examples into batched mel and waveform arrays.

     Element 0 of every example is collected and handed to ``batch_spec``;
     element 1 is collected and handed to ``batch_wav``. Both helpers
     receive ``self.padding_value`` as ``pad_value``.

     Args:
         examples: Iterable of indexable examples with at least two
             elements each.

     Returns:
         Tuple ``(mels, wavs)`` of the two batched results.
     """
     spectrograms = []
     waveforms = []
     for item in examples:
         spectrograms.append(item[0])
         waveforms.append(item[1])

     batched_mels = batch_spec(spectrograms, pad_value=self.padding_value)
     batched_wavs = batch_wav(waveforms, pad_value=self.padding_value)
     return batched_mels, batched_wavs