Example #1
0
    def get_mel(self, filename):
        """Return a mel spectrogram tensor for *filename*.

        When ``self.load_mel_from_disk`` is False, the wav is loaded,
        normalized by ``self.max_wav_value``, and converted with
        ``self.stft.mel_spectrogram``. Otherwise *filename* is assumed to
        be a precomputed mel tensor saved with ``torch.save``.

        Raises:
            ValueError: if the wav's sample rate differs from the STFT's
                configured sample rate.
        """
        if not self.load_mel_from_disk:
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                # BUG FIX: the original format string had three '{}'
                # placeholders but only two arguments, so raising this
                # error crashed with an IndexError instead.
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            # torch.autograd.Variable is deprecated (a no-op wrapper on
            # modern PyTorch); detach() keeps the no-grad intent.
            audio_norm = audio_norm.detach()
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.load(filename)
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))

        return melspec
    def get_mel_audio_pair(self, filename):
        """Load a wav, force it to exactly ``self.segment_length`` samples,
        normalize it, and compute its mel spectrogram.

        Returns:
            tuple: (mel spectrogram, normalized audio, number of samples).

        Raises:
            ValueError: if the file's sample rate differs from the STFT's.
        """
        audio, sampling_rate = load_wav_to_torch(filename, sr = self.sampling_rate)

        if sampling_rate != self.stft.sampling_rate:
            raise ValueError(f"{sampling_rate} SR doesn't match target {self.stft.sampling_rate} SR")

        # Zero-pad short clips up to segment_length; otherwise take a
        # random window of exactly segment_length samples.
        n_samples = audio.size(0)
        if n_samples < self.segment_length:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - n_samples), 'constant').data
        else:
            offset = random.randint(0, n_samples - self.segment_length)
            audio = audio[offset:offset + self.segment_length]

        audio = audio / self.max_wav_value
        batched = audio.unsqueeze(0)
        batched = torch.autograd.Variable(batched, requires_grad=False)
        mel = self.stft.mel_spectrogram(batched)
        mel = mel.squeeze(0)

        return (mel, audio, len(audio))
Example #3
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Generates ground-truth-aligned (GTA) mel spectrograms for every entry
    in each training anchor's metadata and saves them as .npy files in
    ``args.output_dir``; latency is recorded via the project LOGGER.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_training_args(parser)
    # parse_known_args: ignore extra CLI flags consumed elsewhere.
    args, _ = parser.parse_known_args()

    # Set up dual logging backends: human-readable stdout + JSON file.
    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    # One anchor directory per speaker; speaker_id below is the index
    # into this list.
    anchor_dirs = [
        os.path.join(args.dataset_path, anchor)
        for anchor in args.training_anchor_dirs
    ]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
                        args.mel_fmax)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for npy_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                # NOTE(review): elsewhere in this codebase
                # load_wav_to_torch returns (audio, sampling_rate); here
                # the result is used directly as a tensor — confirm this
                # call site's variant returns only the waveform.
                wav = load_wav_to_torch(npy_path)
                mel = stft.mel_spectrogram(wav.unsqueeze(0))
                mel = mel.squeeze()
                # Round the target length up to a multiple of
                # n_frames_per_step (required by the decoder).
                max_target_len = mel.size(1) - 1
                max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step
                padded_mel = np.pad(mel, [(0, 0),
                                          (0, max_target_len - mel.size(1))],
                                    mode='constant',
                                    constant_values=args.mel_pad_val)
                # Keep every n_frames_per_step-th frame as teacher-forcing
                # targets for the decoder.
                target = padded_mel[:, ::args.n_frames_per_step]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                outputs = model.infer(
                    to_gpu(seqs).long(),
                    to_gpu(seq_lens).int(),
                    to_gpu(targets).half(),
                    to_gpu(target_lengths).int())
                # NOTE(review): this destructuring assumes exactly four
                # non-None outputs; a different model config could change
                # the filtered count and break the unpack.
                _, mel_out, _, _ = [
                    output.cpu() for output in outputs if output is not None
                ]
                # Trim padding frames so output aligns with source audio.
                mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1]
                assert (mel_out.shape[-1] == wav.shape[-1] // args.hop_length)
                fname = os.path.basename(npy_path)
                np.save(os.path.join(args.output_dir, fname),
                        mel_out,
                        allow_pickle=False)
                # GTA synthesis
                # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze())
                # wav = griffin_lim(magnitudes, stft.stft_fn, 60)
                # save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()
Example #4
0
def audio2mel2audio(dataset_path,
                    audiopaths_and_text,
                    melpaths_and_text,
                    args,
                    use_intermed=None):
    """Round-trip each wav through a spectrogram and Griffin-Lim inversion.

    For every entry in *melpaths_and_text*, loads the matching audio from
    the hard-coded cleaned-wavs directory, computes a linear spectrogram
    with torchaudio, inverts it with Griffin-Lim, and writes the
    reconstructed wav under ``griffin_lim_inv_audio_custom7/``.

    Args:
        dataset_path: root passed to ``load_filepaths_and_text``.
        audiopaths_and_text: filelist of (audio path, text) entries.
        melpaths_and_text: filelist of (mel path, text) entries; its
            length controls how many files are processed.
        args: namespace providing filter_length, win_length, hop_length
            and n_iters.
        use_intermed: unused; kept for interface compatibility.
    """
    melpaths_and_text_list = \
        load_filepaths_and_text(dataset_path, melpaths_and_text)

    audiopaths_and_text_list = \
        load_filepaths_and_text(dataset_path, audiopaths_and_text)

    # Linear (power=1, magnitude) spectrogram and its Griffin-Lim inverse,
    # both built with identical STFT parameters so the round trip matches.
    spec = T.Spectrogram(
            n_fft=args.filter_length,
            win_length=args.win_length,
            hop_length=args.hop_length,
            power=1,
            normalized=True,
        )

    griffin_lim = T.GriffinLim(
            n_fft=args.filter_length,
            win_length=args.win_length,
            hop_length=args.hop_length,
            n_iter=args.n_iters,
            power=1,
            normalized=True,
        )

    print(args)
    data_path = "/data/logotypografia_simple/cleaned_wavs/"

    for i in range(len(melpaths_and_text_list)):
        wav_name = data_path + audiopaths_and_text_list[i][0].split("/")[-1]

        audio, sampling_rate = load_wav_to_torch(wav_name)

        _spectrogram = spec(audio)
        inv_waveform = griffin_lim(_spectrogram)

        inv_wav_name = "griffin_lim_inv_audio_custom7/" \
                       + audiopaths_and_text_list[i][0].split("/")[-1]
        print(f"Saving reconstructed wav with name {inv_wav_name}")
        # BUG FIX: the sample rate was hard-coded to 16000, so any file
        # recorded at a different rate was written at the wrong speed.
        # Use the rate actually reported by the loader instead.
        write(inv_wav_name, int(sampling_rate), inv_waveform.detach().cpu().numpy())