Example #1
def style_transfer_v2():
    audio_paths_ = 'data/examples_filelist_v2.txt'
    dataloader_ = TextMelLoader(audio_paths_, hparams)
    datacollate_ = TextMelCollate(1)
    ## Load data
    # for file_idx in range(10):
    #     audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    #     print(dict(file_idx=file_idx, audio_path=audio_path, text=text))

    file_idx = 8
    audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    print(dict(file_idx=file_idx, audio_path=audio_path, text=text, sid=sid))

    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cuda()
    pitch_contour = dataloader_[file_idx][3][None].cuda()
    mel = load_mel(audio_path)

    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = mellotron.parse_batch(datacollate_([dataloader_[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)

    # Style Transfer (Rhythm and Pitch Contour)
    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(
            x)
        rhythm = rhythm.permute(1, 0, 2)
    # Randomly pick a female or male target speaker id for the transfer.
    speaker_id = next(female_speakers) if np.random.randint(2) else next(
        male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
    t0 = time.time()
    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)

    t0 = time.time()
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8),
                         0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)
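
The snippet above relies on module-level objects (hparams, arpabet_dict, mellotron, waveglow, denoiser, load_mel, and the speaker-id iterators) that are created elsewhere in the script. A minimal sketch of part of that setup, modeled on NVIDIA's Mellotron inference notebook; the speaker ids, checkpoint path, and the Denoiser import path are assumptions:

from itertools import cycle
import torch
from waveglow.denoiser import Denoiser  # assumption: depends on how the WaveGlow code is vendored

# Placeholder Mellotron speaker ids; the real values depend on the checkpoint's speaker table.
female_speakers = cycle([0, 1, 2])
male_speakers = cycle([3, 4, 5])

# WaveGlow vocoder + denoiser, loaded in the style of the published WaveGlow checkpoints
# (the path is illustrative).
waveglow = torch.load('models/waveglow_256channels.pt')['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()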
Example #2
def singing_voice_v2():
    # Singing Voice from Music Score
    data = get_data_from_musicxml('data/sinsy/csongdb_f00002_000_en.musicxml', 132, convert_stress=True)
    panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}
    n_speakers_per_part = 4
    frequency_scaling = 0.4
    n_seconds = 90
    audio_stereo = np.zeros((hparams.sampling_rate * n_seconds, 2), dtype=np.float32)
    for i, (part, v) in enumerate(data.items()):
        rhythm = data[part]['rhythm'].cuda()
        pitch_contour = data[part]['pitch_contour'].cuda()
        text_encoded = data[part]['text_encoded'].cuda()

        for k in range(n_speakers_per_part):
            pan = k
            # pan = np.random.randint(panning[part][0], panning[part][1])
            if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
                speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
            else:
                speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
            print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

            with torch.no_grad():
                # NOTE: `mel` is not defined in this function; it is assumed to be the
                # GST reference mel prepared earlier in the script (as in the style-transfer examples).
                mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour * frequency_scaling, rhythm))

            plot_mel_f0_alignment(mel_outputs_postnet.data.cpu().numpy()[0],
                                  mel_outputs_postnet.data.cpu().numpy()[0],
                                  pitch_contour.data.cpu().numpy()[0, 0],
                                  rhythm.data.cpu().numpy()[:, 0].T)
            plt.show()

            out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
            t0 = time.time()
            # wav = aukit.inv_mel_spectrogram()
            out_wav = infer_waveform_melgan(out_mel)
            print(time.time() - t0)

            aukit.save_wav(out_wav, "logs/musicxml_melgan_{}.wav".format(time.strftime("%Y%m%d-%H%M%S")), sr=22050)
            aukit.play_audio(out_wav, sr=22050)

            t0 = time.time()
            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()
            audio = panner(audio, pan)
            print(time.time() - t0)

            audio_stereo[:audio.shape[0]] += audio
            write("logs/{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)
            out_wav = audio

            aukit.play_audio(out_wav, sr=22050)
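
singing_voice_v2 calls a panner helper that is not shown here. In the upstream Mellotron notebook it is a constant-power stereo panner along the following lines (a sketch, not necessarily this fork's exact implementation):

import numpy as np

def panner(signal, angle_degrees):
    # Constant-power pan: angle in degrees, negative = left, positive = right.
    angle = np.radians(angle_degrees)
    left = np.sqrt(2) / 2.0 * (np.cos(angle) - np.sin(angle)) * signal
    right = np.sqrt(2) / 2.0 * (np.cos(angle) + np.sin(angle)) * signal
    return np.dstack((left, right))[0]  # shape (n_samples, 2), matching audio_stereo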
Example #3
def change_speed_one(kwargs: dict):
    inpath = kwargs.get("inpath")
    outpath = kwargs.get("outpath")
    rate = kwargs.get("rate")
    # Skip files that already have a non-trivial output (> 8 KB).
    if Path(outpath).exists() and os.path.getsize(outpath) > 8000:
        return
    Path(outpath).parent.mkdir(exist_ok=True, parents=True)
    # Scale the analysis hop size by `rate`; since MelGAN re-synthesizes with its own
    # fixed hop, the resulting audio plays back roughly `rate` times faster.
    hp = Dict2Obj()
    hp.update(melgan_hparams)
    hp.update({"hop_size": int(melgan_hparams["hop_size"] * rate)})

    try:
        wav = aukit.load_wav(inpath, sr=_sr)
        mel = wav2mel(wav, hparams=hp)
        out = infer_waveform_melgan(mel, load_path=_melgan_load_path)
        aukit.save_wav(out, outpath, sr=_sr)
    except Exception as e:
        print(e)
        print(kwargs)
    return kwargs
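
change_speed_one is written to be mapped over a list of kwargs dicts. A possible driver, with illustrative paths and speed factor (the directory layout and rate value are assumptions):

from pathlib import Path
from multiprocessing.pool import ThreadPool

# One task per input wav; under the hop-scaling scheme above, rate > 1 shortens
# (speeds up) the output and rate < 1 lengthens (slows) it.
tasks = [dict(inpath=str(p), outpath=str(Path("output") / p.name), rate=1.2)
         for p in sorted(Path("input").glob("*.wav"))]

with ThreadPool(4) as pool:
    for done in pool.imap_unordered(change_speed_one, tasks):
        pass  # change_speed_one returns its kwargs (or None when the output already exists)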
Example #4
def style_transfer():
    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)

    # Style Transfer (Rhythm and Pitch Contour)
    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(
            x)
        rhythm = rhythm.permute(1, 0, 2)
    # Randomly pick a female or male target speaker id for the transfer.
    speaker_id = next(female_speakers) if np.random.randint(2) else next(
        male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]

    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)

    aukit.play_audio(out_wav, sr=22050)

    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8),
                         0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()

    aukit.play_audio(out_wav, sr=22050)
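
Example #4 is the notebook-style variant of Example #1: text_encoded, mel, pitch_contour, dataloader, datacollate and file_idx are assumed to exist at module level, prepared as in Example #1. The load_mel helper used there is not shown in either example; a sketch of what it might look like, assuming a layers.TacotronSTFT instance named stft built from hparams:

import librosa
import torch

def load_mel(path):
    # Load audio at the model's sampling rate and compute the mel used for GST conditioning.
    audio, _ = librosa.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio).float().unsqueeze(0)  # (1, T), values in [-1, 1]
    mel = stft.mel_spectrogram(audio)
    return mel.cuda()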
Example #5
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        wav = None
        vocname = ""
        if self.ui.current_vocoder_fpath is not None:
            model_fpath = self.ui.current_vocoder_fpath
            vocname = Path(model_fpath).parent.stem
            if Path(model_fpath).parent.stem == "melgan":
                self.ui.log("Waveform generation with MelGAN... ")
                wav = vocoder_melgan.infer_waveform_melgan(spec, model_fpath)

            elif Path(model_fpath).parent.stem == "wavernn":
                self.ui.log("Waveform generation with WaveRNN... ")
                wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)

        if wav is None:
            vocname = "griffinlim"
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = self.ui.selected_utterance.name
        ftime = '{}'.format(time_formatter())
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fvoc = vocname
        fname = filename_formatter('{}_{}_{}_{}ms_{}.wav'.format(fref, ftime, fvoc, fms, ftext))
        audio.save_wav(wav, self._out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_{}".format(time_formatter())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)