Code example #1
def style_transfer_v2():
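    # Depends on module-level objects defined elsewhere: hparams, arpabet_dict,
    # mellotron, waveglow, denoiser, female_speakers/male_speakers, plus the helpers
    # load_mel, plot_mel_f0_alignment and infer_waveform_melgan.
    # The bare ipd.Audio(...) calls only render when run inside a Jupyter notebook.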
    audio_paths_ = 'data/examples_filelist_v2.txt'
    dataloader_ = TextMelLoader(audio_paths_, hparams)
    datacollate_ = TextMelCollate(1)
    ## Load data
    # for file_idx in range(10):
    #     audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    #     print(dict(file_idx=file_idx, audio_path=audio_path, text=text))

    file_idx = 8
    audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    print(dict(file_idx=file_idx, audio_path=audio_path, text=text, sid=sid))

    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cuda()
    pitch_contour = dataloader_[file_idx][3][None].cuda()
    mel = load_mel(audio_path)

    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = mellotron.parse_batch(datacollate_([dataloader_[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)

    # Style Transfer (Rhythm and Pitch Contour)
    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(
            x)
        rhythm = rhythm.permute(1, 0, 2)
    speaker_id = next(female_speakers) if np.random.randint(2) else next(
        male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

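    # Vocoder pass 1: MelGAN (timed), playback at 22050 Hz.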
    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
    t0 = time.time()
    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)

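    # Vocoder pass 2: WaveGlow followed by the denoiser, for comparison.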
    t0 = time.time()
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8),
                         0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)
Code example #2
File: inference.py  Project: zxy2020/zhrtvc
def run_compare():
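    # For every wav under args.folder, rebuild the waveform from its mel with MelGAN
    # and play the original and the reconstruction back to back.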
    args = parse_args()
    load_vocoder_melgan(args.load_path)
    for i, fname in tqdm(enumerate(args.folder.glob("*.wav"))):
        wav, sr = librosa.core.load(fname, sr=16000)
        mel = wav2mel(wav)
        out = infer_waveform_melgan(mel=mel)
        aukit.play_audio(wav, sr=sr)
        aukit.play_audio(out, sr=sr)
Code example #3
def run_tuner():
    import aukit
    from aukit.audio_tuner import tune_speed, tune_pitch
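    # Demonstrates aukit's format converters (anything2*) and the speed/pitch tuners;
    # the anything2* return values are discarded here.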
    inpath = r"hello.wav"
    aukit.anything2bytes(inpath)
    aukit.anything2wav(inpath)
    aukit.anything2bytesio(inpath)
    bys = tune_speed(inpath, sr=16000, rate=0.5, out_type=None)
    print(bys)
    wav = tune_pitch(bys, sr=16000, rate=1, out_type=None)
    print(wav)
    aukit.play_audio(wav)
Code example #4
File: inference.py  Project: trinco-cn1/zhrtvc
def singing_voice_v2():
    # Singing Voice from Music Score
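    # Renders each voice part of the MusicXML score with several Mellotron speakers;
    # relies on module-level globals (hparams, mellotron, waveglow, denoiser, panner,
    # female_speakers/male_speakers, write, ...).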
    data = get_data_from_musicxml('data/sinsy/csongdb_f00002_000_en.musicxml', 132, convert_stress=True)
    panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}
    n_speakers_per_part = 4
    frequency_scaling = 0.4
    n_seconds = 90
    audio_stereo = np.zeros((hparams.sampling_rate * n_seconds, 2), dtype=np.float32)
    for i, (part, v) in enumerate(data.items()):
        rhythm = data[part]['rhythm'].cuda()
        pitch_contour = data[part]['pitch_contour'].cuda()
        text_encoded = data[part]['text_encoded'].cuda()

        for k in range(n_speakers_per_part):
            pan = k
            # pan = np.random.randint(panning[part][0], panning[part][1])
            if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
                speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
            else:
                speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
            print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

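            # NOTE: `mel` is not defined in this function; it must exist at module
            # scope (e.g. the reference mel prepared in the style-transfer example).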
            with torch.no_grad():
                mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour * frequency_scaling, rhythm))

            plot_mel_f0_alignment(mel_outputs_postnet.data.cpu().numpy()[0],
                                  mel_outputs_postnet.data.cpu().numpy()[0],
                                  pitch_contour.data.cpu().numpy()[0, 0],
                                  rhythm.data.cpu().numpy()[:, 0].T)
            plt.show()

            out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
            t0 = time.time()
            # wav = aukit.inv_mel_spectrogram()
            out_wav = infer_waveform_melgan(out_mel)
            print(time.time() - t0)

            aukit.save_wav(out_wav, "logs/musicxml_melgan_{}.wav".format(time.strftime("%Y%m%d-%H%M%S")), sr=22050)
            aukit.play_audio(out_wav, sr=22050)

            t0 = time.time()
            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()
            audio = panner(audio, pan)
            print(time.time() - t0)

            audio_stereo[:audio.shape[0]] += audio
            write("logs/{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)
            out_wav = audio

            aukit.play_audio(out_wav, sr=22050)
Code example #5
def style_transfer():
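    # Same flow as style_transfer_v2, but assumes dataloader, datacollate, file_idx,
    # audio_path, text_encoded, pitch_contour and mel are already defined at
    # module/notebook scope.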
    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)

    # Style Transfer (Rhythm and Pitch Contour)
    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(
            x)
        rhythm = rhythm.permute(1, 0, 2)
    speaker_id = next(female_speakers) if np.random.randint(2) else next(
        male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]

    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)

    aukit.play_audio(out_wav, sr=22050)

    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8),
                         0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()

    aukit.play_audio(out_wav, sr=22050)
Code example #6
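            # Fragment from inside a synthesis loop: spec, align, gate, wav, cur_time,
            # cur_text, out_dir, out_path, msyner and num_generated come from the
            # elided code above.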
            fig_path = out_dir.joinpath("demo_{}_{}_fig.jpg".format(
                cur_time, cur_text))
            plot_mel_alignment_gate_audio(
                spec, align, gate, wav[::msyner.stft.sampling_rate // 1000])
            plt.savefig(fig_path)
            plt.close()

            yml_path = out_dir.joinpath("demo_{}_{}_info.yml".format(
                cur_time, cur_text))
            info_dict = locals2dict(locals())
            with open(yml_path, 'wt', encoding='utf8') as fout:
                # No `encoding` argument here: fout is a text-mode stream, and
                # yaml.dump would emit bytes if an encoding were given.
                yaml.dump(info_dict,
                          fout,
                          default_flow_style=False,
                          allow_unicode=True)

            txt_path = out_dir.joinpath("info_dict.txt".format(cur_time))
            with open(txt_path, 'at', encoding='utf8') as fout:
                fout.write('{}\n'.format(
                    json.dumps(info_dict, ensure_ascii=False)))

            num_generated += 1
            # print("\nSaved output as %s\n\n" % out_path)
            if args.play:
                aukit.play_audio(out_path, sr=msyner.stft.sampling_rate)
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
            traceback.print_exc()
Code example #7
File: audio.py  Project: zxy2020/zhrtvc
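# Build a MelGAN-compatible hparams object: start from default_hparams and
# override it with the project-specific values in my_hp.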
melgan_hparams = {}
melgan_hparams.update({k: v for k, v in default_hparams.items()})
melgan_hparams.update(my_hp)
melgan_hparams = Dict2Obj(melgan_hparams)

_pad_len = (default_hparams.n_fft - default_hparams.hop_size) // 2


def melspectrogram(wav, hparams=None):
    wav = np.pad(wav.flatten(), (_pad_len, _pad_len), mode="reflect")
    mel = mel_spectrogram(wav, melgan_hparams)
    mel = mel / 20
    return mel


def inv_melspectrogram(mel, hparams=None):
    mel = mel * 20
    wav = inv_mel_spectrogram(mel, melgan_hparams)
    return wav


if __name__ == "__main__":
    import aukit

    inpath = r"E:\data\temp\01.wav"
    wav = load_wav(inpath, sr=16000)
    mel = melspectrogram(wav)
    out = inv_melspectrogram(mel)
    aukit.play_audio(wav)
    aukit.play_audio(out)
Code example #8
def run_player():
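    # Load a sample at 16 kHz, apply aukit's change_voice effect (mode="assign_pitch"),
    # and play it back at reduced volume.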
    import aukit
    inpath = Path(r"E:\data\aliaudio\examples\ali_Aibao_000001.wav")
    wav = aukit.load_wav(inpath, sr=16000)
    wav = aukit.change_voice(wav, mode="assign_pitch", alpha=200)
    aukit.play_audio(wav, volume=0.5)
Code example #9
def run_noise_remover():
    import aukit
    inpath = r"hello.wav"
    wav = aukit.load_wav(inpath)
    out = aukit.remove_noise(wav)
    aukit.play_audio(out)
Code example #10
File: mm_inference.py  Project: Georgehappy1/zhrtvc
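            # Fragment from inside a synthesis loop: spec, align, text, speaker,
            # cur_time, outpath, num_generated and args come from the elided code above.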
            # plt.imsave(fpath, spec)
            plt.pcolor(spec)
            plt.colorbar()
            plt.savefig(fpath)
            plt.close()

            fpath = args.out_dir.joinpath(
                "demo_out_{}_alignment.jpg".format(cur_time))
            plt.pcolor(align)
            plt.colorbar()
            plt.savefig(fpath)
            plt.close()

            txt_path = args.out_dir.joinpath("info_dict.txt".format(cur_time))
            with open(txt_path, 'at', encoding='utf8') as fout:
                dt = dict(text=text,
                          audio_path=str(fpath),
                          speaker=speaker,
                          time=cur_time)
                out = json.dumps(dt, ensure_ascii=False)
                fout.write('{}\n'.format(out))

            num_generated += 1
            print("\nSaved output as %s\n\n" % outpath)
            if args.play:
                aukit.play_audio(fpath, sr=16000)
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
            traceback.print_exc()
Code example #11
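            # Fragment from a demo loop: msyner, text, speaker, griffinlim_vocoder,
            # _hparams, args and num_generated are defined in the elided code above.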
            spec = msyner.synthesize(text=text, speaker=speaker)
            # spec, align = synthesize_one(text, speaker=speaker, with_alignment=True,
            #                              hparams=_hparams, encoder_fpath=args.encoder_model_fpath)
            print("Spectrogram shape: {}".format(spec.shape))
            # print("Alignment shape: {}".format(align.shape))
            ## Generating the waveform
            print("Synthesizing the waveform ...")
            wav = griffinlim_vocoder(spec)
            print("Waveform shape: {}".format(wav.shape))

            # Save it on the disk
            cur_time = time.strftime('%Y%m%d_%H%M%S')
            fpath = args.out_dir.joinpath("demo_out_{}.wav".format(cur_time))
            # librosa.output.write_wav(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate)
            aukit.save_wav(wav, fpath, sr=_hparams.sampling_rate)  # save

            txt_path = args.out_dir.joinpath("info_dict.txt".format(cur_time))
            with open(txt_path, 'at', encoding='utf8') as fout:
                dt = dict(text=text, audio_path=str(fpath), speaker=speaker, time=cur_time)
                out = json.dumps(dt, ensure_ascii=False)
                fout.write('{}\n'.format(out))

            num_generated += 1
            print("\nSaved output as %s\n\n" % fpath)
            if args.play:
                aukit.play_audio(fpath, sr=_hparams.sampling_rate)
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
            traceback.print_exc()
Code example #12
def run_noise_remover():
    import aukit
    inpath = r"E:\data\temp\01.wav"
    wav = aukit.load_wav(inpath)
    out = aukit.remove_noise(wav)
    aukit.play_audio(out)