Example #1
def run_tuner():
    import aukit
    from aukit.audio_tuner import tune_speed, tune_pitch
    inpath = r"hello.wav"
    # Format-conversion helpers; the return values are discarded here and the calls
    # are shown only to illustrate that the input can be converted either way.
    aukit.anything2bytes(inpath)
    aukit.anything2wav(inpath)
    aukit.anything2bytesio(inpath)
    # Slow the audio to half speed, then run the pitch tuner with rate=1 (pitch unchanged).
    bys = tune_speed(inpath, sr=16000, rate=0.5, out_type=None)
    print(bys)
    wav = tune_pitch(bys, sr=16000, rate=1, out_type=None)
    print(wav)
    aukit.play_audio(wav)
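
A small follow-up sketch, not part of the original example: instead of playing the tuned audio, it could be written to disk with aukit.save_wav, the same call the later examples use; the output path tuned.wav is hypothetical.

def run_tuner_save():
    import aukit
    from aukit.audio_tuner import tune_speed, tune_pitch
    inpath = r"hello.wav"
    # Half speed, pitch unchanged, as in Example #1.
    bys = tune_speed(inpath, sr=16000, rate=0.5, out_type=None)
    wav = tune_pitch(bys, sr=16000, rate=1, out_type=None)
    # Save instead of play; 'tuned.wav' is an assumed output path.
    aukit.save_wav(wav, "tuned.wav", sr=16000)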
Example #2
def tts_sdk_base(text, speaker='biaobei', audio='24', output='', **kwargs):
    """语音合成函数式SDK接口。
    text为待合成的文本。
    speaker可设置为内置的发音人名称,可选名称见_reference_audio_dict;默认的发音人名称列表见resource/reference_audio/__init__.py。
    audio如果是数字,则调用内置的语音作为发音人参考音频;如果是语音路径,则调用audio路径的语音作为发音人参考音频。
    output如果以.wav结尾,则为保存语音文件的路径;如果以play开头,则合成语音后自动播放语音。
    """
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)
        load_audio(**kwargs)

    if str(audio).isdigit():
        # A digit selects one of the built-in reference audios (1-based index, wrapping).
        audio = _reference_audio_list[(int(audio) - 1) %
                                      len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # Raw wav bytes: write them to a named temporary file so they can be read by path.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.flush()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 256:
        # A long string is treated as base64-encoded wav data.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.flush()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError(f'Unrecognized reference audio: {audio!r}')
    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader,
        text=text,
        speaker=speaker,
        audio=audio,
        device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    # Cut at the first frame where the stop gate exceeds the threshold; fall back to
    # the gate's argmax, and finally to the full length if the gate never fires.
    end_idx = np.argmax(out_gate > kwargs.get('gate_threshold', 0.2)
                        ) or np.argmax(out_gate) or out_gate.shape[0]

    mels_postnet = mels_postnet[:, :, :end_idx]
    vocoder_name = kwargs.get('vocoder', 'waveglow')
    if vocoder_name == 'waveglow':
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=10)

    wav_output = wavs.squeeze(0).cpu().numpy()

    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)
    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
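
A minimal usage sketch for the interface described in the docstring above, assuming the module's models and the built-in reference audio resources are available; the wrapper name demo_tts_sdk_base, the sample text, and the output path demo.wav are made up for illustration.

def demo_tts_sdk_base():
    # Synthesize with the built-in 'biaobei' speaker and the 24th built-in reference
    # audio, save the result to demo.wav, and also receive it back as wav bytes.
    wav_bytes = tts_sdk_base('你好，世界。', speaker='biaobei', audio='24',
                             output='demo.wav')
    print(type(wav_bytes), len(wav_bytes))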
Example #3
def tts_sdk(text, speaker='biaobei', audio='0', **kwargs):
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)

    if str(audio).isdigit():
        audio = _reference_audio_list[(int(audio) - 1) %
                                      len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # NamedTemporaryFile so that tmp_audio.name is a usable file path.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.flush()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 100:
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.flush()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError(f'Unrecognized reference audio: {audio!r}')
    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader,
        text=text,
        speaker=speaker,
        audio=audio,
        device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(
        out_gate > 0.2) or np.argmax(out_gate) or out_gate.shape[0]

    mels_postnet = mels_postnet[:, :, :end_idx]
    if _use_waveglow:
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)

    wav_output = wavs.squeeze(0).cpu().numpy()

    output = kwargs.get('output', '')
    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)
    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
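
In Example #3 the output setting is read from kwargs rather than being a named parameter, so playback or saving is requested via a keyword argument. A hedged usage sketch, assuming load_models and the vocoder tolerate the extra keyword since the function forwards **kwargs to them; the wrapper name and sample text are illustrative.

def demo_tts_sdk():
    # audio='0' wraps around to the last built-in reference audio;
    # output='play' asks the SDK to play the result after synthesis.
    wav_bytes = tts_sdk('欢迎使用语音合成。', speaker='biaobei', audio='0',
                        output='play')
    return wav_bytes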