Example #1
0
        def infer(outputs, i):
            """Save synthesized audio for each item of a decoded batch.

            Args:
                outputs: model output tuple; outputs[1] holds the predicted
                    final spectrograms, outputs[4] the per-item sequence
                    lengths, and outputs[5] the magnitude spectrograms
                    (present when the data layer's output_type includes
                    "both" -- TODO confirm against the model definition).
                i: batch counter, used to number the saved audio files.
            """
            predicted_final_specs = outputs[1]
            sequence_lengths = outputs[4]
            for j in range(len(predicted_final_specs)):
                predicted_final_spec = predicted_final_specs[j]
                audio_length = sequence_lengths[j]

                # Lengths of 2 or less leave no usable frames after trimming.
                if audio_length > 2:
                    if "both" in model.get_data_layer().params['output_type']:
                        # Magnitude spectrogram comes straight from the model.
                        predicted_mag_spec = outputs[5][j][:audio_length -
                                                           1, :]
                    else:
                        # Trim the padding frame, then convert the mel-scale
                        # output to a magnitude spectrogram.
                        predicted_final_spec = predicted_final_spec[:
                                                                    audio_length
                                                                    - 1, :]
                        predicted_mag_spec = model.get_data_layer(
                        ).get_magnitude_spec(predicted_final_spec, is_mel=True)
                    save_audio(
                        predicted_mag_spec,
                        syn_save_dir,
                        0,
                        n_fft=model.get_data_layer().n_fft,
                        sampling_rate=model.get_data_layer().sampling_rate,
                        mode="syn",
                        number=i * batch_size + j,
                        save_format="disk",
                        gl_iters=4,
                        verbose=False)
                else:
                    # BUG FIX: the two adjacent string literals lacked a
                    # separating space and printed "...error out infuture".
                    print(
                        "WARNING: An audio file was not saved, this will error out in "
                        "future steps")
def infer(line):
    """Synthesize speech for an English input line, visualize its mel
    spectrogram, and play the generated waveform inline.
    """
    print("Input English")
    print(line)

    # Generate speech
    results = get_interactive_infer_results(model_T2S, sess, model_in=[line])
    num_frames = results[1][4][0]

    # When the data layer emits both mel and magnitude spectrograms,
    # take the magnitude output; otherwise take the final (mel) one.
    source_index = 5 if model_T2S.get_data_layer()._both else 1
    spec = results[1][source_index][0][:num_frames - 1, :]
    mag_prediction = model_T2S.get_data_layer().get_magnitude_spec(spec)

    # Emphasize and square the magnitudes before projecting onto the
    # mel filter bank (net effect: clipped magnitude raised to the 3rd power).
    emphasized = np.clip(mag_prediction, a_min=0, a_max=255) ** 1.5
    power_spec = np.square(emphasized)

    mel_basis = librosa.filters.mel(
        sr=22050, n_fft=1024, n_mels=80, htk=True, norm=None)
    log_mel = np.log(
        np.clip(np.dot(mel_basis, power_spec.T), a_min=1e-5, a_max=None))
    np.save("spec2", log_mel)

    plt.imshow(log_mel)
    plt.gca().invert_yaxis()
    plt.show()

    wav = save_audio(
        mag_prediction, "unused", "unused",
        sampling_rate=sampling_rate, save_format="np.array", n_fft=n_fft)
    audio = IPython.display.Audio(wav, rate=sampling_rate)
    print("Generated Audio")
    IPython.display.display(audio)
Example #3
0
def infer(line):
    """Run the full cascade for an English input line: synthesize speech,
    recognize it back to text, then translate the recognized text.
    """
    print("Input English")
    print(line)

    # Generate speech
    t2s_results = get_t2s_interactive_infer_results(model_T2S, model_in=[line])
    num_frames = t2s_results[1][4][0]

    # Prefer the magnitude output when the data layer produces both
    # mel and magnitude spectrograms.
    source_index = 5 if model_T2S.get_data_layer()._both else 1
    spec = t2s_results[1][source_index][0][:num_frames - 1, :]
    mag_prediction = model_T2S.get_data_layer().get_magnitude_spec(spec)

    wav = save_audio(
        mag_prediction, "unused", "unused",
        sampling_rate=sampling_rate, save_format="np.array", n_fft=n_fft)
    # The recognizer expects 16 kHz input.
    wav = librosa.core.resample(wav, sampling_rate, 16000)
    print("Generated Audio")

    # Generate text
    s2t_results = get_s2t_interactive_infer_results(model_S2T, model_in=[wav])
    english_recognized = s2t_results[0][0]

    print("Recognized Speech")
    print(english_recognized)

    # Generate translation
    encoded_src = ' '.join(sp1.EncodeAsPieces(english_recognized))

    t2t_results = get_t2t_interactive_infer_results(model_T2T,
                                                    model_in=[encoded_src])
    decoded_tgt = sp2.DecodePieces(t2t_results[1][0].split(" "))

    print("Translation")
    print(decoded_tgt)
def infer(line):
    """Synthesize speech for an English input line, dump its log-mel
    spectrogram to disk, and write the generated audio as a file.
    """
    print("Input English")
    print(line)

    # Generate speech
    results = get_interactive_infer_results(model_T2S, model_in=[line])
    num_frames = results[1][4][0]

    # Take the magnitude output when the data layer emits both mel and
    # magnitude spectrograms, otherwise the final (mel) one.
    source_index = 5 if model_T2S.get_data_layer()._both else 1
    spec = results[1][source_index][0][:num_frames - 1, :]
    mag_prediction = model_T2S.get_data_layer().get_magnitude_spec(spec)

    # Emphasize and square the magnitudes before projecting onto the
    # mel filter bank (net effect: clipped magnitude raised to the 3rd power).
    emphasized = np.clip(mag_prediction, a_min=0, a_max=255) ** 1.5
    power_spec = np.square(emphasized)

    mel_basis = librosa.filters.mel(
        sr=22050, n_fft=1024, n_mels=80, htk=True, norm=None)
    log_mel = np.log(
        np.clip(np.dot(mel_basis, power_spec.T), a_min=1e-5, a_max=None))
    np.save("spec2", log_mel)

    wav = save_audio(
        mag_prediction,
        "/home/oscar/filesys/tacotron-LJ-float/checkpoint/",
        "1",
        sampling_rate=sampling_rate,
        save_format="disk",
        n_fft=n_fft)
    print("Generated Audio")