Example #1
def generate_audio(answer):
    # Treat the whole answer as a single sentence for now.
    sentences = [answer]
    print(sentences)
    # Synthesize a spectrogram for each non-empty sentence; the input is
    # prefixed with the "|" separator and suffixed with the ACCENT marker.
    spectrograms = [
        synthesize(model_taco, "|" + s + ACCENT) for s in sentences
        if len(s) > 0
    ]
    # Invert each spectrogram back to a waveform (mel scale unless linear
    # prediction is enabled in the hyperparameters).
    return [
        audio.inverse_spectrogram(s, not hp.predict_linear)
        for s in spectrograms
    ]
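A hypothetical call site for this helper, assuming `model_taco`, `ACCENT`, and the `audio` module are already loaded by the surrounding script (the output file name is illustrative; `audio.save` is the same helper used in Example #3):

# Hypothetical usage sketch; model_taco and ACCENT come from the
# surrounding script and are not defined in this snippet.
waveforms = generate_audio("Hello, how can I help you today?")
for i, waveform in enumerate(waveforms):
    audio.save(waveform, f"answer_{i}.wav")  # audio.save as used in Example #3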
Example #2
    def evaluation(eval_step, losses, mcd, source_len, target_len, source,
                   target, prediction_forced, prediction, stop_prediction,
                   stop_target, alignment, classifier):
        """Log evaluation results.
        
        Arguments:
            eval_step -- number of the current evaluation step (i.e. epoch)
            losses (dictionary of {loss name: value}) -- dictionary with values of batch losses
            mcd (float) -- evaluation Mel Cepstral Distortion
            source_len (tensor) -- number of characters of input utterances
            target_len (tensor) -- number of frames of ground-truth spectrograms
            source (tensor) -- input utterances
            target (tensor) -- ground-truth spectrograms
            prediction_forced (tensor) -- ground-truth-aligned spectrograms
            prediction (tensor) -- predicted spectrograms
            stop_prediction (tensor) -- predicted stop token probabilities
            stop_target (tensor) -- true stop token probabilities
            alignment (tensor) -- alignments (attention weights for each frame) of the last evaluation batch
            classifier (float) -- accuracy of the reversal classifier
        """

        # log losses
        total_loss = sum(losses.values())
        Logger._sw.add_scalar('Eval/loss_total', total_loss, eval_step)
        for n, l in losses.items():
            Logger._sw.add_scalar(f'Eval/loss_{n}', l, eval_step)

        # show random sample: spectrogram, stop token probability, alignment and audio
        idx = random.randint(0, alignment.size(0) - 1)
        predicted_spec = prediction[
            idx, :, :target_len[idx]].data.cpu().numpy()
        f_predicted_spec = prediction_forced[
            idx, :, :target_len[idx]].data.cpu().numpy()
        target_spec = target[idx, :, :target_len[idx]].data.cpu().numpy()

        # log spectrograms
        if hp.normalize_spectrogram:
            predicted_spec = audio.denormalize_spectrogram(
                predicted_spec, not hp.predict_linear)
            f_predicted_spec = audio.denormalize_spectrogram(
                f_predicted_spec, not hp.predict_linear)
            target_spec = audio.denormalize_spectrogram(
                target_spec, not hp.predict_linear)
        Logger._sw.add_figure(f"Predicted/generated",
                              Logger._plot_spectrogram(predicted_spec),
                              eval_step)
        Logger._sw.add_figure(f"Predicted/forced",
                              Logger._plot_spectrogram(f_predicted_spec),
                              eval_step)
        Logger._sw.add_figure(f"Target/eval",
                              Logger._plot_spectrogram(target_spec), eval_step)

        # log audio
        waveform = audio.inverse_spectrogram(predicted_spec,
                                             not hp.predict_linear)
        Logger._sw.add_audio("Audio/generated",
                             waveform,
                             eval_step,
                             sample_rate=hp.sample_rate)
        waveform = audio.inverse_spectrogram(f_predicted_spec,
                                             not hp.predict_linear)
        Logger._sw.add_audio("Audio/forced",
                             waveform,
                             eval_step,
                             sample_rate=hp.sample_rate)

        # log alignment
        alignment = alignment[
            idx, :target_len[idx], :source_len[idx]].data.cpu().numpy().T
        Logger._sw.add_figure(f"Alignment/eval",
                              Logger._plot_alignment(alignment), eval_step)

        # log source text
        utterance = text.to_text(
            source[idx].data.cpu().numpy()[:source_len[idx]], hp.use_phonemes)
        Logger._sw.add_text(f"Text/eval", utterance, eval_step)

        # log stop tokens
        Logger._sw.add_figure(
            "Stop/eval",
            Logger._plot_stop_tokens(stop_target[idx].data.cpu().numpy(),
                                     stop_prediction[idx].data.cpu().numpy()),
            eval_step)

        # log mel cepstral distortion
        Logger._sw.add_scalar('Eval/mcd', mcd, eval_step)

        # log reversal language classifier accuracy
        if hp.reversal_classifier:
            Logger._sw.add_scalar('Eval/classifier', classifier, eval_step)
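Every call above (`add_scalar`, `add_figure`, `add_audio`, `add_text`) matches the `torch.utils.tensorboard.SummaryWriter` API, so `Logger._sw` is presumably such a writer. A minimal sketch of the wiring this method assumes, with an illustrative log directory:

from torch.utils.tensorboard import SummaryWriter

class Logger:
    # Shared writer used by the static logging methods above; the log
    # directory here is illustrative, not taken from the original code.
    _sw = SummaryWriter(log_dir="logs/eval")

The `_plot_spectrogram`, `_plot_alignment`, and `_plot_stop_tokens` helpers would then each return a matplotlib figure for `add_figure` to serialize.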
Example #3
                        help="Does not save waveforms if set.")
    args = parser.parse_args()

    print("Building model ...")

    model = build_model(args.checkpoint, args.cpu)
    model.eval()

    # total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # print(f"Built model with {total_params} parameters")

    # Read non-empty "id|text" lines from stdin (readlines keeps the trailing
    # newline, so test the stripped line rather than the raw one).
    inputs = [l.rstrip() for l in sys.stdin.readlines() if l.strip()]

    spectrograms = []
    for i, item in enumerate(inputs):
        print(f'Synthesizing ({i+1}/{len(inputs)}): "{item}"')

        # The utterance id precedes the first "|" separator.
        utterance_id = item.split("|")[0]

        s = synthesize(model, item, args.cpu)

        os.makedirs(args.output, exist_ok=True)

        if args.save_spec:
            np.save(os.path.join(args.output, f'{utterance_id}.npy'), s)

        if not args.ignore_wav:
            w = audio.inverse_spectrogram(s, not hp.predict_linear)
            audio.save(w, os.path.join(args.output, f'{utterance_id}.wav'))
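The script consumes `id|text` lines on standard input, so it could be driven like this (the script name and checkpoint path are hypothetical, and the flag spellings are inferred from the `args.*` attributes used above):

# Hypothetical driver for the snippet above; file names and flags are
# assumptions inferred from args.checkpoint / args.output / args.save_spec.
import subprocess

lines = "utt_0001|Hello world.\nutt_0002|A second test sentence.\n"
subprocess.run(
    ["python", "synthesize.py", "--checkpoint", "checkpoints/model.pt",
     "--output", "out", "--save_spec"],
    input=lines, text=True, check=True,
)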