def synthesize(model, input_data, force_cpu=False):

    item = input_data.split('|')
    clean_text = item[1]

    if not hp.use_punctuation:
        clean_text = text.remove_punctuation(clean_text)
    if not hp.case_sensitive:
        clean_text = text.to_lower(clean_text)
    if hp.remove_multiple_wspaces:
        clean_text = text.remove_odd_whitespaces(clean_text)

    t = torch.LongTensor(
        text.to_sequence(clean_text, use_phonemes=hp.use_phonemes))

    if hp.multi_language:
        # item[3] holds per-segment language weights, e.g.: l1-(len1),l2*0.75:l3*0.25-(len2),l1
        # l_tokens: list of the comma-separated segments of item[3]
        l_tokens = item[3].split(',')
        t_length = len(clean_text) + 1
        # The resulting l contains language_number (language dim for every token) * token_num values in total
        l = []
        for token in l_tokens:
            # l_d: [l2*0.75:l3*0.25,(len2)]
            l_d = token.split('-')
            # language: [0,0,...,0]
            language = [0] * hp.language_number
            for l_cw in l_d[0].split(':'):
                # l_cw: l2*0.75 / l3*0.25
                # l_cw_s: list, [l2,0.75]
                l_cw_s = l_cw.split('*')
                language[hp.languages.index(
                    l_cw_s[0])] = 1 if len(l_cw_s) == 1 else float(l_cw_s[1])

            # language: [0,0.75,0.25,...,0]
            # language_length: int, (len2). The length covered by this language mix, or all of the remaining length by default
            language_length = (int(l_d[1]) if len(l_d) == 2 else t_length)
            # l: list. Each token is assigned one language vector: [0,0.75,0.25,...,0]
            l += [language] * language_length
            t_length -= language_length
        l = torch.FloatTensor([l])
    else:
        l = None

    # s: [int], a vector with a single element (the speaker index)
    s = torch.LongTensor([hp.unique_speakers.index(item[2])
                          ]) if hp.multi_speaker else None

    if torch.cuda.is_available() and not force_cpu:
        t = t.cuda(non_blocking=True)
        if l is not None: l = l.cuda(non_blocking=True)
        if s is not None: s = s.cuda(non_blocking=True)

    # s: vector with a single speaker_id element
    # l: vector with language_number * token_num elements
    s = model.inference(t, speaker=s, language=l).cpu().detach().numpy()
    s = audio.denormalize_spectrogram(s, not hp.predict_linear)

    return s
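
# Illustration: a minimal, self-contained sketch of the item[3] language-weight
# format parsed above. The language list and the example strings are assumptions
# made for the sake of the example, not values taken from the project's hparams.
languages = ['en', 'de', 'fr']        # hypothetical stand-in for hp.languages
spec = 'en-6,de*0.75:fr*0.25-4,en'    # hypothetical item[3]
text_length = 12                      # stands in for len(clean_text) + 1

weights, remaining = [], text_length
for token in spec.split(','):
    parts = token.split('-')
    vector = [0.0] * len(languages)
    for lang_weight in parts[0].split(':'):
        name, *w = lang_weight.split('*')
        vector[languages.index(name)] = float(w[0]) if w else 1.0
    length = int(parts[1]) if len(parts) == 2 else remaining
    weights += [vector] * length
    remaining -= length

print(len(weights))   # 12 -> one weight vector per input position
print(weights[6])     # [0.0, 0.75, 0.25] -> inside the mixed de/fr segment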
Example 2
def synthesize(model, input_data, force_cpu=False):

    item = input_data.split('|')
    print(item)
    clean_text = item[1]

    if not hp.use_punctuation:
        clean_text = text.remove_punctuation(clean_text)
    if not hp.case_sensitive:
        clean_text = text.to_lower(clean_text)
    if hp.remove_multiple_wspaces:
        clean_text = text.remove_odd_whitespaces(clean_text)

    t = torch.LongTensor(
        text.to_sequence(clean_text, use_phonemes=hp.use_phonemes))

    if hp.multi_language:
        l_tokens = item[3].split(',')
        t_length = len(clean_text) + 1
        l = []
        for token in l_tokens:
            l_d = token.split('-')

            language = [0] * hp.language_number
            for l_cw in l_d[0].split(':'):
                l_cw_s = l_cw.split('*')
                language[hp.languages.index(
                    l_cw_s[0])] = 1 if len(l_cw_s) == 1 else float(l_cw_s[1])

            language_length = (int(l_d[1]) if len(l_d) == 2 else t_length)
            l += [language] * language_length
            t_length -= language_length
        l = torch.FloatTensor([l])
    else:
        l = None

    s = torch.LongTensor([hp.unique_speakers.index(item[2])
                          ]) if hp.multi_speaker else None

    if torch.cuda.is_available() and not force_cpu:
        t = t.cuda(non_blocking=True)
        if l is not None: l = l.cuda(non_blocking=True)
        if s is not None: s = s.cuda(non_blocking=True)

    s = model.inference(t, speaker=s, language=l).cpu().detach().numpy()
    s = audio.denormalize_spectrogram(s, not hp.predict_linear)

    return s
Example 3
def evaluate(epoch, data, model, criterion):
    """Main evaluation procedure.
    
    Arguments:
        epoch -- current epoch 
        data -- DataLoader which can provide validation batches
        model -- model to be evaluated
        criterion -- instance of loss function to measure performance
    """

    model.eval()

    # initialize counters, etc.
    mcd, mcd_count = 0, 0
    cla, cla_count = 0, 0
    eval_losses = {}

    # loop through epoch batches
    with torch.no_grad():
        for i, batch in enumerate(data):

            # parse batch
            batch = list(map(to_gpu, batch))
            src, src_len, trg_mel, trg_lin, trg_len, stop_trg, spkrs, langs = batch

            # run the model (twice, with and without teacher forcing)
            post_pred, pre_pred, stop_pred, alignment, spkrs_pred, enc_output = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 1.0)
            post_pred_0, _, stop_pred_0, alignment_0, _, _ = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 0.0)
            stop_pred_probs = torch.sigmoid(stop_pred_0)

            # evaluate loss function
            post_trg = trg_lin if hp.predict_linear else trg_mel
            classifier = model._reversal_classifier if hp.reversal_classifier else None
            loss, batch_losses = criterion(src_len, trg_len, pre_pred, trg_mel,
                                           post_pred, post_trg, stop_pred,
                                           stop_trg, alignment, spkrs,
                                           spkrs_pred, enc_output, classifier)

            # compute mel cepstral distortion (MCD)
            for j, (gen, ref, stop) in enumerate(
                    zip(post_pred_0, trg_mel, stop_pred_probs)):
                stop_idxes = np.where(stop.cpu().numpy() > 0.5)[0]
                stop_idx = min(
                    np.min(stop_idxes) + hp.stop_frames,
                    gen.size()[1]) if len(stop_idxes) > 0 else gen.size()[1]
                gen = gen[:, :stop_idx].data.cpu().numpy()
                ref = ref[:, :trg_len[j]].data.cpu().numpy()
                if hp.normalize_spectrogram:
                    gen = audio.denormalize_spectrogram(
                        gen, not hp.predict_linear)
                    ref = audio.denormalize_spectrogram(ref, True)
                if hp.predict_linear: gen = audio.linear_to_mel(gen)
                mcd = (mcd_count * mcd + audio.mel_cepstral_distorision(
                    gen, ref, 'dtw')) / (mcd_count + 1)
                mcd_count += 1

            # compute adversarial classifier accuracy
            if hp.reversal_classifier:
                input_mask = lengths_to_mask(src_len)
                trg_spkrs = torch.zeros_like(input_mask, dtype=torch.int64)
                for s in range(hp.speaker_number):
                    speaker_mask = (spkrs == s)
                    trg_spkrs[speaker_mask] = s
                matches = (trg_spkrs == torch.argmax(
                    torch.nn.functional.softmax(spkrs_pred, dim=-1), dim=-1))
                matches[~input_mask] = False
                cla = (cla_count * cla + torch.sum(matches).item() /
                       torch.sum(input_mask).item()) / (cla_count + 1)
                cla_count += 1

            # add batch losses to epoch losses
            for k, v in batch_losses.items():
                eval_losses[k] = v + eval_losses[k] if k in eval_losses else v

    # normalize loss per batch
    for k in eval_losses.keys():
        eval_losses[k] /= len(data)

    # log evaluation
    Logger.evaluation(epoch + 1, eval_losses, mcd, src_len, trg_len, src,
                      post_trg, post_pred, post_pred_0, stop_pred_probs,
                      stop_trg, alignment_0, cla)

    return sum(eval_losses.values())
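
# Illustration: a toy run of the stop-token truncation used in the MCD loop
# above. The probabilities and the stop_frames value are made up; stop_frames
# stands in for hp.stop_frames.
import numpy as np

stop_probs = np.array([0.01, 0.02, 0.1, 0.6, 0.9, 0.95])  # per-frame stop probabilities
stop_frames = 5
total_frames = len(stop_probs)

stop_idxes = np.where(stop_probs > 0.5)[0]                 # first index above 0.5 is 3
stop_idx = (min(np.min(stop_idxes) + stop_frames, total_frames)
            if len(stop_idxes) > 0 else total_frames)
print(stop_idx)  # 6 -> the generated spectrogram is kept up to min(3 + 5, 6) frames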
Example 4
    def evaluation(eval_step, losses, mcd, source_len, target_len, source,
                   target, prediction_forced, prediction, stop_prediction,
                   stop_target, alignment, classifier):
        """Log evaluation results.
        
        Arguments:
            eval_step -- number of the current evaluation step (i.e. epoch)
            losses (dictionary of {loss name, value})-- dictionary with values of batch losses
            mcd (float) -- evaluation Mel Cepstral Distortion
            source_len (tensor) -- number of characters of input utterances
            target_len (tensor) -- number of frames of ground-truth spectrograms
            source (tensor) -- input utterances
            target (tensor) -- ground-truth spectrograms
            prediction_forced (tensor) -- ground-truth-aligned spectrograms
            prediction (tensor) -- predicted spectrograms
            stop_prediction (tensor) -- predicted stop token probabilities
            stop_target (tensor) -- true stop token probabilities
            alignment (tensor) -- alignments (attention weights for each frame) of the last evaluation batch
            classifier (float) -- accuracy of the reversal classifier
        """

        # log losses
        total_loss = sum(losses.values())
        Logger._sw.add_scalar(f'Eval/loss_total', total_loss, eval_step)
        for n, l in losses.items():
            Logger._sw.add_scalar(f'Eval/loss_{n}', l, eval_step)

        # show random sample: spectrogram, stop token probability, alignment and audio
        idx = random.randint(0, alignment.size(0) - 1)
        predicted_spec = prediction[
            idx, :, :target_len[idx]].data.cpu().numpy()
        f_predicted_spec = prediction_forced[
            idx, :, :target_len[idx]].data.cpu().numpy()
        target_spec = target[idx, :, :target_len[idx]].data.cpu().numpy()

        # log spectrograms
        if hp.normalize_spectrogram:
            predicted_spec = audio.denormalize_spectrogram(
                predicted_spec, not hp.predict_linear)
            f_predicted_spec = audio.denormalize_spectrogram(
                f_predicted_spec, not hp.predict_linear)
            target_spec = audio.denormalize_spectrogram(
                target_spec, not hp.predict_linear)
        Logger._sw.add_figure(f"Predicted/generated",
                              Logger._plot_spectrogram(predicted_spec),
                              eval_step)
        Logger._sw.add_figure(f"Predicted/forced",
                              Logger._plot_spectrogram(f_predicted_spec),
                              eval_step)
        Logger._sw.add_figure(f"Target/eval",
                              Logger._plot_spectrogram(target_spec), eval_step)

        # log audio
        waveform = audio.inverse_spectrogram(predicted_spec,
                                             not hp.predict_linear)
        Logger._sw.add_audio(f"Audio/generated",
                             waveform,
                             eval_step,
                             sample_rate=hp.sample_rate)
        waveform = audio.inverse_spectrogram(f_predicted_spec,
                                             not hp.predict_linear)
        Logger._sw.add_audio(f"Audio/forced",
                             waveform,
                             eval_step,
                             sample_rate=hp.sample_rate)

        # log alignment
        alignment = alignment[
            idx, :target_len[idx], :source_len[idx]].data.cpu().numpy().T
        Logger._sw.add_figure(f"Alignment/eval",
                              Logger._plot_alignment(alignment), eval_step)

        # log source text
        utterance = text.to_text(
            source[idx].data.cpu().numpy()[:source_len[idx]], hp.use_phonemes)
        Logger._sw.add_text(f"Text/eval", utterance, eval_step)

        # log stop tokens
        Logger._sw.add_figure(
            f"Stop/eval",
            Logger._plot_stop_tokens(stop_target[idx].data.cpu().numpy(),
                                     stop_prediction[idx].data.cpu().numpy()),
            eval_step)

        # log mel cepstral distortion
        Logger._sw.add_scalar(f'Eval/mcd', mcd, eval_step)

        # log reversal language classifier accuracy
        if hp.reversal_classifier:
            Logger._sw.add_scalar(f'Eval/classifier', classifier, eval_step)
Example 5
        data = DataLoader(dataset.train,
                          batch_size=args.batch_size,
                          drop_last=False,
                          shuffle=False,
                          collate_fn=TextToSpeechCollate(True),
                          num_workers=args.loader_workers)

    with torch.no_grad():
        serial_number = 0
        for i, batch in enumerate(data):

            batch = list(map(to_gpu, batch))
            src, src_len, trg_mel, _, trg_len, _, spkrs, langs = batch

            # Run the model with teacher forcing enabled (ratio 1.0)
            predictions = model(src, src_len, trg_mel, trg_len, spkrs, langs,
                                1.0)
            prediction = predictions[0].data.cpu().numpy()

            for idx in range(len(prediction)):
                speaker = spkrs[idx] if spkrs is not None else 0
                mel = prediction[idx, :, :trg_len[idx]]
                if hp.normalize_spectrogram:
                    mel = audio.denormalize_spectrogram(
                        mel, not hp.predict_linear)
                np.save(os.path.join(output_dir,
                                     f'{serial_number:05}-{speaker}.npy'),
                        mel,
                        allow_pickle=False)
                serial_number += 1
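
# Illustration: the loop above saves one teacher-forced mel spectrogram per
# utterance as '{serial_number:05}-{speaker}.npy'. A hypothetical way to load
# one back for inspection (the path below is illustrative only):
import numpy as np
mel = np.load('output_dir/00000-0.npy')
print(mel.shape)  # (mel channels, frames up to trg_len for that utterance)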