def synthesize(model, input_data, force_cpu=False):
    item = input_data.split('|')
    clean_text = item[1]

    if not hp.use_punctuation:
        clean_text = text.remove_punctuation(clean_text)
    if not hp.case_sensitive:
        clean_text = text.to_lower(clean_text)
    if hp.remove_multiple_wspaces:
        clean_text = text.remove_odd_whitespaces(clean_text)

    t = torch.LongTensor(
        text.to_sequence(clean_text, use_phonemes=hp.use_phonemes))

    if hp.multi_language:
        # item[3]: l1-(len1),l2*0.75:l3*0.25-(len2),l1
        # l_tokens: list, [l1-(len1), l2*0.75:l3*0.25-(len2), l1]
        l_tokens = item[3].split(',')
        t_length = len(clean_text) + 1
        # l collects one language-weight vector of length hp.language_number
        # per input token, i.e. language_number * token_num values in total.
        l = []
        for token in l_tokens:
            # l_d: [l2*0.75:l3*0.25, (len2)]
            l_d = token.split('-')
            # language: [0, 0, ..., 0]
            language = [0] * hp.language_number
            for l_cw in l_d[0].split(':'):
                # l_cw: l2*0.75 or l3*0.25
                # l_cw_s: list, [l2, 0.75]
                l_cw_s = l_cw.split('*')
                language[hp.languages.index(
                    l_cw_s[0])] = 1 if len(l_cw_s) == 1 else float(l_cw_s[1])
            # language: [0, 0.75, 0.25, ..., 0]
            # language_length: int, (len2) -- how many tokens this language
            # mixture covers, or all of the remaining tokens by default
            language_length = (int(l_d[1]) if len(l_d) == 2 else t_length)
            # l: one language vector [0, 0.75, 0.25, ..., 0] per token
            l += [language] * language_length
            t_length -= language_length
        l = torch.FloatTensor([l])
    else:
        l = None

    # s: single-element vector holding the speaker index
    s = torch.LongTensor([hp.unique_speakers.index(item[2])]) if hp.multi_speaker else None

    if torch.cuda.is_available() and not force_cpu:
        t = t.cuda(non_blocking=True)
        if l is not None:
            l = l.cuda(non_blocking=True)
        if s is not None:
            s = s.cuda(non_blocking=True)

    # s: vector with a single speaker_id element
    # l: language_number * token_num language weights
    s = model.inference(t, speaker=s, language=l).cpu().detach().numpy()
    s = audio.denormalize_spectrogram(s, not hp.predict_linear)

    return s
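# The per-token language weighting above is easiest to see on a concrete
# input. The following self-contained sketch parses a spec such as
# "de-12,en*0.75:fr*0.25-8,de" into one weight vector per character,
# mirroring the loop in synthesize(). The language list, spec string and
# text length are made up for illustration; they are not taken from the
# project's hyper-parameters.

example_languages = ['de', 'en', 'fr']        # stands in for hp.languages
example_language_number = len(example_languages)
example_spec = 'de-12,en*0.75:fr*0.25-8,de'   # same syntax as item[3] above
example_text_length = 25 + 1                  # len(clean_text) + 1, as above

example_weights = []
remaining = example_text_length
for token in example_spec.split(','):
    parts = token.split('-')
    vector = [0.0] * example_language_number
    for chunk in parts[0].split(':'):
        name_weight = chunk.split('*')
        weight = 1.0 if len(name_weight) == 1 else float(name_weight[1])
        vector[example_languages.index(name_weight[0])] = weight
    covered = int(parts[1]) if len(parts) == 2 else remaining
    example_weights += [vector] * covered
    remaining -= covered

# 12 x [1, 0, 0], then 8 x [0, 0.75, 0.25], then 6 x [1, 0, 0]
print(len(example_weights), example_weights[0], example_weights[12])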
def evaluate(epoch, data, model, criterion):
    """Main evaluation procedure.

    Arguments:
        epoch -- current epoch
        data -- DataLoader which can provide validation batches
        model -- model to be evaluated
        criterion -- instance of loss function to measure performance
    """
    model.eval()

    # initialize counters, etc.
    mcd, mcd_count = 0, 0
    cla, cla_count = 0, 0
    eval_losses = {}

    # loop through epoch batches
    with torch.no_grad():
        for i, batch in enumerate(data):
            # parse batch
            batch = list(map(to_gpu, batch))
            src, src_len, trg_mel, trg_lin, trg_len, stop_trg, spkrs, langs = batch

            # run the model (twice, with and without teacher forcing)
            post_pred, pre_pred, stop_pred, alignment, spkrs_pred, enc_output = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 1.0)
            post_pred_0, _, stop_pred_0, alignment_0, _, _ = model(
                src, src_len, trg_mel, trg_len, spkrs, langs, 0.0)
            stop_pred_probs = torch.sigmoid(stop_pred_0)

            # evaluate loss function
            post_trg = trg_lin if hp.predict_linear else trg_mel
            classifier = model._reversal_classifier if hp.reversal_classifier else None
            loss, batch_losses = criterion(src_len, trg_len, pre_pred, trg_mel,
                                           post_pred, post_trg, stop_pred,
                                           stop_trg, alignment, spkrs,
                                           spkrs_pred, enc_output, classifier)

            # compute mel cepstral distortion
            for j, (gen, ref, stop) in enumerate(
                    zip(post_pred_0, trg_mel, stop_pred_probs)):
                stop_idxes = np.where(stop.cpu().numpy() > 0.5)[0]
                stop_idx = min(
                    np.min(stop_idxes) + hp.stop_frames,
                    gen.size()[1]) if len(stop_idxes) > 0 else gen.size()[1]
                gen = gen[:, :stop_idx].data.cpu().numpy()
                ref = ref[:, :trg_len[j]].data.cpu().numpy()
                if hp.normalize_spectrogram:
                    gen = audio.denormalize_spectrogram(
                        gen, not hp.predict_linear)
                    ref = audio.denormalize_spectrogram(ref, True)
                if hp.predict_linear:
                    gen = audio.linear_to_mel(gen)
                mcd = (mcd_count * mcd + audio.mel_cepstral_distorision(
                    gen, ref, 'dtw')) / (mcd_count + 1)
                mcd_count += 1

            # compute adversarial classifier accuracy
            if hp.reversal_classifier:
                input_mask = lengths_to_mask(src_len)
                trg_spkrs = torch.zeros_like(input_mask, dtype=torch.int64)
                for s in range(hp.speaker_number):
                    speaker_mask = (spkrs == s)
                    trg_spkrs[speaker_mask] = s
                matches = (trg_spkrs == torch.argmax(
                    torch.nn.functional.softmax(spkrs_pred, dim=-1), dim=-1))
                matches[~input_mask] = False
                cla = (cla_count * cla + torch.sum(matches).item() /
                       torch.sum(input_mask).item()) / (cla_count + 1)
                cla_count += 1

            # add batch losses to epoch losses
            for k, v in batch_losses.items():
                eval_losses[k] = (v + eval_losses[k]) if k in eval_losses else v

    # normalize loss per batch
    for k in eval_losses.keys():
        eval_losses[k] /= len(data)

    # log evaluation
    Logger.evaluation(epoch + 1, eval_losses, mcd, src_len, trg_len, src,
                      post_trg, post_pred, post_pred_0, stop_pred_probs,
                      stop_trg, alignment_0, cla)

    return sum(eval_losses.values())
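# lengths_to_mask() is called above but defined elsewhere in the project.
# A minimal sketch of what such a helper could look like (an assumption
# about its behaviour, not the project's actual implementation; torch is
# assumed to be imported as in the surrounding code):

def lengths_to_mask_sketch(lengths, max_length=None):
    """Turn a 1-D tensor of sequence lengths into a boolean mask of shape (batch, max_length)."""
    max_length = max_length or int(torch.max(lengths).item())
    return torch.arange(max_length, device=lengths.device)[None, :] < lengths[:, None]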
def evaluation(eval_step, losses, mcd, source_len, target_len, source, target,
               prediction_forced, prediction, stop_prediction, stop_target,
               alignment, classifier):
    """Log evaluation results.

    Arguments:
        eval_step -- number of the current evaluation step (i.e. epoch)
        losses (dictionary of {loss name, value}) -- dictionary with values of batch losses
        mcd (float) -- evaluation mel cepstral distortion
        source_len (tensor) -- number of characters of input utterances
        target_len (tensor) -- number of frames of ground-truth spectrograms
        source (tensor) -- input utterances
        target (tensor) -- ground-truth spectrograms
        prediction_forced (tensor) -- ground-truth-aligned spectrograms
        prediction (tensor) -- predicted spectrograms
        stop_prediction (tensor) -- predicted stop token probabilities
        stop_target (tensor) -- true stop token probabilities
        alignment (tensor) -- alignments (attention weights for each frame) of the last evaluation batch
        classifier (float) -- accuracy of the reversal classifier
    """

    # log losses
    total_loss = sum(losses.values())
    Logger._sw.add_scalar(f'Eval/loss_total', total_loss, eval_step)
    for n, l in losses.items():
        Logger._sw.add_scalar(f'Eval/loss_{n}', l, eval_step)

    # show a random sample: spectrogram, stop token probability, alignment and audio
    idx = random.randint(0, alignment.size(0) - 1)
    predicted_spec = prediction[idx, :, :target_len[idx]].data.cpu().numpy()
    f_predicted_spec = prediction_forced[idx, :, :target_len[idx]].data.cpu().numpy()
    target_spec = target[idx, :, :target_len[idx]].data.cpu().numpy()

    # log spectrograms
    if hp.normalize_spectrogram:
        predicted_spec = audio.denormalize_spectrogram(
            predicted_spec, not hp.predict_linear)
        f_predicted_spec = audio.denormalize_spectrogram(
            f_predicted_spec, not hp.predict_linear)
        target_spec = audio.denormalize_spectrogram(
            target_spec, not hp.predict_linear)
    Logger._sw.add_figure(f"Predicted/generated",
                          Logger._plot_spectrogram(predicted_spec), eval_step)
    Logger._sw.add_figure(f"Predicted/forced",
                          Logger._plot_spectrogram(f_predicted_spec), eval_step)
    Logger._sw.add_figure(f"Target/eval",
                          Logger._plot_spectrogram(target_spec), eval_step)

    # log audio
    waveform = audio.inverse_spectrogram(predicted_spec, not hp.predict_linear)
    Logger._sw.add_audio(f"Audio/generated", waveform, eval_step,
                         sample_rate=hp.sample_rate)
    waveform = audio.inverse_spectrogram(f_predicted_spec, not hp.predict_linear)
    Logger._sw.add_audio(f"Audio/forced", waveform, eval_step,
                         sample_rate=hp.sample_rate)

    # log alignment
    alignment = alignment[idx, :target_len[idx], :source_len[idx]].data.cpu().numpy().T
    Logger._sw.add_figure(f"Alignment/eval", Logger._plot_alignment(alignment),
                          eval_step)

    # log source text
    utterance = text.to_text(
        source[idx].data.cpu().numpy()[:source_len[idx]], hp.use_phonemes)
    Logger._sw.add_text(f"Text/eval", utterance, eval_step)

    # log stop tokens
    Logger._sw.add_figure(
        f"Stop/eval",
        Logger._plot_stop_tokens(stop_target[idx].data.cpu().numpy(),
                                 stop_prediction[idx].data.cpu().numpy()),
        eval_step)

    # log mel cepstral distortion
    Logger._sw.add_scalar(f'Eval/mcd', mcd, eval_step)

    # log reversal language classifier accuracy
    if hp.reversal_classifier:
        Logger._sw.add_scalar(f'Eval/classifier', classifier, eval_step)
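# The add_scalar / add_figure / add_audio / add_text calls above match the
# torch.utils.tensorboard.SummaryWriter API, so Logger._sw is presumably a
# SummaryWriter instance. A hypothetical initialization (the log directory
# is made up for illustration; the project wires this up elsewhere):

from torch.utils.tensorboard import SummaryWriter

Logger._sw = SummaryWriter(log_dir='logs/eval')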
data = DataLoader(dataset.train,
                  batch_size=args.batch_size,
                  drop_last=False,
                  shuffle=False,
                  collate_fn=TextToSpeechCollate(True),
                  num_workers=args.loader_workers)

with torch.no_grad():
    serial_number = 0
    for i, batch in enumerate(data):
        batch = list(map(to_gpu, batch))
        src, src_len, trg_mel, _, trg_len, _, spkrs, langs = batch

        # run the model with teacher forcing enabled (ratio 1.0)
        predictions = model(src, src_len, trg_mel, trg_len, spkrs, langs, 1.0)
        prediction = predictions[0].data.cpu().numpy()

        for idx in range(len(prediction)):
            speaker = spkrs[idx] if spkrs is not None else 0
            mel = prediction[idx, :, :trg_len[idx]]
            if hp.normalize_spectrogram:
                mel = audio.denormalize_spectrogram(
                    mel, not hp.predict_linear)
            np.save(os.path.join(output_dir,
                                 f'{serial_number:05}-{speaker}.npy'),
                    mel, allow_pickle=False)
            serial_number += 1
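# A quick sanity check of the extracted spectrograms (the file name below is
# illustrative; it depends on the serial number and speaker id actually
# written above, and np/os are assumed to be imported as in the surrounding
# code): each .npy file holds one denormalized spectrogram of shape
# (frequency channels, frames).

check_mel = np.load(os.path.join(output_dir, '00000-0.npy'))
print(check_mel.shape)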