def parse_batch(self, batch):
    """Unpack a five-field batch, pass each field through ``to_gpu`` and cast it.

    ``batch`` must unpack into exactly
    ``(audio, attention_contexts, encoder_outputs, text_lengths, durations)``.
    The first three fields are cast to float, the last two to long.

    Returns:
        The five converted tensors as a tuple, in the same order as the input.
    """
    audio, attention_contexts, encoder_outputs, text_lengths, durations = batch

    # Continuous-valued fields become float tensors.
    audio, attention_contexts, encoder_outputs = (
        to_gpu(field).float()
        for field in (audio, attention_contexts, encoder_outputs)
    )
    # Length / duration fields become long (int64) tensors.
    text_lengths, durations = (
        to_gpu(field).long() for field in (text_lengths, durations)
    )

    return (audio, attention_contexts, encoder_outputs, text_lengths, durations)
def calculate_global_mean(data_loader, global_mean_npy, hparams):
    """Return the global mel mean, loading it from or saving it to a ``.npy`` cache.

    If ``global_mean_npy`` is set and the file exists, the cached array is
    loaded and returned without touching ``data_loader``. Otherwise the mean is
    estimated from at most the first 102 batches of ``data_loader``.

    Args:
        data_loader: Iterable with ``len()`` yielding dict batches that contain
            ``'gt_mel'`` and ``'mel_lengths'`` tensors. ``'gt_mel'`` is summed
            over dims 0 and 2 (presumably (batch, n_mel, time) - confirm
            against the collate function).
        global_mean_npy: Path of the cache file; a falsy value disables caching.
        hparams: Object exposing a boolean ``fp16_run`` attribute.

    Returns:
        The mean tensor passed through ``to_gpu``, as half precision when
        ``hparams.fp16_run`` is true and float32 otherwise.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    def _to_run_precision(tensor):
        # Cast to the precision the run uses before handing over to to_gpu.
        if hparams.fp16_run:
            return to_gpu(tensor.half())
        return to_gpu(tensor.float())

    if global_mean_npy and os.path.exists(global_mean_npy):
        return _to_run_precision(torch.tensor(np.load(global_mean_npy)))

    # The loop below breaks once the index exceeds 100, i.e. after 102 batches.
    max_batches = 102

    sums = []
    frames = []
    print('calculating global mean...')
    # Report the number of batches that will actually be visited so the
    # progress bar and ETA are not computed against the whole loader.
    progress = tqdm(enumerate(data_loader),
                    total=min(len(data_loader), max_batches),
                    smoothing=0.001)
    for i, batch in progress:
        # padded values are 0, so they do not contribute to the sums.
        sums.append(batch['gt_mel'].double().sum(dim=(0, 2)))
        frames.append(batch['mel_lengths'].double().sum())
        if i > 100:
            break
    # Accumulated in float64; total value sum divided by total frame count.
    global_mean = sum(sums) / sum(frames)

    result = _to_run_precision(global_mean)
    if global_mean_npy:
        # Cache at float32 regardless of fp16_run: saving the half-precision
        # tensor would make later fp32 runs load an fp16-rounded mean.
        np.save(global_mean_npy, global_mean.float().cpu().numpy())
    return result
def calculate_global_mean(data_loader, global_mean_npy, hparams):
    """Return the global mel mean, loading it from or saving it to a ``.npy`` cache.

    If ``global_mean_npy`` is set and the file exists, the cached array is
    loaded and returned without touching ``data_loader``. Otherwise every batch
    of ``data_loader`` is visited once to compute the mean.

    Args:
        data_loader: Iterable with ``len()`` yielding 8-field tuple batches
            ``(text_padded, input_lengths, mel_padded, gate_padded,
            output_lengths, speaker_ids, torchmoji_hidden,
            preserve_decoder_states)``. ``mel_padded`` is summed over dims 0
            and 2 (presumably (batch, n_mel, time) - confirm against the
            collate function).
        global_mean_npy: Path of the cache file; a falsy value disables caching.
        hparams: Object exposing a boolean ``fp16_run`` attribute.

    Returns:
        The mean tensor passed through ``to_gpu``, as half precision when
        ``hparams.fp16_run`` is true and float32 otherwise.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    def _to_run_precision(tensor):
        # Cast to the precision the run uses before handing over to to_gpu.
        if hparams.fp16_run:
            return to_gpu(tensor.half())
        return to_gpu(tensor.float())

    if global_mean_npy and os.path.exists(global_mean_npy):
        return _to_run_precision(torch.tensor(np.load(global_mean_npy)))

    sums = []
    frames = []
    print('calculating global mean...')
    for batch in tqdm(data_loader, total=len(data_loader), smoothing=0.001):
        # Full unpack keeps the arity check on the batch; only the mel
        # tensor and its lengths are needed here.
        (_text_padded, _input_lengths, mel_padded, _gate_padded,
         output_lengths, _speaker_ids, _torchmoji_hidden,
         _preserve_decoder_states) = batch
        # padded values are 0, so they do not contribute to the sums.
        sums.append(mel_padded.double().sum(dim=(0, 2)))
        frames.append(output_lengths.double().sum())
    # Accumulated in float64; total value sum divided by total frame count.
    global_mean = sum(sums) / sum(frames)

    result = _to_run_precision(global_mean)
    if global_mean_npy:
        # Cache at float32 regardless of fp16_run: saving the half-precision
        # tensor would make later fp32 runs load an fp16-rounded mean.
        np.save(global_mean_npy, global_mean.float().cpu().numpy())
    return result
def parse_batch(self, batch):
    """Convert an eight-field batch into a ``(inputs, targets)`` pair of tuples.

    ``batch`` must unpack into exactly
    ``(text_padded, text_lengths, mel_padded, gate_padded, output_lengths,
    speaker_ids, torchmoji_hidden, preserve_decoder_states)``. Every tensor is
    passed through ``to_gpu`` and cast; the two optional fields
    (``torchmoji_hidden``, ``preserve_decoder_states``) are left as ``None``
    when they are ``None``.

    Returns:
        A 2-tuple:
        ``(text_padded, text_lengths, mel_padded, max_len, output_lengths,
        speaker_ids, torchmoji_hidden, preserve_decoder_states)`` and
        ``(mel_padded, gate_padded, output_lengths, text_lengths)``.
    """
    (text_padded, text_lengths, mel_padded, gate_padded, output_lengths,
     speaker_ids, torchmoji_hidden, preserve_decoder_states) = batch

    # Integer-valued fields.
    text_padded, text_lengths, output_lengths = (
        to_gpu(field).long()
        for field in (text_padded, text_lengths, output_lengths)
    )
    speaker_ids = to_gpu(speaker_ids.data).long()

    mel_padded = to_gpu(mel_padded).float()
    # Largest text length in the batch, as a plain Python number.
    max_len = torch.max(text_lengths.data).item()
    # used by loss func (only appears in the second returned tuple)
    gate_padded = to_gpu(gate_padded).float()

    # Optional fields: convert only when present.
    torchmoji_hidden = (
        None if torchmoji_hidden is None else to_gpu(torchmoji_hidden).float()
    )
    preserve_decoder_states = (
        None if preserve_decoder_states is None
        else to_gpu(preserve_decoder_states).float()
    )

    first_group = (text_padded, text_lengths, mel_padded, max_len,
                   output_lengths, speaker_ids, torchmoji_hidden,
                   preserve_decoder_states)
    second_group = (mel_padded, gate_padded, output_lengths, text_lengths)
    return (first_group, second_group)
def parse_batch(self, batch):
    """Convert a fifteen-field batch into a ``(inputs, targets)`` pair of tuples.

    ``batch`` must unpack into exactly
    ``(text_padded, mel_padded, speaker_ids, text_lengths, output_lengths,
    alignments, torchmoji_hidden, perc_loudness, f0, energy, sylps,
    voiced_mask, char_f0, char_voiced, char_energy)``. Every tensor is passed
    through ``to_gpu`` and cast (long, float or bool as listed below);
    ``torchmoji_hidden`` is left as ``None`` when it is ``None``.

    Returns:
        A 2-tuple: the first element holds all fifteen converted fields in
        input order; the second holds
        ``(mel_padded, text_lengths, output_lengths, perc_loudness, f0,
        energy, sylps, voiced_mask, char_f0, char_voiced, char_energy)``.
    """
    (text_padded, mel_padded, speaker_ids, text_lengths, output_lengths,
     alignments, torchmoji_hidden, perc_loudness, f0, energy, sylps,
     voiced_mask, char_f0, char_voiced, char_energy) = batch

    text_padded = to_gpu(text_padded).long()
    mel_padded = to_gpu(mel_padded).float()
    speaker_ids = to_gpu(speaker_ids.data).long()
    text_lengths, output_lengths = (
        to_gpu(field).long() for field in (text_lengths, output_lengths)
    )
    alignments = to_gpu(alignments).float()

    # Optional field: convert only when present.
    torchmoji_hidden = (
        None if torchmoji_hidden is None else to_gpu(torchmoji_hidden).float()
    )

    # Remaining per-utterance / per-frame features are cast to float.
    perc_loudness, f0, energy, sylps = (
        to_gpu(field).float() for field in (perc_loudness, f0, energy, sylps)
    )
    # The voiced mask is the only field cast to bool.
    voiced_mask = to_gpu(voiced_mask).bool()
    # Fields with the char_ prefix are cast to float as well.
    char_f0, char_voiced, char_energy = (
        to_gpu(field).float() for field in (char_f0, char_voiced, char_energy)
    )

    first_group = (text_padded, mel_padded, speaker_ids, text_lengths,
                   output_lengths, alignments, torchmoji_hidden,
                   perc_loudness, f0, energy, sylps, voiced_mask,
                   char_f0, char_voiced, char_energy)
    second_group = (mel_padded, text_lengths, output_lengths, perc_loudness,
                    f0, energy, sylps, voiced_mask, char_f0, char_voiced,
                    char_energy)
    return (first_group, second_group)