Ejemplo n.º 1
0
 def parse_batch(self, batch):
     """Unpack a five-element batch and pass every entry through ``to_gpu``.

     ``batch`` must unpack into exactly ``audio``, ``attention_contexts``,
     ``encoder_outputs``, ``text_lengths`` and ``durations``, in that order.
     The first three are cast to float and the last two to long.

     ``to_gpu`` is defined outside this block; presumably it moves the
     tensor to the CUDA device -- confirm against its definition.

     Returns:
         A 5-tuple ``(audio, attention_contexts, encoder_outputs,
         text_lengths, durations)`` holding the converted entries.
     """
     audio, attention_contexts, encoder_outputs, text_lengths, durations = batch

     # Entries are grouped by target dtype; the original left-to-right
     # transfer order is preserved.
     float_entries = tuple(
         to_gpu(entry).float()
         for entry in (audio, attention_contexts, encoder_outputs)
     )
     long_entries = tuple(
         to_gpu(entry).long() for entry in (text_lengths, durations)
     )
     return float_entries + long_entries
Ejemplo n.º 2
0
def calculate_global_mean(data_loader, global_mean_npy, hparams):
    """Return the global mel mean, loading it from or saving it to a cache.

    If ``global_mean_npy`` is truthy and that path exists, the array stored
    there is loaded and returned without touching ``data_loader``.
    Otherwise the mean is estimated from the loader: for each batch,
    ``batch['gt_mel']`` is summed over dims 0 and 2 and
    ``batch['mel_lengths']`` is summed entirely, both in double precision,
    and the mean is the ratio of the two totals.  Iteration stops after the
    batch with index 101, so at most 102 batches are consumed.

    Args:
        data_loader: Sized iterable of dict batches with ``'gt_mel'`` and
            ``'mel_lengths'`` tensors.  Assumes ``gt_mel`` is laid out as
            (batch, channel, time) -- TODO confirm against the collate fn.
        global_mean_npy: Cache path for the ``.npy`` file, or a falsy value
            to disable caching.
        hparams: Object whose ``fp16_run`` attribute selects half (truthy)
            or float (falsy) precision for the returned tensor.

    Returns:
        The mean tensor after the precision cast and ``to_gpu``.
    """

    def _cast_and_transfer(tensor):
        # The precision cast happens before the tensor is handed to to_gpu.
        return to_gpu(tensor.half() if hparams.fp16_run else tensor.float())

    if global_mean_npy and os.path.exists(global_mean_npy):
        return _cast_and_transfer(torch.tensor(np.load(global_mean_npy)))

    # Running totals start from the int 0, the same start value sum() uses.
    mel_total = 0
    frame_total = 0
    print('calculating global mean...')
    progress = tqdm(enumerate(data_loader), total=len(data_loader), smoothing=0.001)
    for index, batch in progress:
        # padded values are 0.
        mel_total = mel_total + batch['gt_mel'].double().sum(dim=(0, 2))
        frame_total = frame_total + batch['mel_lengths'].double().sum()
        if index > 100:
            break

    global_mean = _cast_and_transfer(mel_total / frame_total)
    if global_mean_npy:
        # The cache stores the tensor in the precision it was just cast to.
        np.save(global_mean_npy, global_mean.cpu().numpy())
    return global_mean
Ejemplo n.º 3
0
def calculate_global_mean(data_loader, global_mean_npy, hparams):
    """Compute the global mel mean over the whole loader, with a disk cache.

    If ``global_mean_npy`` is truthy and that file exists, its contents are
    loaded, cast and returned immediately.  Otherwise every batch of
    ``data_loader`` is visited: the padded mel tensor is summed over dims 0
    and 2 and the output lengths are summed, both in double precision.  The
    mean is the total of the mel sums divided by the total of the lengths.

    Args:
        data_loader: Sized iterable whose batches unpack into exactly eight
            items; only the third (padded mels) and fifth (output lengths)
            are used.  Assumes the mels are (batch, channel, time) --
            TODO confirm against the collate function.
        global_mean_npy: Path of the ``.npy`` cache, or a falsy value for
            no caching.
        hparams: Object exposing ``fp16_run``; truthy selects half
            precision, falsy selects float.

    Returns:
        The mean tensor after the precision cast and ``to_gpu``.
    """
    has_cache = bool(global_mean_npy) and os.path.exists(global_mean_npy)
    if has_cache:
        cached = torch.tensor(np.load(global_mean_npy))
        cached = cached.half() if hparams.fp16_run else cached.float()
        return to_gpu(cached)

    print('calculating global mean...')
    mel_sums, frame_counts = [], []
    for batch in tqdm(data_loader, total=len(data_loader), smoothing=0.001):
        # Only two fields are needed; the unpack still requires 8 items.
        _, _, mel_padded, _, output_lengths, _, _, _ = batch
        # padded values are 0.
        mel_sums.append(mel_padded.double().sum(dim=(0, 2)))
        frame_counts.append(output_lengths.double().sum())

    global_mean = sum(mel_sums) / sum(frame_counts)
    if hparams.fp16_run:
        global_mean = to_gpu(global_mean.half())
    else:
        global_mean = to_gpu(global_mean.float())

    if global_mean_npy:
        # The cache stores the tensor in the precision it was just cast to.
        np.save(global_mean_npy, global_mean.cpu().numpy())
    return global_mean
Ejemplo n.º 4
0
 def parse_batch(self, batch):
     """Convert an eight-element batch and regroup it into two tuples.

     ``batch`` must unpack into ``text_padded``, ``text_lengths``,
     ``mel_padded``, ``gate_padded``, ``output_lengths``, ``speaker_ids``,
     ``torchmoji_hidden`` and ``preserve_decoder_states``.  Every tensor is
     passed through ``to_gpu`` (defined elsewhere) and cast: the text,
     length and speaker tensors to long, the rest to float.  The last two
     entries may be ``None`` and are then forwarded unchanged.

     Returns:
         A pair of tuples.  The first holds ``(text_padded, text_lengths,
         mel_padded, max_len, output_lengths, speaker_ids,
         torchmoji_hidden, preserve_decoder_states)``, where ``max_len`` is
         the largest value in ``text_lengths`` as a Python number.  The
         second holds ``(mel_padded, gate_padded, output_lengths,
         text_lengths)``.  Presumably these are the model inputs and the
         loss targets -- confirm against the caller.
     """
     (text_padded, text_lengths, mel_padded, gate_padded, output_lengths,
      speaker_ids, torchmoji_hidden, preserve_decoder_states) = batch

     text_padded, text_lengths, output_lengths = (
         to_gpu(entry).long()
         for entry in (text_padded, text_lengths, output_lengths)
     )
     speaker_ids = to_gpu(speaker_ids.data).long()
     mel_padded = to_gpu(mel_padded).float()
     gate_padded = to_gpu(gate_padded).float()  # used by loss func
     max_len = torch.max(text_lengths.data).item()  # used by loss func

     # Optional tensors: None is passed through without conversion.
     torchmoji_hidden = (
         None if torchmoji_hidden is None
         else to_gpu(torchmoji_hidden).float()
     )
     preserve_decoder_states = (
         None if preserve_decoder_states is None
         else to_gpu(preserve_decoder_states).float()
     )

     first_group = (text_padded, text_lengths, mel_padded, max_len,
                    output_lengths, speaker_ids, torchmoji_hidden,
                    preserve_decoder_states)
     second_group = (mel_padded, gate_padded, output_lengths, text_lengths)
     return (first_group, second_group)
Ejemplo n.º 5
0
    def parse_batch(self, batch):
        """Convert a fifteen-element batch and regroup it into two tuples.

        ``batch`` must unpack into ``text_padded``, ``mel_padded``,
        ``speaker_ids``, ``text_lengths``, ``output_lengths``,
        ``alignments``, ``torchmoji_hidden``, ``perc_loudness``, ``f0``,
        ``energy``, ``sylps``, ``voiced_mask``, ``char_f0``,
        ``char_voiced`` and ``char_energy``.  Every tensor is passed
        through ``to_gpu`` (defined elsewhere) and cast: text, speaker and
        length tensors to long, ``voiced_mask`` to bool, everything else
        to float.  ``torchmoji_hidden`` may be ``None`` and is then left
        untouched.

        Returns:
            A pair of tuples.  The first holds all fifteen converted
            entries in input order.  The second holds ``mel_padded``,
            ``text_lengths`` and ``output_lengths`` followed by the eight
            entries from ``perc_loudness`` through ``char_energy``.
            Presumably these are the model inputs and the loss targets --
            confirm against the caller.
        """
        (text_padded, mel_padded, speaker_ids, text_lengths, output_lengths,
         alignments, torchmoji_hidden, perc_loudness, f0, energy, sylps,
         voiced_mask, char_f0, char_voiced, char_energy) = batch

        def as_long(tensor):
            return to_gpu(tensor).long()

        def as_float(tensor):
            return to_gpu(tensor).float()

        text_padded = as_long(text_padded)
        mel_padded = as_float(mel_padded)
        speaker_ids = as_long(speaker_ids.data)
        text_lengths, output_lengths = as_long(text_lengths), as_long(output_lengths)
        alignments = as_float(alignments)
        if torchmoji_hidden is not None:
            torchmoji_hidden = as_float(torchmoji_hidden)
        perc_loudness, f0, energy, sylps = (
            as_float(entry) for entry in (perc_loudness, f0, energy, sylps)
        )
        voiced_mask = to_gpu(voiced_mask).bool()
        char_f0, char_voiced, char_energy = (
            as_float(entry) for entry in (char_f0, char_voiced, char_energy)
        )

        # These eight entries close both returned tuples in the same order.
        shared_tail = (perc_loudness, f0, energy, sylps, voiced_mask,
                       char_f0, char_voiced, char_energy)
        first_group = (text_padded, mel_padded, speaker_ids, text_lengths,
                       output_lengths, alignments, torchmoji_hidden) + shared_tail
        second_group = (mel_padded, text_lengths, output_lengths) + shared_tail
        return (first_group, second_group)