import random

import numpy as np
import torch
from PIL import Image

# hp / hparams are the project's hyperparameter modules and audio its signal
# helpers; they are assumed to be importable as in the rest of the repo.


def collate_vocoder(batch):
    # batch items are (mel, quantized_wav) pairs; sample a random window of
    # mel_win mel frames and the voc_seq_len + 1 audio samples it spans
    mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad
    max_offsets = [x[0].shape[-1] - 2 - (mel_win + 2 * hp.voc_pad) for x in batch]
    mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
    sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets]

    mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)]
    # one extra sample so the labels can be split into input and next-step target
    labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)]

    mels = np.stack(mels).astype(np.float32)
    labels = np.stack(labels).astype(np.int64)

    mels = torch.tensor(mels)
    labels = torch.tensor(labels).long()

    x = labels[:, :hp.voc_seq_len]
    y = labels[:, 1:]

    bits = 16 if hp.voc_mode == 'MOL' else hp.bits

    x = audio.label_2_float(x.float(), bits)

    if hp.voc_mode == 'MOL':
        y = audio.label_2_float(y.float(), bits)

    return x, y, mels
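# Usage sketch (not from the original source): collate_vocoder is the kind of
# function meant to be passed as collate_fn to a torch DataLoader over a
# dataset yielding (mel, quantized_wav) pairs. The dataset argument and the
# batch size here are illustrative assumptions.
def make_vocoder_loader(dataset, batch_size=32):
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       collate_fn=collate_vocoder,
                                       drop_last=True)
# Each iteration then yields (x, y, mels):
#   x    [B, voc_seq_len]       wav input at time t
#   y    [B, voc_seq_len]       target at time t + 1 (float for MOL, labels otherwise)
#   mels [B, n_mels, mel_win]   conditioning spectrogram window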
def __getitem__(self, index):
    # audio-only variant: pick a random starting mel frame (i >= 1 so the
    # slice below, which reaches back half a hop minus one sample, stays in range)
    i = random.randrange(1, self.raw_data.shape[1] - self.sample_frames)
    # one extra leading sample so the previous-sample input and the target align
    sig = self.wav[int(hparams.hop_size * (i - 0.5)) - 1:
                   int(hparams.hop_size * (i - 0.5 + self.sample_frames))]
    assert len(sig) == self.sample_frames * hparams.hop_size + 1
    prev = audio.label_2_float(sig[:-1], hparams.bits)
    # returns (speaker embedding, mel window [T, D], previous samples, target labels)
    return torch.Tensor(self.speaker), \
           torch.Tensor(self.raw_data[:, i:i + self.sample_frames].T), \
           torch.Tensor(prev), torch.Tensor(sig[1:]).long()
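# Reference sketch (assumption): audio.label_2_float is taken here to follow
# the common WaveRNN convention, mapping integer labels in [0, 2**bits - 1]
# onto floats in [-1.0, 1.0]; the project's own audio module may differ.
def label_2_float_sketch(x, bits):
    # e.g. bits = 9: label 0 -> -1.0, label 511 -> +1.0
    return 2 * x / (2 ** bits - 1.0) - 1.0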
def __getitem__(self, index):
    # audio-visual variant: number of video frames spanning the sampled mel
    # window, assuming 20 fps video
    video_length = int(hparams.hop_size / hparams.sample_rate * self.sample_frames * 20)
    # video = np.zeros((video_length, 3, 256, 256))
    video = torch.Tensor(video_length, 3, 128, 128)
    if self.use_256:
        video_large = torch.Tensor(video_length, 3, 256, 256)
    else:
        video_large = torch.Tensor(video_length, 3, 512, 512)
    if not self.ret_wav:
        # align the mel offset to a multiple of 32 so it maps onto whole video frames
        i = random.randrange(0, (self.raw_data.shape[1] - self.sample_frames + 1) // 32) * 32
        video_index = int(i / 4)  # the code assumes 4 mel frames per video frame
        for j in range(video_length):
            video[j, :, :, :] = self.transform(Image.open(self.list_frame[j + video_index]).convert('RGB'))
            video_large[j, :, :, :] = self.transform_large(Image.open(self.list_frame[j + video_index]).convert('RGB'))
        return torch.Tensor(self.speaker), \
               torch.Tensor(self.raw_data[:, i:i + self.sample_frames].T), \
               video, video_large
    else:
        # also return the waveform, as in the audio-only variant above
        i = random.randrange(1, (self.raw_data.shape[1] - self.sample_frames) // 32) * 32
        sig = self.wav[int(hparams.hop_size * (i - 0.5)) - 1:
                       int(hparams.hop_size * (i - 0.5 + self.sample_frames))]
        assert len(sig) == self.sample_frames * hparams.hop_size + 1
        prev = audio.label_2_float(sig[:-1], hparams.bits)
        video_index = int(i / 4)
        for j in range(video_length):
            video[j, :, :, :] = self.transform(Image.open(self.list_frame[j + video_index]).convert('RGB'))
            video_large[j, :, :, :] = self.transform_large(Image.open(self.list_frame[j + video_index]).convert('RGB'))
        assert not (np.isnan(prev).any() or np.isnan(sig).any())
        return torch.Tensor(self.speaker), \
               torch.Tensor(self.raw_data[:, i:i + self.sample_frames].T), \
               torch.Tensor(prev), torch.Tensor(sig[1:]).long(), \
               video, video_large
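# Sketch (assumption): plausible definitions of the self.transform /
# self.transform_large pipelines used above, built with torchvision. The
# project's actual resizing, cropping, and normalization may differ.
from torchvision import transforms

transform_sketch = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),  # PIL image -> float tensor in [0.0, 1.0], shape [3, H, W]
])
transform_large_sketch = transforms.Compose([
    transforms.Resize((256, 256)),  # or (512, 512) when use_256 is False
    transforms.ToTensor(),
])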
# Revised collate_vocoder: unlike the version above, it tolerates clips that
# are too short for the sampled window by padding the mel and the signal.
def collate_vocoder(batch):
    mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad
    # max_offsets = [x[0].shape[-1] - 2 - (mel_win + 2 * hp.voc_pad) for x in batch]
    # clamp to at least 1 so np.random.randint never sees an empty range
    max_offsets = [max(1, x[0].shape[-1] - 2 - (mel_win + 2 * hp.voc_pad)) for x in batch]
    mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
    sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets]

    # mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)]
    # labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)]
    mels = []
    labels = []
    for i, x in enumerate(batch):
        sliced_mel = x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win]
        if len(sliced_mel[0]) < mel_win:
            # clip too short: pad with the minimum mel value (silence)
            sliced_mel = np.pad(sliced_mel,
                                [(0, 0), (0, mel_win - len(sliced_mel[0]))],
                                mode='constant',
                                constant_values=-hp.mel_max_abs_value)
            print("padded mel with %f" % -hp.mel_max_abs_value)
        assert len(sliced_mel[0]) == mel_win
        mels.append(sliced_mel)

        # one additional sample for the next-step target
        sliced_sig = x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1]
        if len(sliced_sig) < hp.voc_seq_len + 1:
            sliced_sig = np.pad(sliced_sig,
                                (0, hp.voc_seq_len + 1 - len(sliced_sig)),
                                mode='constant',
                                constant_values=0)
            print("padded seq with 0")
        assert len(sliced_sig) == hp.voc_seq_len + 1
        labels.append(sliced_sig)

    mels = np.stack(mels).astype(np.float32)
    labels = np.stack(labels).astype(np.int64)

    mels = torch.tensor(mels)
    labels = torch.tensor(labels).long()

    x = labels[:, :hp.voc_seq_len]
    y = labels[:, 1:]

    bits = 16 if hp.voc_mode == 'MOL' else hp.bits

    x = audio.label_2_float(x.float(), bits)

    if hp.voc_mode == 'MOL':
        y = audio.label_2_float(y.float(), bits)

    # x (current) [B, L], y (future) [B, L] bit labels, mels [B, D, T]
    return x, y, mels
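# Smoke-test sketch (assumption): exercises both padding branches of the
# revised collate by feeding it one clip shorter than the sampled window
# (this needs hp.voc_pad > 0, as in typical WaveRNN configs). n_mels = 80 is
# an assumed mel dimension; everything else comes from the project's hp module.
def _smoke_test_collate_vocoder():
    n_mels = 80
    mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad
    short_len = hp.voc_seq_len // hp.hop_length  # shorter than mel_win
    mel = np.random.uniform(-hp.mel_max_abs_value, hp.mel_max_abs_value,
                            (n_mels, short_len)).astype(np.float32)
    quant = np.random.randint(0, 2 ** hp.bits, short_len * hp.hop_length)
    x, y, mels = collate_vocoder([(mel, quant)])
    assert x.shape == (1, hp.voc_seq_len)
    assert y.shape == (1, hp.voc_seq_len)
    assert mels.shape == (1, n_mels, mel_win)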