def reprocess(batch, cut_list):
    """Collate the samples selected by ``cut_list`` into padded arrays.

    Args:
        batch: Indexable collection of per-sample dicts that provide the
            keys ``"text"``, ``"cemb"``, ``"D"`` and ``"mel_tac2_target"``.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with the padded ``"text"``, ``"mel_tac2_target"``, ``"cemb"``
        and ``"D"`` entries, plus ``"length_mel"`` and ``"length_text"``:
        float64 arrays holding ``shape[0]`` of every un-padded item.
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    raw_texts = column("text")
    raw_cembs = column("cemb")
    raw_durations = column("D")
    raw_mels = column("mel_tac2_target")

    # Lengths are measured before padding.  dtype is pinned to float64,
    # which is what appending ints to an empty np.array yields.
    length_text = np.array([t.shape[0] for t in raw_texts], dtype=np.float64)
    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)

    # pad_1D / pad_2D are project helpers defined outside this function.
    padded_texts = pad_1D(raw_texts)
    padded_durations = pad_1D(raw_durations)
    padded_mels = pad_2D(raw_mels)
    padded_cembs = pad_2D(raw_cembs)

    return {
        "text": padded_texts,
        "mel_tac2_target": padded_mels,
        "cemb": padded_cembs,
        "D": padded_durations,
        "length_mel": length_mel,
        "length_text": length_text,
    }
def reprocess(self, batch, cut_list):
    """Collate the samples selected by ``cut_list`` into one padded batch.

    Args:
        batch: Indexable collection of per-sample dicts.  Keys read:
            ``id``, ``text``, ``mel_target``, ``mel_aug``, ``D``, ``f0``,
            ``f0_norm``, ``f0_norm_aug``, ``energy``, ``energy_input``,
            ``energy_input_aug`` and ``speaker_embed``.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict of padded arrays keyed like the inputs, plus ``"log_D"``
        (``np.log`` of the padded durations offset by
        ``hparams.log_offset``), ``"speaker_embed"`` (per-sample embeddings
        concatenated along axis 0) and ``"src_len"`` / ``"mel_len"``
        (float64 arrays of ``shape[0]`` of each un-padded text / mel).
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    ids = column("id")
    raw_texts = column("text")
    raw_mels = column("mel_target")
    raw_mel_augs = column("mel_aug")
    raw_durations = column("D")
    raw_f0 = column("f0")
    raw_f0_norm = column("f0_norm")
    raw_f0_norm_aug = column("f0_norm_aug")
    raw_energy = column("energy")
    raw_energy_in = column("energy_input")
    raw_energy_in_aug = column("energy_input_aug")
    speaker_rows = column("speaker_embed")

    # Diagnostic only: report samples whose text and duration lengths
    # disagree.  Processing continues regardless.
    for sample_id, text, duration in zip(ids, raw_texts, raw_durations):
        if len(text) != len(duration):
            print(text, text.shape, duration, duration.shape, sample_id)

    # Un-padded lengths; float64 is what the np.append idiom produces.
    length_text = np.array([t.shape[0] for t in raw_texts], dtype=np.float64)
    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)

    # pad_1D / pad_2D are project helpers defined outside this method.
    texts = pad_1D(raw_texts)
    durations = pad_1D(raw_durations)
    mel_targets = pad_2D(raw_mels)
    mel_augs = pad_2D(raw_mel_augs)
    f0s = pad_1D(raw_f0)
    f0_norms = pad_1D(raw_f0_norm)
    f0_norm_augs = pad_1D(raw_f0_norm_aug)
    energies = pad_1D(raw_energy)
    energy_inputs = pad_1D(raw_energy_in)
    energy_input_augs = pad_1D(raw_energy_in_aug)

    log_durations = np.log(durations + hparams.log_offset)
    speaker_embeds = np.concatenate(speaker_rows, axis=0)

    return {
        "id": ids,
        "text": texts,
        "mel_target": mel_targets,
        "mel_aug": mel_augs,
        "D": durations,
        "log_D": log_durations,
        "f0": f0s,
        "f0_norm": f0_norms,
        "f0_norm_aug": f0_norm_augs,
        "energy": energies,
        "energy_input": energy_inputs,
        "energy_input_aug": energy_input_augs,
        "speaker_embed": speaker_embeds,
        "src_len": length_text,
        "mel_len": length_mel,
    }
def reprocess(batch, cut_list):
    """Collate condition / mel samples and build 1-based position indices.

    Args:
        batch: Indexable collection of per-sample dicts providing
            ``condition1``, ``condition2``, ``mel_target``, ``norm_f0``,
            ``D`` and ``mel_in``.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with the padded inputs, ``"src_pos"`` / ``"mel_pos"`` (each
        row counts 1..length and is zero-filled up to the longest row),
        ``"lens"`` (``torch.LongTensor`` built from the mel lengths) and
        ``"mel_max_len"`` (largest ``shape[0]`` among the mel targets).
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    def positions(lengths, width):
        # Row k is [1, 2, ..., lengths[k]] right-padded with zeros to width.
        rows = []
        for entry in lengths:
            count = int(entry)
            rows.append(
                np.pad(list(range(1, count + 1)), (0, width - count),
                       'constant'))
        return np.array(rows)

    raw_c1 = column("condition1")
    raw_c2 = column("condition2")
    raw_mels = column("mel_target")
    raw_norm_f0 = column("norm_f0")
    raw_durations = column("D")
    raw_mel_ins = column("mel_in")

    # Source-side lengths come from condition1 only.
    length_C = np.array([c.shape[0] for c in raw_c1], dtype=np.float64)
    max_len = int(max(length_C))
    src_pos = positions(length_C, max_len)

    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)
    lens = torch.LongTensor(length_mel)
    max_mel_len = int(max(length_mel))
    mel_pos = positions(length_mel, max_mel_len)

    # pad_1D / pad_2D are project helpers defined outside this function.
    padded_c1 = pad_1D(raw_c1)
    padded_c2 = pad_1D(raw_c2)
    padded_durations = pad_1D(raw_durations)
    # Frame-level features are all padded to the longest mel target.
    padded_norm_f0 = pad_2D(raw_norm_f0, maxlen=max_mel_len)
    padded_mels = pad_2D(raw_mels, maxlen=max_mel_len)
    padded_mel_ins = pad_2D(raw_mel_ins, maxlen=max_mel_len)

    return {
        "condition1": padded_c1,
        "condition2": padded_c2,
        "mel_target": padded_mels,
        "norm_f0": padded_norm_f0,
        "mel_in": padded_mel_ins,
        "D": padded_durations,
        "mel_pos": mel_pos,
        "src_pos": src_pos,
        "lens": lens,
        "mel_max_len": max_mel_len,
    }
def preprocess_audio(mel, energy, f0, f0_norm):
    """Turn one utterance's features into single-item batch tensors.

    Each input gets a leading axis (``x[None]``), is run through the
    matching ``utils.pad_*`` helper, and is converted to a float tensor on
    the module-level ``device``.

    Args:
        mel: Array passed through ``utils.pad_2D``.
        energy: Array passed through ``utils.pad_1D``.
        f0: Array passed through ``utils.pad_1D``.
        f0_norm: Array passed through ``utils.pad_1D``.

    Returns:
        Tuple ``(mel_target, mel_len, energy, f0, f0_norm)``.  ``mel_len``
        is a one-element long tensor holding ``shape[1]`` of the padded mel.
    """
    def as_float_tensor(array):
        # numpy -> float32 tensor on the target device.
        return torch.from_numpy(array).float().to(device)

    padded_mel = utils.pad_2D(mel[None])
    padded_f0 = utils.pad_1D(f0[None])
    padded_f0_norm = utils.pad_1D(f0_norm[None])
    padded_energy = utils.pad_1D(energy[None])

    mel_target = as_float_tensor(padded_mel)
    # Axis 1 of the padded array is used as the length.
    frame_count = np.array([padded_mel.shape[1]])
    mel_len = torch.from_numpy(frame_count).long().to(device)
    f0_tensor = as_float_tensor(padded_f0)
    f0_norm_tensor = as_float_tensor(padded_f0_norm)
    energy_tensor = as_float_tensor(padded_energy)

    return mel_target, mel_len, energy_tensor, f0_tensor, f0_norm_tensor
def reprocess(self, batch, cut_list):
    """Collate the samples selected by ``cut_list`` into one padded batch.

    Args:
        batch: Indexable collection of per-sample dicts providing ``id``,
            ``text``, ``mel_target`` and ``D`` (plus ``hz_text`` when
            ``hp.with_hanzi`` is truthy).
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with ``"id"``, padded ``"text"``, ``"hz_text"`` (padded, or
        ``None`` when ``hp.with_hanzi`` is falsy), padded ``"mel_target"``,
        ``"D"`` and ``"log_D"``, and ``"src_len"`` / ``"mel_len"`` (float64
        arrays of ``shape[0]`` of each un-padded text / mel).

        NOTE: ``"D"`` and ``"log_D"`` are the same array: the durations
        with ``hp.duration_mean`` subtracted, then padded.  No logarithm is
        applied in this function.
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    ids = column("id")
    raw_texts = column("text")
    if hp.with_hanzi:
        raw_hz_texts = column("hz_text")
    raw_mels = column("mel_target")
    raw_durations = column("D")

    # Diagnostic only: report samples whose text and duration lengths
    # disagree.  Processing continues regardless.
    for sample_id, text, duration in zip(ids, raw_texts, raw_durations):
        if len(text) != len(duration):
            print('error:', text, text.shape, duration, duration.shape,
                  sample_id)

    # Un-padded lengths; float64 is what the np.append idiom produces.
    length_text = np.array([t.shape[0] for t in raw_texts], dtype=np.float64)
    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)

    # pad_1D / pad_2D are project helpers defined outside this method.
    texts = pad_1D(raw_texts)
    hz_texts = pad_1D(raw_hz_texts) if hp.with_hanzi else None

    # Durations are shifted by the configured mean before padding.
    centred = [duration - hp.duration_mean for duration in raw_durations]
    durations = pad_1D(centred)
    mel_targets = pad_2D(raw_mels)

    return {
        "id": ids,
        "text": texts,
        "hz_text": hz_texts,
        "mel_target": mel_targets,
        "D": durations,
        "log_D": durations,
        "src_len": length_text,
        "mel_len": length_mel,
    }
def reprocess(self, batch, cut_list):
    """Collate the samples selected by ``cut_list`` into one padded batch.

    Args:
        batch: Indexable collection of per-sample dicts providing ``id``,
            ``text``, ``mel_target``, ``D``, ``f0`` and ``energy``.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with ``"id"``, padded ``"text"``, ``"mel_target"``, ``"D"``,
        ``"f0"``, ``"energy"``, ``"log_D"`` (``np.log`` of the padded
        durations offset by ``hp.log_offset``) and ``"src_len"`` /
        ``"mel_len"`` (float64 arrays of ``shape[0]`` of each un-padded
        text / mel).  When ``hp.use_spk_embed`` is truthy, ``"spk_ids"`` is
        added: ``self.spk_table`` looked up with the part of each id before
        the first ``"_"``.

    Raises:
        NotImplementedError: ``hp.use_spk_embed`` is truthy and
            ``hp.dataset`` is neither ``"VCTK"`` nor ``"LibriTTS"``.
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    ids = column("id")

    if hp.use_spk_embed:
        # Guard first: only these datasets have an id -> speaker mapping.
        if hp.dataset not in ("VCTK", "LibriTTS"):
            raise NotImplementedError(
                "Looking up datset {} speaker table not implemented".format(
                    hp.dataset))
        spk_ids = [
            self.spk_table[sample_id.split("_")[0]] for sample_id in ids
        ]

    raw_texts = column("text")
    raw_mels = column("mel_target")
    raw_durations = column("D")
    raw_f0 = column("f0")
    raw_energy = column("energy")

    # Diagnostic only: report samples whose text and duration lengths
    # disagree.  Processing continues regardless.
    for sample_id, text, duration in zip(ids, raw_texts, raw_durations):
        if len(text) != len(duration):
            print(text, text.shape, duration, duration.shape, sample_id)

    # Un-padded lengths; float64 is what the np.append idiom produces.
    length_text = np.array([t.shape[0] for t in raw_texts], dtype=np.float64)
    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)

    # pad_1D / pad_2D are project helpers defined outside this method.
    texts = pad_1D(raw_texts)
    durations = pad_1D(raw_durations)
    mel_targets = pad_2D(raw_mels)
    f0s = pad_1D(raw_f0)
    energies = pad_1D(raw_energy)
    log_durations = np.log(durations + hp.log_offset)

    out = {
        "id": ids,
        "text": texts,
        "mel_target": mel_targets,
        "D": durations,
        "log_D": log_durations,
        "f0": f0s,
        "energy": energies,
        "src_len": length_text,
        "mel_len": length_mel,
    }
    if hp.use_spk_embed:
        out["spk_ids"] = spk_ids
    return out
def reprocess(self, batch, cut_list):
    """Collate text / mel samples and build 1-based position indices.

    Args:
        batch: Indexable collection of per-sample dicts providing ``text``,
            ``mel_target``, ``D``, ``f0`` and ``energy``.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with padded ``"text"``, ``"mel_target"``, ``"D"``, ``"f0"``
        and ``"energy"``; ``"src_pos"`` / ``"mel_pos"`` (each row counts
        1..length and is zero-filled up to the longest row); and
        ``"mel_len"`` (float64 array of ``shape[0]`` of each un-padded mel).
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    def positions(lengths, width):
        # Row k is [1, 2, ..., lengths[k]] right-padded with zeros to width.
        rows = []
        for entry in lengths:
            count = int(entry)
            rows.append(
                np.pad(list(range(1, count + 1)), (0, width - count),
                       'constant'))
        return np.array(rows)

    raw_texts = column("text")
    raw_mels = column("mel_target")
    raw_durations = column("D")
    raw_f0 = column("f0")
    raw_energy = column("energy")

    # Un-padded lengths; float64 is what the np.append idiom produces.
    length_text = np.array([t.shape[0] for t in raw_texts], dtype=np.float64)
    max_len = int(max(length_text))
    src_pos = positions(length_text, max_len)

    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)
    max_mel_len = int(max(length_mel))
    mel_pos = positions(length_mel, max_mel_len)

    # pad_1D / pad_2D are project helpers defined outside this method.
    texts = pad_1D(raw_texts)
    durations = pad_1D(raw_durations)
    mel_targets = pad_2D(raw_mels)
    f0s = pad_1D(raw_f0)
    energies = pad_1D(raw_energy)

    return {
        "text": texts,
        "mel_target": mel_targets,
        "D": durations,
        "f0": f0s,
        "energy": energies,
        "mel_pos": mel_pos,
        "src_pos": src_pos,
        "mel_len": length_mel,
    }
def reprocess(self, batch, cut_list):
    """Collate the samples selected by ``cut_list`` into one padded batch.

    ``f0`` and ``energy`` are passed through ``min_max_norm`` with the
    bounds configured in ``hparams`` before being padded.

    Args:
        batch: Indexable collection of per-sample dicts providing ``id``,
            ``text``, ``mel_target``, ``D``, ``f0`` and ``energy``.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with ``"id"``, padded ``"text"``, ``"mel_target"``, ``"D"``,
        ``"f0"``, ``"energy"``, ``"log_D"`` (``np.log`` of the padded
        durations offset by ``hparams.log_offset``) and ``"src_len"`` /
        ``"mel_len"`` (float64 arrays of ``shape[0]`` of each un-padded
        text / mel).
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    ids = column("id")
    raw_texts = column("text")
    raw_mels = column("mel_target")
    raw_durations = column("D")
    scaled_f0 = [
        min_max_norm(batch[idx]["f0"],
                     min_val=hparams.f0_min,
                     max_val=hparams.f0_max)
        for idx in cut_list
    ]
    scaled_energy = [
        min_max_norm(batch[idx]["energy"],
                     min_val=hparams.energy_min,
                     max_val=hparams.energy_max)
        for idx in cut_list
    ]

    # Diagnostic only: report samples whose text and duration lengths
    # disagree.  Processing continues regardless.
    for sample_id, text, duration in zip(ids, raw_texts, raw_durations):
        if len(text) == len(duration):
            continue
        print('the dimension of text and duration should be the same')
        print('text: ', sequence_to_text(text))
        print(text, text.shape, duration, duration.shape, sample_id)

    # Un-padded lengths; float64 is what the np.append idiom produces.
    length_text = np.array([t.shape[0] for t in raw_texts], dtype=np.float64)
    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)

    # pad_1D / pad_2D are project helpers defined outside this method.
    texts = pad_1D(raw_texts)
    durations = pad_1D(raw_durations)
    mel_targets = pad_2D(raw_mels)
    f0s = pad_1D(scaled_f0)
    energies = pad_1D(scaled_energy)
    log_durations = np.log(durations + hparams.log_offset)

    return {
        "id": ids,
        "text": texts,
        "mel_target": mel_targets,
        "D": durations,
        "log_D": log_durations,
        "f0": f0s,
        "energy": energies,
        "src_len": length_text,
        "mel_len": length_mel,
    }
def reprocess(batch, cut_list):
    """Collate text / mel / stop-token samples with position indices.

    Args:
        batch: Indexable collection of per-sample dicts providing ``text``,
            ``mel_target``, ``D`` and ``stop_token``.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with padded ``"text"``, ``"mel_target"``, ``"D"`` and
        ``"stop_token"`` (padded with ``PAD=1.``); ``"src_pos"`` /
        ``"mel_pos"`` (each row counts 1..length and is zero-filled up to
        the longest row); and ``"mel_max_len"`` (largest ``shape[0]`` among
        the mel targets).
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    def positions(lengths, width):
        # Row k is [1, 2, ..., lengths[k]] right-padded with zeros to width.
        rows = []
        for entry in lengths:
            count = int(entry)
            rows.append(
                np.pad(list(range(1, count + 1)), (0, width - count),
                       'constant'))
        return np.array(rows)

    raw_texts = column("text")
    raw_mels = column("mel_target")
    raw_durations = column("D")
    raw_stops = column("stop_token")

    # Un-padded lengths; float64 is what the np.append idiom produces.
    length_text = np.array([t.shape[0] for t in raw_texts], dtype=np.float64)
    max_len = int(max(length_text))
    src_pos = positions(length_text, max_len)

    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)
    max_mel_len = int(max(length_mel))
    mel_pos = positions(length_mel, max_mel_len)

    # pad_1D / pad_2D are project helpers defined outside this function.
    texts = pad_1D(raw_texts)
    durations = pad_1D(raw_durations)
    mel_targets = pad_2D(raw_mels)
    # Unlike the other fields, stop tokens are filled with 1 rather than
    # the helper's default pad value.
    stop_tokens = pad_1D(raw_stops, PAD=1.)

    return {
        "text": texts,
        "mel_target": mel_targets,
        "D": durations,
        "stop_token": stop_tokens,
        "mel_pos": mel_pos,
        "src_pos": src_pos,
        "mel_max_len": max_mel_len,
    }
def reprocess(self, batch, cut_list):
    """Collate the samples selected by ``cut_list`` into one padded batch.

    Args:
        batch: Indexable collection of per-sample dicts providing ``id``,
            ``text``, ``mel_target``, ``D``, ``f0`` and ``energy``.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with ``"id"``, padded ``"text"``, ``"mel_target"``, ``"D"``,
        ``"f0"``, ``"energy"``, ``"log_D"`` (``np.log`` of the padded
        durations offset by ``hparams.log_offset``), ``"src_len"``
        (``shape[0]`` of each un-padded text) and ``"mel_len"``
        (``shape[0]`` of each un-padded mel), both as float64 arrays.
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    ids = column("id")
    raw_texts = column("text")
    raw_mels = column("mel_target")
    raw_durations = column("D")
    raw_f0 = column("f0")
    raw_energy = column("energy")

    # Author's note: text (phonemes) and durations must have the same
    # number of entries; the durations sum to the frame count of the mel
    # spectrogram, f0 and energy.  A mismatch is only printed, not raised.
    for sample_id, text, duration in zip(ids, raw_texts, raw_durations):
        if len(text) != len(duration):
            print(text, text.shape, duration, duration.shape, sample_id)

    # Author's note: the phoneme count and the mel frame count differ, so
    # both lengths are recorded.  float64 is what np.append produces.
    length_text = np.array([t.shape[0] for t in raw_texts], dtype=np.float64)
    length_mel = np.array([m.shape[0] for m in raw_mels], dtype=np.float64)

    # pad_1D / pad_2D are project helpers defined outside this method.
    texts = pad_1D(raw_texts)
    durations = pad_1D(raw_durations)
    # Author's example: a mel is e.g. <831, 80> -- 831 time frames, each
    # with 80 frequency coefficients used as features.
    mel_targets = pad_2D(raw_mels)
    f0s = pad_1D(raw_f0)
    energies = pad_1D(raw_energy)
    log_durations = np.log(durations + hparams.log_offset)

    return {
        "id": ids,
        "text": texts,
        "mel_target": mel_targets,
        "D": durations,
        "log_D": log_durations,
        "f0": f0s,
        "energy": energies,
        "src_len": length_text,  # number of phonemes
        "mel_len": length_mel,  # number of frames
    }
def __call__(self, batch):
    """Collate a list of samples into padded tensors.

    Each ``batch[i]`` is the tuple returned by ``__getitem__``; elements
    0, 1 and 2 are used (named mel, speaker embedding and pitch after the
    returned variables).

    The unconditional debugging output that used to live here has been
    removed: it indexed ``batch[1]`` and ``batch[2]`` and therefore raised
    ``IndexError`` for any batch with fewer than three samples.  The
    returned tensors are unchanged.

    Args:
        batch: Sequence of per-sample tuples.

    Returns:
        Tuple ``(melsp, spk_emb, pitch, len_org)``:
        ``melsp`` is the ``pad_2D``-padded element 0 of every sample,
        ``spk_emb`` is element 1 of every sample stacked with ``np.array``,
        ``pitch`` is the ``pad_1D``-padded element 2 with a trailing unit
        axis appended, and ``len_org`` is a float64 tensor holding
        ``shape[0]`` of every un-padded element 0.
    """
    mels = [sample[0] for sample in batch]
    speaker_rows = [sample[1] for sample in batch]
    pitches = [sample[2] for sample in batch]

    # Lengths are measured before padding.  dtype stays float64, which is
    # what the previous np.append-based construction produced.
    lengths = np.array([mel.shape[0] for mel in mels], dtype=np.float64)

    # pad_1D / pad_2D are project helpers defined outside this method.
    melsp = torch.from_numpy(pad_2D(mels))
    spk_emb = torch.from_numpy(np.array(speaker_rows))
    pitch = torch.from_numpy(pad_1D(pitches)).unsqueeze(-1)
    len_org = torch.from_numpy(lengths)

    return melsp, spk_emb, pitch, len_org
def reprocess(self, batch, cut_list):
    """Collate the samples selected by ``cut_list`` into one padded batch.

    The acoustic targets depend on ``hp.vocoder``: ``'WORLD'`` uses
    ``ap_target`` and ``sp_target``; any other value uses ``mel_target``.

    Args:
        batch: Indexable collection of per-sample dicts providing ``id``,
            ``condition``, ``mel_refer``, ``D``, ``f0``, ``energy`` and the
            vocoder-specific target key(s) named above.
        cut_list: Indices into ``batch``; output rows follow this order.

    Returns:
        dict with ``"id"``, padded ``"condition"`` and ``"mel_refer"``, the
        padded vocoder-specific target(s), padded ``"D"``, ``"f0"`` and
        ``"energy"``, ``"log_D"`` (``np.log`` of the padded durations
        offset by ``hp.log_offset``), ``"src_len"`` (``shape[0]`` of each
        un-padded condition) and ``"mel_len"`` (``shape[0]`` of each
        un-padded ``sp_target`` for WORLD, else of each ``mel_target``).
    """
    def column(key):
        # One field of every selected sample, in cut_list order.
        return [batch[idx][key] for idx in cut_list]

    use_world = hp.vocoder == 'WORLD'

    ids = column("id")
    raw_conditions = column("condition")
    raw_refers = column("mel_refer")
    if use_world:
        raw_ap = column("ap_target")
        raw_sp = column("sp_target")
    else:
        raw_mels = column("mel_target")
    raw_durations = column("D")
    raw_f0 = column("f0")
    raw_energy = column("energy")

    # Diagnostic only: report samples whose condition and duration lengths
    # disagree.  Processing continues regardless.
    for sample_id, cond, duration in zip(ids, raw_conditions, raw_durations):
        if len(cond) != len(duration):
            print(cond, cond.shape, duration, duration.shape, sample_id)

    # Un-padded lengths; float64 is what the np.append idiom produces.
    length_condition = np.array([c.shape[0] for c in raw_conditions],
                                dtype=np.float64)
    frame_source = raw_sp if use_world else raw_mels
    length_mel = np.array([f.shape[0] for f in frame_source],
                          dtype=np.float64)

    # pad_1D / pad_2D are project helpers defined outside this method.
    conditions = pad_2D(raw_conditions)
    durations = pad_1D(raw_durations)
    mel_refers = pad_2D(raw_refers)
    if use_world:
        targets = {
            "ap_target": pad_2D(raw_ap),
            "sp_target": pad_2D(raw_sp),
        }
    else:
        targets = {"mel_target": pad_2D(raw_mels)}
    f0s = pad_1D(raw_f0)
    energies = pad_1D(raw_energy)
    log_durations = np.log(durations + hp.log_offset)

    # Assembled in three steps so the key order stays the same in both
    # vocoder modes: common head, targets, common tail.
    out = {
        "id": ids,
        "condition": conditions,
        "mel_refer": mel_refers,
    }
    out.update(targets)
    out.update({
        "D": durations,
        "log_D": log_durations,
        "f0": f0s,
        "energy": energies,
        "src_len": length_condition,
        "mel_len": length_mel,
    })
    return out