def __init__(self, hp, args, train):
    """Build the wav-path list from the pre-filtered Blizzard CSV manifest.

    hp: hyperparameter container (needs data.path, audio.sr, audio.duration).
    args: CLI arguments (needs .tier).
    train: True loads blizzard_train.csv, False loads blizzard_test.csv.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.file_list = []
    self.root_dir = hp.data.path
    manifest = 'blizzard_train.csv' if train else 'blizzard_test.csv'
    csv_path = os.path.join(self.root_dir, manifest)
    with open(csv_path, 'r') as handle:
        reader = csv.reader(handle)
        next(reader)  # skip the CSV header row
        for record in reader:
            _, _, wav_path, wav_length = record
            # keep clips strictly between 0.4s and the configured maximum
            if 0.4 < float(wav_length) < hp.audio.duration:
                self.file_list.append(wav_path)
    # Seeded shuffle: identical ordering on every run
    random.Random(123).shuffle(self.file_list)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
def __init__(self, hp, args, train):
    """Collect audio files under hp.data.path and take a deterministic 95/5 split.

    hp: hyperparameter container; args: CLI args (needs .tier);
    train: True keeps the first 95%, False keeps the last 5%.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # Recursive glob picks up every file matching the configured extension
    pattern = os.path.join(hp.data.path, '**', hp.data.extension)
    self.file_list = glob.glob(pattern, recursive=True)
    # Fixed seed => the shuffle (and therefore the split) is reproducible
    random.seed(123)
    random.shuffle(self.file_list)
    split = int(0.95 * len(self.file_list))
    self.file_list = self.file_list[:split] if train else self.file_list[split:]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
def __init__(self, hp, args, infer_hp):
    """Assemble the MelNet tier stack.

    When infer_hp.conditional is set, tier 1 is a text-conditioned TTS
    module; otherwise every tier is an unconditional Tier. Tiers are kept
    1-indexed via a leading None placeholder.
    """
    super(MelNet, self).__init__()
    self.hp = hp
    self.args = args
    self.infer_hp = infer_hp
    self.f_div = f_div[hp.model.tier + 1]
    self.t_div = t_div[hp.model.tier]
    self.n_mels = hp.audio.n_mels
    self.tierutil = TierUtil(hp)

    def build_tier(t):
        # Frequency resolution of tier t scales with f_div[t].
        return Tier(hp=hp,
                    freq=hp.audio.n_mels // self.f_div * f_div[t],
                    layers=hp.model.layers[t - 1],
                    tierN=t)

    if infer_hp.conditional:
        front_end = TTS(hp=hp,
                        freq=hp.audio.n_mels // self.f_div * f_div[1],
                        layers=hp.model.layers[0])
        self.tiers = [front_end] + [
            build_tier(t) for t in range(2, hp.model.tier + 1)
        ]
    else:
        self.tiers = [build_tier(t) for t in range(1, hp.model.tier + 1)]
    # Index 0 is a placeholder so self.tiers[n] addresses tier n directly.
    self.tiers = nn.ModuleList(
        [None] + [nn.DataParallel(tier).cuda() for tier in self.tiers])
def __init__(self, hp, args, train):
    """Index all audio files under hp.data.path with a deterministic 95/5 split.

    hp: hyperparameter container; args: CLI args (needs .tier);
    train: selects the 95% training slice vs. the 5% test slice.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # Recursively gather every file matching the configured extension.
    # No per-file duration filtering is applied here.
    search = os.path.join(hp.data.path, '**', hp.data.extension)
    self.file_list = glob.glob(search, recursive=True)
    random.seed(123)  # reproducible shuffle and split
    random.shuffle(self.file_list)
    cutoff = int(0.95 * len(self.file_list))
    if train:
        self.file_list = self.file_list[:cutoff]
    else:
        self.file_list = self.file_list[cutoff:]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
def __init__(self, hp, args, train):
    """Load (wav_path, parsed_sentence) pairs from the Blizzard CSV manifest.

    train: True reads blizzard_train.csv, False reads blizzard_test.csv.
    Only utterances between 0.4s and hp.audio.duration are kept.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.root_dir = hp.data.path
    self.dataset = []
    manifest = 'blizzard_train.csv' if train else 'blizzard_test.csv'
    txt_path = os.path.join(self.root_dir, manifest)
    with open(txt_path, 'r') as read_obj:
        rows = csv.reader(read_obj)
        next(rows)  # discard header row
        for row in rows:
            _, parsed_sentence, wav_path, wav_length = row
            # keep utterances in the open interval (0.4, max duration)
            if 0.4 < float(wav_length) < hp.audio.duration:
                self.dataset.append((wav_path, parsed_sentence))
    # Seeded shuffle keeps the ordering deterministic across runs.
    random.Random(123).shuffle(self.dataset)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
def reconstruct_audio(filename, tier_to_breakdown):
    """Rebuild the full mel from per-tier breakdowns and synthesize audio.

    filename: base name used for the saved output file.
    tier_to_breakdown: dict mapping tier number -> (source, target) numpy
        arrays, as produced by deconstruct_audio.

    At each tier, interleaving source and target must reproduce the next
    tier's source; assertions verify this invariant before the final mel
    is converted back to audio and saved.
    """
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    final_reconstruction = None
    # Verify that tier 2 is conditionally generated from just tier 1.
    # BUG FIX: the original referenced an undefined name `breakdown`,
    # which raised NameError; the parameter is `tier_to_breakdown`.
    assert (tier_to_breakdown[2][0] == tier_to_breakdown[1][1]
            ).all(), "Tier 2 not created from Tier 1"
    for tier in range(2, 7):
        source = tier_to_breakdown[tier][0]
        target = tier_to_breakdown[tier][1]
        source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        reconstructed_mel_tensor = tierutil.interleave(source_tensor,
                                                       target_tensor,
                                                       tier + 1)
        reconstructed_mel = reconstructed_mel_tensor.numpy()[0]
        # Interleaving the source and target of the current tier must
        # conditionally generate the source of the next tier.
        if tier < 6:
            next_tier = tier_to_breakdown[tier + 1][0]
            assert (reconstructed_mel == next_tier).all(
            ), "Tier %d not created from Tier %d" % (tier + 1, tier)
        else:
            final_reconstruction = reconstructed_mel
    print('reconstructing audio...')
    reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
    melgen.save_audio('reconstructed_' + filename, reconstructed_audio)
def deconstruct_audio(wav):
    """Decompose a waveform's normalized mel into per-tier (source, target) pairs.

    Returns a dict keyed 1..7: tiers 1-6 hold the (source, target) split
    produced by TierUtil, and tier 7 holds the full mel on both sides.
    """
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    mel = melgen.get_normalized_mel(wav)
    breakdown = {}
    for tier_num in range(1, 7):
        src, tgt = tierutil.cut_divide_tiers(mel, tier_num)
        print("Tier %d has source dims: %s, target dims %s" %
              (tier_num, src.shape, tgt.shape))
        breakdown[tier_num] = (src, tgt)
    # Tier 7 is the complete, uncut mel.
    breakdown[7] = (mel, mel)
    return breakdown
def __init__(self, hp, args, train):
    """Build (wav_path, text) pairs for KSS or Blizzard, then split 95/5.

    hp: hyperparameter container (data.name selects the dataset layout).
    args: CLI arguments (needs .tier).
    train: True keeps the first 95% of the shuffled data, else the last 5%.
    Raises NotImplementedError for any other hp.data.name.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.root_dir = hp.data.path
    self.dataset = []
    if hp.data.name == 'KSS':
        # KSS transcript fields: wav|_|_|text|length|_ (pipe-separated)
        with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'),
                  'r') as f:
            lines = f.read().splitlines()
            for line in tqdm(lines):
                wav_name, _, _, text, length, _ = line.split('|')
                wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                duraton = float(length)
                # keep only clips shorter than the configured max duration
                if duraton < hp.audio.duration:
                    self.dataset.append((wav_path, text))
                # if len(self.dataset) > 100: break
    elif hp.data.name.startswith('Blizzard'):
        # prompts.gui repeats every 3 lines: filename, sentence, (unused)
        with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
            lines = f.read().splitlines()
            filenames = lines[::3]
            sentences = lines[1::3]
            for filename, sentence in tqdm(zip(filenames, sentences),
                                           total=len(filenames)):
                wav_path = os.path.join(self.root_dir, 'wavn',
                                        filename + '.wav')
                length = get_length(wav_path, hp.audio.sr)
                if length < hp.audio.duration:
                    self.dataset.append((wav_path, sentence))
    else:
        raise NotImplementedError
    random.seed(123)  # deterministic shuffle so the split is reproducible
    random.shuffle(self.dataset)
    if train:
        self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
    else:
        self.dataset = self.dataset[int(0.95 * len(self.dataset)):]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
def __init__(self, hp, args, train):
    """Load (wav_path, sentence) pairs from the pre-filtered complete-Blizzard prompts.

    train: selects train_prompts.gui vs. test_prompts.gui. The prompt file
    alternates wav path / sentence on even/odd lines; no duration filtering
    happens here because the manifest was filtered when it was built.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.root_dir = hp.data.path
    self.dataset = []
    txt_path = os.path.join(
        self.root_dir, 'complete_blizzard/train_prompts.gui'
        if train else 'complete_blizzard/test_prompts.gui')
    # txt_file_list = glob.glob(
    #     os.path.join(txt_path, '**', '*.txt'),
    #     recursive=True
    # )
    # for txt_filepath in tqdm(txt_file_list, total=len(txt_file_list)):
    #     wav_filepath = txt_filepath.replace('_txt', '_wav').replace('.txt', '.wav')
    #     f = open(txt_filepath, "r")
    #     sentence = f.read().strip()
    #     f.close()
    #     # Skip the length filtering below because we already filtered the dataset
    #     length = get_length(wav_filepath, hp.audio.sr)
    #     if length < hp.audio.duration and length > 0.56 and len(sentence) > 5:
    #         self.dataset.append((wav_filepath, sentence))
    with open(txt_path, 'r') as f:
        lines = f.read().splitlines()
        # even lines are wav paths, odd lines are the matching sentences
        wav_paths = lines[::2]
        sentences = lines[1::2]
        for wav_path, sentence in tqdm(
                zip(wav_paths, sentences),
                desc='Audio/text data loader for %s' % txt_path,
                total=len(wav_paths)):
            # Skip the length filtering below because we already filtered the dataset
            # length = get_length(wav_path, hp.audio.sr)
            # if length < hp.audio.duration:
            self.dataset.append((wav_path, sentence))
    random.seed(123)  # deterministic ordering across runs
    random.shuffle(self.dataset)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
class AudioOnlyDataset(Dataset):
    """Dataset yielding (source, target) mel-tier pairs from raw audio files.

    Files are discovered recursively under hp.data.path and split 95/5
    (train/test) after a seeded shuffle.
    """

    def __init__(self, hp, args, train):
        # hp: hyperparameters; args: CLI args (needs .tier);
        # train: True -> first 95% of files, False -> last 5%.
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.file_list = glob.glob(os.path.join(hp.data.path, '**',
                                                hp.data.extension),
                                   recursive=True)
        random.seed(123)  # deterministic shuffle/split
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 *
                                                 len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 *
                                                len(self.file_list)):]
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # Load audio, compute the normalized mel, then cut it into the
        # (source, target) pair for the configured tier.
        wav = read_wav_np(self.file_list[idx], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        # # Reconstruct audio for testing
        # filename = os.path.basename(self.file_list[idx])
        # plt.imsave('./reconstructed_audio/original_'+filename+'.png', mel)
        # plt.imsave('./reconstructed_audio/source_'+filename+'.png', source)
        # plt.imsave('./reconstructed_audio/target_'+filename+'.png', target)
        # self.melgen.save_audio('source_'+filename, wav)
        # source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        # target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        # reconstructed_mel_tensor = self.tierutil.interleave(source_tensor, target_tensor, self.tier)
        # reconstructed_mel = reconstructed_mel_tensor.numpy()
        # print('Shapes: [mel, source, target, reconstruction], [%s, %s, %s, %s]' % (
        #     mel.shape,
        #     source.shape,
        #     target.shape,
        #     reconstructed_mel.shape,
        # ))
        # reconstructed_audio = self.melgen.reconstruct_audio(reconstructed_mel)
        # self.melgen.save_audio('reconstructed_'+filename, reconstructed_audio)
        return source, target
class AudioTextDataset(Dataset):
    """Dataset of (text sequence, source mel, target mel) triples.

    Reads transcript.v.1.2.txt (pipe-separated), keeps clips shorter than
    hp.audio.duration, and splits 95/5 after a seeded shuffle.
    """

    def __init__(self, hp, args, train):
        # hp: hyperparameters; args: CLI args (needs .tier);
        # train: selects the 95% train slice vs. the 5% test slice.
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.root_dir = hp.data.path
        self.dataset = []
        with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'),
                  'r') as f:
            lines = f.read().splitlines()
            for line in lines:
                wav_name, _, _, text, _ = line.split('|')
                # derive the .wav filename from the transcript field
                wav_name = wav_name[2:-4] + '.wav'
                wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
                wav = read_wav_np(wav_path)
                duraton = (len(wav) / hp.audio.sr)
                # drop clips at or above the configured maximum duration
                if duraton < hp.audio.duration:
                    self.dataset.append((wav_path, text))
                #if len(self.dataset) > 100: break
        random.seed(123)  # deterministic shuffle/split
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Returns (seq, source, target): the encoded text plus the mel
        # source/target pair for the configured tier.
        text = self.dataset[idx][1]
        seq = text_to_sequence(text)
        wav = read_wav_np(self.dataset[idx][0])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        return seq, source, target
def __init__(self, hp, args, train):
    """Read the pre-filtered complete-Blizzard prompt file and keep wav paths.

    train: selects train_prompts.gui vs. test_prompts.gui. Wav paths sit on
    even lines of the prompt file; sentences (unused here) on odd lines.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.file_list = []
    txt_path = ('datasets/complete_blizzard/train_prompts.gui'
                if train else 'datasets/complete_blizzard/test_prompts.gui')
    with open(txt_path, 'r') as f:
        lines = f.read().splitlines()
    wav_paths = lines[::2]
    for wav_path in tqdm(wav_paths,
                         desc='Audio data loader',
                         total=len(wav_paths)):
        # The manifest is already duration-filtered, so paths are taken as-is.
        self.file_list.append(wav_path)
    # Seeded shuffle: ordering is deterministic across runs.
    random.seed(123)
    random.shuffle(self.file_list)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
def __init__(self, hp, args, train):
    """Parse transcript.v.1.2.txt, keep short clips, and split 95/5.

    Each transcript line is pipe-separated; the wav is loaded to measure
    its true duration before the clip is accepted.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.root_dir = hp.data.path
    self.dataset = []
    transcript = os.path.join(self.root_dir, 'transcript.v.1.2.txt')
    with open(transcript, 'r') as f:
        for entry in f.read().splitlines():
            wav_name, _, _, text, _ = entry.split('|')
            # derive the .wav filename from the transcript field
            wav_name = wav_name[2:-4] + '.wav'
            wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
            wav = read_wav_np(wav_path)
            # drop clips at or above the configured maximum duration
            if (len(wav) / hp.audio.sr) < hp.audio.duration:
                self.dataset.append((wav_path, text))
    random.seed(123)  # reproducible shuffle and split
    random.shuffle(self.dataset)
    split_at = int(0.95 * len(self.dataset))
    if train:
        self.dataset = self.dataset[:split_at]
    else:
        self.dataset = self.dataset[split_at:]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
class AudioOnlyDataset(Dataset):
    """Dataset yielding (source, target) mel-tier pairs from raw audio.

    Files are discovered recursively under hp.data.path, shuffled with a
    fixed seed, and split 95/5 into train/test. Each item is cut to
    self.wavlen samples before mel extraction.
    """

    def __init__(self, hp, args, train):
        # hp: hyperparameters; args: CLI args (needs .tier);
        # train: True -> first 95% of files, False -> last 5%.
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.file_list = []
        # for i, f in enumerate(glob.glob(os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)):
        #     wav = read_wav_np(f)
        #     duraton = (len(wav)/hp.audio.sr)
        #     if duraton < hp.audio.duration:
        #         self.file_list.append(f)
        self.file_list = glob.glob(os.path.join(hp.data.path, '**',
                                                hp.data.extension),
                                   recursive=True)
        random.seed(123)  # deterministic shuffle/split
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 *
                                                 len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 *
                                                len(self.file_list)):]
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # Load, trim to the fixed sample length, and cut the normalized mel
        # into the (source, target) pair for the configured tier.
        wav = read_wav_np(self.file_list[idx])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        return source, target
class CompleteAudioTextDatasetv3(AudioTextDataset):
    """Audio/text dataset driven by pre-filtered Blizzard CSV manifests.

    Overrides __init__ (CSV loading instead of transcript parsing) and
    __getitem__ (seq_to_array encoding, no waveform cutting); __len__ is
    inherited from AudioTextDataset.
    """

    def __init__(self, hp, args, train):
        # train: True reads blizzard_train.csv, False reads blizzard_test.csv.
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.root_dir = hp.data.path
        self.dataset = []
        txt_path = os.path.join(
            self.root_dir,
            'blizzard_train.csv' if train else 'blizzard_test.csv')
        # open file in read mode
        with open(txt_path, 'r') as read_obj:
            csv_reader = csv.reader(read_obj)
            headers = next(csv_reader)  # skip the CSV header row
            for row in csv_reader:
                [original_sentence, parsed_sentence, wav_path,
                 wav_length] = row
                # keep utterances between 0.4s and the configured maximum
                if float(wav_length) < hp.audio.duration and float(
                        wav_length) > 0.4:
                    self.dataset.append((wav_path, parsed_sentence))
        # Seeded shuffle: deterministic ordering on every run.
        random.Random(123).shuffle(self.dataset)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __getitem__(self, idx):
        # Returns (seq, source, target): the encoded sentence plus the mel
        # source/target pair for the configured tier.
        sentence = self.dataset[idx][1]
        seq = seq_to_array(sentence)
        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        return seq, source, target
class MelNet(nn.Module):
    """Multi-tier autoregressive mel-spectrogram model (class-conditional variant).

    Tier 1 generates the coarsest mel autoregressively; tiers 2..N upsample
    by interleaving their output with the previous result. Tiers are stored
    1-indexed behind a None placeholder, each wrapped in DataParallel on CUDA.
    """

    def __init__(self, hp, args, infer_hp):
        super(MelNet, self).__init__()
        self.hp = hp
        self.args = args
        self.infer_hp = infer_hp
        self.f_div = f_div[hp.model.tier + 1]
        self.t_div = t_div[hp.model.tier]
        self.n_mels = hp.audio.n_mels
        self.tierutil = TierUtil(hp)
        if infer_hp.conditional:
            # Conditional stack: TTS front-end as tier 1, plain tiers above.
            self.tiers = [
                TTS(hp=hp,
                    freq=hp.audio.n_mels // self.f_div * f_div[1],
                    layers=hp.model.layers[0])
            ] + [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier) for tier in range(2, hp.model.tier + 1)
            ]
        else:
            # Unconditional stack; num_class=10 is hard-coded here —
            # NOTE(review): presumably the number of class labels; confirm.
            self.tiers = [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier,
                     num_class=10) for tier in range(1, hp.model.tier + 1)
            ]
        # Placeholder at index 0 keeps tiers addressable as self.tiers[n].
        self.tiers = nn.ModuleList(
            [None] + [nn.DataParallel(tier).cuda() for tier in self.tiers])

    def forward(self, x, tier_num):
        # Dispatch x to the requested tier (1-indexed).
        assert tier_num > 0, 'tier_num should be larger than 0, got %d' % tier_num
        return self.tiers[tier_num](x)

    def sample(self, condition):
        """Autoregressively sample a mel; `condition` may seed tier-1 input.

        With timestep == 0, a single tier-1 pass over the condition is
        returned. Otherwise tier 1 fills one time column per outer step,
        one frequency bin per inner step, then tiers 2..N upsample.
        """
        x = None
        if condition is not None:
            # seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
            x = condition
        else:
            seq = torch.LongTensor([[0]])
        # input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
        if x is not None:
            audio_lengths = torch.LongTensor([x.size()[-1]]).cuda()
        else:
            audio_lengths = torch.LongTensor([0]).cuda()
        ## Tier 1 ##
        tqdm.write('Tier 1')
        if self.args.timestep == 0:
            # No generation requested: one forward pass over the condition.
            mu, std, pi = self.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            return temp
        for t in tqdm(range(self.args.timestep // self.t_div)):
            audio_lengths += 1
            # Append an empty time column to be filled bin-by-bin.
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    # mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                    break
                else:
                    mu, std, pi = self.tiers[1](x, audio_lengths)
                    temp = sample_gmm(mu, std, pi)
                    # write only the newest column's bin m
                    new_idx = audio_lengths.item() - 1
                    x[:, m, new_idx] = temp[:, m, new_idx]
        ## Tier 2~N ##
        for tier in tqdm(range(2, self.hp.model.tier + 1)):
            tqdm.write('Tier %d' % tier)
            mu, std, pi = self.tiers[tier](x)
            temp = sample_gmm(mu, std, pi)
            # interleave the sampled tier with the accumulated result
            x = self.tierutil.interleave(x, temp, tier + 1)
        return x

    def load_tiers(self):
        """Load each tier's weights from self.infer_hp.checkpoints in order."""
        for idx, chkpt_path in enumerate(self.infer_hp.checkpoints):
            checkpoint = torch.load(chkpt_path)
            hp = load_hparam_str(checkpoint['hp_str'])
            # warn (don't fail) on hyperparameter mismatch
            if self.hp != hp:
                print('Warning: hp different in file %s' % chkpt_path)
            self.tiers[idx + 1].load_state_dict(checkpoint['model'])

    def sample_dependence(self, condition, label, dependence_length):
        """Sample tier 1 with a sliding context window of `dependence_length`.

        label: int or sequence — converted to a LongTensor class label.
        Hidden states (h_t, h_c) are captured at bin 0 of each column and
        reused for the remaining bins of that column.
        """
        x = None
        if condition is not None:
            # seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
            x = condition
        else:
            seq = torch.LongTensor([[0]])
        if x is not None:
            audio_lengths = torch.LongTensor([x.size()[-1]]).cuda()
        else:
            audio_lengths = torch.LongTensor([0]).cuda()
        for t in tqdm(range(self.args.timestep // self.t_div)):
            # audio_lengths += 1
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    # mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                    break
                else:
                    # normalize the label to a LongTensor either way
                    class_label = torch.tensor(
                        label, dtype=torch.long) if isinstance(
                            label, int) else torch.LongTensor(label)
                    if m == 0:
                        # first bin of the column: run with fresh hidden
                        # state and ask the tier to save it
                        mu, std, pi, h_t, h_c = self.tiers[1](
                            x[:, :, -dependence_length:],
                            audio_lengths,
                            class_label.cuda(non_blocking=True).unsqueeze(0),
                            save_hidden=True,
                            hidden_t=None,
                            hidden_c=None)
                    else:
                        # later bins reuse the saved hidden state
                        mu, std, pi = self.tiers[1](
                            x[:, :, -dependence_length:],
                            audio_lengths,
                            class_label.cuda(non_blocking=True).unsqueeze(0),
                            save_hidden=False,
                            hidden_t=h_t,
                            hidden_c=h_c)
                    temp = sample_gmm(mu, std, pi)
                    new_idx = audio_lengths.item() - 1
                    # write into the last (newest) column only
                    x[:, m, -1] = temp[:, m, new_idx]
        return x
class AudioTextDataset(Dataset):
    """Dataset of (text sequence, source mel, target mel) for KSS or Blizzard.

    hp.data.name selects the on-disk layout; anything else raises
    NotImplementedError. Data is shuffled with a fixed seed and split 95/5.
    """

    def __init__(self, hp, args, train):
        # hp: hyperparameters; args: CLI args (needs .tier);
        # train: True -> first 95% of the shuffled data, else last 5%.
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.root_dir = hp.data.path
        self.dataset = []
        if hp.data.name == 'KSS':
            # KSS transcript fields: wav|_|_|text|length|_ (pipe-separated)
            with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'),
                      'r') as f:
                lines = f.read().splitlines()
                for line in tqdm(lines):
                    wav_name, _, _, text, length, _ = line.split('|')
                    wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                    duraton = float(length)
                    # keep only clips shorter than the configured duration
                    if duraton < hp.audio.duration:
                        self.dataset.append((wav_path, text))
                    # if len(self.dataset) > 100: break
        elif hp.data.name.startswith('Blizzard'):
            # prompts.gui repeats every 3 lines: filename, sentence, (unused)
            with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
                lines = f.read().splitlines()
                filenames = lines[::3]
                sentences = lines[1::3]
                for filename, sentence in tqdm(zip(filenames, sentences),
                                               total=len(filenames)):
                    wav_path = os.path.join(self.root_dir, 'wavn',
                                            filename + '.wav')
                    length = get_length(wav_path, hp.audio.sr)
                    if length < hp.audio.duration:
                        self.dataset.append((wav_path, sentence))
        else:
            raise NotImplementedError
        random.seed(123)  # deterministic shuffle/split
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Returns (seq, source, target); text encoding depends on dataset.
        text = self.dataset[idx][1]
        if self.hp.data.name == 'KSS':
            seq = text_to_sequence(text)
        elif self.hp.data.name.startswith('Blizzard'):
            seq = process_blizzard(text)
        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        # print(text)
        return seq, source, target
class MelNet(nn.Module):
    """Multi-tier autoregressive mel model with a text-conditional tier 1.

    Tier 1 (a TTS module when infer_hp.conditional) generates the coarsest
    mel column-by-column; tiers 2..N progressively upsample by interleaving.
    Tiers are stored 1-indexed behind a None placeholder, each wrapped in
    DataParallel on CUDA.
    """

    def __init__(self, hp, args, infer_hp):
        super(MelNet, self).__init__()
        self.hp = hp
        self.args = args
        self.infer_hp = infer_hp
        self.f_div = f_div[hp.model.tier + 1]
        self.t_div = t_div[hp.model.tier]
        self.n_mels = hp.audio.n_mels
        self.tierutil = TierUtil(hp)
        if infer_hp.conditional:
            # Conditional stack: TTS front-end as tier 1, plain tiers above.
            self.tiers = [
                TTS(hp=hp,
                    freq=hp.audio.n_mels // self.f_div * f_div[1],
                    layers=hp.model.layers[0])
            ] + [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier) for tier in range(2, hp.model.tier + 1)
            ]
        else:
            self.tiers = [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier) for tier in range(1, hp.model.tier + 1)
            ]
        # Placeholder at index 0 keeps tiers addressable as self.tiers[n].
        self.tiers = nn.ModuleList(
            [None] + [nn.DataParallel(tier).cuda() for tier in self.tiers])

    def forward(self, x, tier_num):
        # Dispatch x to the requested tier (1-indexed).
        assert tier_num > 0, 'tier_num should be larger than 0, got %d' % tier_num
        return self.tiers[tier_num](x)

    def sample(self, condition):
        """Generate a mel conditioned on text.

        condition: text passed through process_blizzard to build the input
        sequence. Tier 1 fills one time column per outer step and one
        frequency bin per inner step; tiers 2..N then upsample.
        """
        x = None
        seq = torch.from_numpy(process_blizzard(condition)).long().unsqueeze(0)
        input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
        audio_lengths = torch.LongTensor([0]).cuda()
        ## Tier 1 ##
        tqdm.write('Tier 1')
        for t in tqdm(range(self.args.timestep // self.t_div)):
            audio_lengths += 1
            # Append an empty time column to be filled bin-by-bin.
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    mu, std, pi, _ = self.tiers[1](x, seq, input_lengths,
                                                   audio_lengths)
                else:
                    mu, std, pi = self.tiers[1](x, audio_lengths)
                temp = sample_gmm(mu, std, pi)
                # write bin m of the current (t-th) column only
                x[:, m, t] = temp[:, m, t]
        ## Tier 2~N ##
        for tier in tqdm(range(2, self.hp.model.tier + 1)):
            tqdm.write('Tier %d' % tier)
            mu, std, pi = self.tiers[tier](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            # interleave the sampled tier with the accumulated result
            x = self.tierutil.interleave(x, temp, tier + 1)
        return x

    def load_tiers(self):
        """Load each tier's weights from self.infer_hp.checkpoints in order."""
        for idx, chkpt_path in enumerate(self.infer_hp.checkpoints):
            checkpoint = torch.load(chkpt_path)
            hp = load_hparam_str(checkpoint['hp_str'])
            # warn (don't fail) on hyperparameter mismatch
            if self.hp != hp:
                print('Warning: hp different in file %s' % chkpt_path)
            # print("Looking for:")
            # print(chkpt_path)
            # print("Tier")
            # print(idx+1)
            # # print(self.tiers)
            # print(self.tiers[idx+1])
            self.tiers[idx + 1].load_state_dict(checkpoint['model'])