def reconstruct_audio(filename, tier_to_breakdown):
    """Reassemble the full mel spectrogram from its per-tier (source, target)
    breakdown, verify each interleave step, and synthesize audio.

    Args:
        filename: base name used when saving the reconstructed audio.
        tier_to_breakdown: dict mapping tier number -> (source, target)
            numpy arrays, as produced by deconstruct_audio().
    """
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    final_reconstruction = None
    # Verify that tier 2 is conditionally generated from just tier 1.
    # BUG FIX: the assert previously read a module-level `breakdown` global
    # instead of the `tier_to_breakdown` parameter.
    assert (tier_to_breakdown[2][0] == tier_to_breakdown[1][1]
            ).all(), "Tier 2 not created from Tier 1"
    for tier in range(2, 7):
        source = tier_to_breakdown[tier][0]
        target = tier_to_breakdown[tier][1]
        source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        reconstructed_mel_tensor = tierutil.interleave(source_tensor, target_tensor, tier + 1)
        reconstructed_mel = reconstructed_mel_tensor.numpy()[0]
        # Interleaving source+target of tier N must reproduce the source of tier N+1.
        if tier < 6:
            next_tier = tier_to_breakdown[tier + 1][0]
            assert (reconstructed_mel == next_tier).all(
            ), "Tier %d not created from Tier %d" % (tier + 1, tier)
        else:
            # Tier 6 interleave yields the fully reconstructed mel.
            final_reconstruction = reconstructed_mel
    print('reconstructing audio...')
    reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
    melgen.save_audio('reconstructed_' + filename, reconstructed_audio)
def __init__(self, hp, args, train):
    """Collect audio files under hp.data.path and apply a deterministic
    95/5 train/test split.

    Args:
        hp: hyperparameter object (audio/data sections used here).
        args: CLI args; only args.tier is read.
        train: True -> keep first 95% of the shuffled files, else last 5%.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.file_list = glob.glob(
        os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)
    # Fixed seed so the train/test split is reproducible across runs.
    random.seed(123)
    random.shuffle(self.file_list)
    if train:
        self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
    else:
        self.file_list = self.file_list[int(0.95 * len(self.file_list)):]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    # Removed a redundant second construction of MelGen/TierUtil that
    # duplicated the assignments at the top of this method.
def __init__(self, hp, args, train):
    """Build a wav-path list from the Blizzard train/test CSV manifest,
    keeping clips with 0.4 < length < hp.audio.duration seconds.

    Args:
        hp: hyperparameter object (audio/data sections used here).
        args: CLI args; only args.tier is read.
        train: selects blizzard_train.csv vs blizzard_test.csv.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.file_list = []
    self.root_dir = hp.data.path
    txt_path = os.path.join(
        self.root_dir,
        'blizzard_train.csv' if train else 'blizzard_test.csv')
    with open(txt_path, 'r') as read_obj:
        csv_reader = csv.reader(read_obj)
        next(csv_reader)  # skip the header row
        for row in csv_reader:
            [original_sentence, parsed_sentence, wav_path, wav_length] = row
            wav_len = float(wav_length)  # parse once instead of twice
            if 0.4 < wav_len < hp.audio.duration:
                self.file_list.append(wav_path)
    # Just to ensure the data always comes in the right order
    random.Random(123).shuffle(self.file_list)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    # Removed a redundant second construction of MelGen/TierUtil that
    # duplicated the assignments at the top of this method.
def __init__(self, hp, args, train):
    """Glob all audio files under hp.data.path and apply a deterministic
    95/5 train/test split.

    Args:
        hp: hyperparameter object (audio/data sections used here).
        args: CLI args; only args.tier is read.
        train: True -> keep first 95% of the shuffled files, else last 5%.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.file_list = glob.glob(
        os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)
    # Fixed seed so the train/test split is reproducible across runs.
    random.seed(123)
    random.shuffle(self.file_list)
    if train:
        self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
    else:
        self.file_list = self.file_list[int(0.95 * len(self.file_list)):]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    # Removed: a dead `self.file_list = []` assignment that was immediately
    # overwritten, commented-out per-file duration filtering, and a redundant
    # second construction of MelGen/TierUtil.
def __init__(self, hp, args, train):
    """Build (wav_path, parsed_sentence) pairs from the Blizzard CSV
    manifests, keeping clips with 0.4 < length < hp.audio.duration.

    Args:
        hp: hyperparameter object (audio/data sections used here).
        args: CLI args; only args.tier is read.
        train: selects blizzard_train.csv vs blizzard_test.csv.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.root_dir = hp.data.path
    self.dataset = []
    txt_path = os.path.join(
        self.root_dir,
        'blizzard_train.csv' if train else 'blizzard_test.csv')
    # open file in read mode
    with open(txt_path, 'r') as read_obj:
        csv_reader = csv.reader(read_obj)
        next(csv_reader)  # skip the header row
        for row in csv_reader:
            [original_sentence, parsed_sentence, wav_path, wav_length] = row
            wav_len = float(wav_length)  # parse once instead of twice
            if 0.4 < wav_len < hp.audio.duration:
                self.dataset.append((wav_path, parsed_sentence))
    # Deterministic shuffle; no split here — the CSVs are already split.
    random.Random(123).shuffle(self.dataset)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    # Removed a redundant second construction of MelGen/TierUtil that
    # duplicated the assignments at the top of this method.
def deconstruct_audio(wav):
    """Split a waveform's normalized mel spectrogram into per-tier
    (source, target) pairs; entry 7 holds the full mel twice."""
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    mel = melgen.get_normalized_mel(wav)
    breakdown = {}
    for level in range(1, 7):
        src, tgt = tierutil.cut_divide_tiers(mel, level)
        print("Tier %d has source dims: %s, target dims %s" % (level, src.shape, tgt.shape))
        breakdown[level] = (src, tgt)
    breakdown[7] = (mel, mel)
    return breakdown
def __init__(self, hp, args, train):
    """Build (wav_path, text) pairs for the KSS or Blizzard corpus, keep
    clips shorter than hp.audio.duration, then apply a deterministic
    95/5 train/test split.

    Raises:
        NotImplementedError: if hp.data.name is neither 'KSS' nor 'Blizzard*'.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.root_dir = hp.data.path
    self.dataset = []
    if hp.data.name == 'KSS':
        with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'), 'r') as f:
            lines = f.read().splitlines()
            for line in tqdm(lines):
                wav_name, _, _, text, length, _ = line.split('|')
                wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                # Transcript already carries the clip length; no audio load needed.
                duration = float(length)
                if duration < hp.audio.duration:
                    self.dataset.append((wav_path, text))
                # if len(self.dataset) > 100: break
    elif hp.data.name.startswith('Blizzard'):
        with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
            lines = f.read().splitlines()
            # prompts.gui stores 3-line records: filename, sentence, metadata.
            filenames = lines[::3]
            sentences = lines[1::3]
            for filename, sentence in tqdm(zip(filenames, sentences), total=len(filenames)):
                wav_path = os.path.join(self.root_dir, 'wavn', filename + '.wav')
                length = get_length(wav_path, hp.audio.sr)
                if length < hp.audio.duration:
                    self.dataset.append((wav_path, sentence))
    else:
        raise NotImplementedError
    # Fixed seed so the train/test split is reproducible across runs.
    random.seed(123)
    random.shuffle(self.dataset)
    if train:
        self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
    else:
        self.dataset = self.dataset[int(0.95 * len(self.dataset)):]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    # Removed a redundant second construction of MelGen/TierUtil; also fixed
    # the 'duraton' local-variable typo.
def __init__(self, hp, args, train):
    """Load (wav_path, sentence) pairs from pre-filtered Blizzard prompt
    files.

    The .gui file alternates wav-path lines and sentence lines. Length
    filtering was already applied when the prompt files were generated,
    so every pair is accepted here.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.root_dir = hp.data.path
    self.dataset = []
    txt_path = os.path.join(
        self.root_dir,
        'complete_blizzard/train_prompts.gui' if train else 'complete_blizzard/test_prompts.gui')
    with open(txt_path, 'r') as f:
        lines = f.read().splitlines()
        # Even lines are wav paths, odd lines are their sentences.
        wav_paths = lines[::2]
        sentences = lines[1::2]
        for wav_path, sentence in tqdm(
                zip(wav_paths, sentences),
                desc='Audio/text data loader for %s' % txt_path,
                total=len(wav_paths)):
            # Length filtering skipped: the dataset was already filtered.
            self.dataset.append((wav_path, sentence))
    # Fixed seed so the ordering is reproducible across runs.
    random.seed(123)
    random.shuffle(self.dataset)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    # Removed two large commented-out dead code blocks (old txt-glob loader
    # and per-file length filtering) and a redundant second construction of
    # MelGen/TierUtil.
class AudioOnlyDataset(Dataset):
    """Map-style dataset yielding (source, target) mel pairs for the
    configured tier from raw wav files found under hp.data.path."""

    def __init__(self, hp, args, train):
        """Glob audio files and apply a deterministic 95/5 train/test split."""
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.file_list = glob.glob(
            os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)
        # Fixed seed so the split is reproducible across runs.
        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        # Removed a redundant second construction of MelGen/TierUtil.

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load one wav, normalize to mel, and split into (source, target)
        for the configured tier."""
        wav = read_wav_np(self.file_list[idx], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        # Removed a large commented-out reconstruction/debug block that saved
        # intermediate images and audio.
        return source, target
class AudioTextDataset(Dataset):
    """Map-style dataset yielding (seq, source, target) triples built from a
    pipe-delimited transcript file, filtered by actual wav duration."""

    def __init__(self, hp, args, train):
        """Read the transcript, keep clips shorter than hp.audio.duration,
        then apply a deterministic 95/5 train/test split."""
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.root_dir = hp.data.path
        self.dataset = []
        with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'), 'r') as f:
            lines = f.read().splitlines()
            for line in lines:
                wav_name, _, _, text, _ = line.split('|')
                # Drop the first two chars and the old extension — format
                # specific to this transcript; TODO confirm prefix meaning.
                wav_name = wav_name[2:-4] + '.wav'
                wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
                # NOTE(review): loads every wav just to measure duration,
                # which makes init I/O-heavy.
                wav = read_wav_np(wav_path)
                duration = (len(wav) / hp.audio.sr)
                if duration < hp.audio.duration:
                    self.dataset.append((wav_path, text))
                #if len(self.dataset) > 100: break
        # Fixed seed so the split is reproducible across runs.
        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        # Removed a redundant second construction of MelGen/TierUtil; also
        # fixed the 'duraton' local-variable typo.

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode the text and split the clip's mel into (source, target)."""
        text = self.dataset[idx][1]
        seq = text_to_sequence(text)
        wav = read_wav_np(self.dataset[idx][0])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        return seq, source, target
def __init__(self, hp, args, train):
    """Load wav paths from pre-filtered Blizzard prompt files (audio only).

    The .gui file alternates wav-path lines and sentence lines; only the
    wav paths are kept here.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.file_list = []
    txt_path = 'datasets/complete_blizzard/train_prompts.gui' if train else 'datasets/complete_blizzard/test_prompts.gui'
    with open(txt_path, 'r') as f:
        lines = f.read().splitlines()
        # Even lines of the .gui file are wav paths (odd lines are sentences).
        wav_paths = lines[::2]
        for wav_path in tqdm(wav_paths, desc='Audio data loader', total=len(wav_paths)):
            # Length filtering skipped: the dataset was already filtered.
            self.file_list.append(wav_path)
    # Just to ensure the data always comes in the right order
    random.seed(123)
    random.shuffle(self.file_list)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    # Removed the commented-out glob-based loader dead code and a redundant
    # second construction of MelGen/TierUtil.
def __init__(self, hp, args, train):
    """Build (wav_path, text) pairs from transcript.v.1.2.txt, filtered by
    actual wav duration, with a deterministic 95/5 train/test split."""
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # this will search all files within hp.data.path
    self.root_dir = hp.data.path
    self.dataset = []
    with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'), 'r') as f:
        lines = f.read().splitlines()
        for line in lines:
            wav_name, _, _, text, _ = line.split('|')
            # Drop the first two chars and the old extension — format
            # specific to this transcript; TODO confirm prefix meaning.
            wav_name = wav_name[2:-4] + '.wav'
            wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
            # NOTE(review): loads every wav just to measure duration,
            # which makes init I/O-heavy.
            wav = read_wav_np(wav_path)
            duration = (len(wav) / hp.audio.sr)
            if duration < hp.audio.duration:
                self.dataset.append((wav_path, text))
            #if len(self.dataset) > 100: break
    # Fixed seed so the split is reproducible across runs.
    random.seed(123)
    random.shuffle(self.dataset)
    if train:
        self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
    else:
        self.dataset = self.dataset[int(0.95 * len(self.dataset)):]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
    # Removed a redundant second construction of MelGen/TierUtil; also fixed
    # the 'duraton' local-variable typo.
class AudioOnlyDataset(Dataset):
    """Map-style dataset yielding (source, target) mel pairs for the
    configured tier from wavs found under hp.data.path."""

    def __init__(self, hp, args, train):
        """Glob audio files and apply a deterministic 95/5 train/test split."""
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.file_list = glob.glob(
            os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)
        # Fixed seed so the split is reproducible across runs.
        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        # Removed the commented-out per-file duration filtering dead code and
        # a redundant second construction of MelGen/TierUtil.

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load one wav, trim it, normalize to mel, and split into
        (source, target) for the configured tier."""
        wav = read_wav_np(self.file_list[idx])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        return source, target
class CompleteAudioTextDatasetv3(AudioTextDataset):
    """AudioTextDataset variant driven by the Blizzard CSV manifests; encodes
    sentences with seq_to_array instead of the parent's text encoder."""

    def __init__(self, hp, args, train):
        """Build (wav_path, parsed_sentence) pairs from the CSV manifest,
        keeping clips with 0.4 < length < hp.audio.duration seconds."""
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.root_dir = hp.data.path
        self.dataset = []
        txt_path = os.path.join(
            self.root_dir,
            'blizzard_train.csv' if train else 'blizzard_test.csv')
        # open file in read mode
        with open(txt_path, 'r') as read_obj:
            csv_reader = csv.reader(read_obj)
            next(csv_reader)  # skip the header row
            for row in csv_reader:
                [original_sentence, parsed_sentence, wav_path, wav_length] = row
                wav_len = float(wav_length)  # parse once instead of twice
                if 0.4 < wav_len < hp.audio.duration:
                    self.dataset.append((wav_path, parsed_sentence))
        # Deterministic shuffle; no split here — the CSVs are already split.
        random.Random(123).shuffle(self.dataset)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        # Removed a redundant second construction of MelGen/TierUtil.

    def __getitem__(self, idx):
        """Encode the parsed sentence and split the clip's mel into
        (source, target) for the configured tier."""
        sentence = self.dataset[idx][1]
        seq = seq_to_array(sentence)
        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        return seq, source, target
# NOTE(review): fragment — the first half duplicates the tail of
# reconstruct_audio() (the per-tier loop body is cut at the top), followed by
# a top-level driver script. Indentation below is reconstructed; confirm
# against the original file layout.
source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
reconstructed_mel_tensor = tierutil.interleave(source_tensor, target_tensor, tier + 1)
reconstructed_mel = reconstructed_mel_tensor.numpy()[0]
# Verify that interleaving the source and target of the current tier conditionally generates the source of the next tier
if tier < 6:
    next_tier = tier_to_breakdown[tier + 1][0]
    assert (reconstructed_mel == next_tier).all(
    ), "Tier %d not created from Tier %d" % (tier + 1, tier)
else:
    # Final tier: the interleave result is the fully reconstructed mel.
    final_reconstruction = reconstructed_mel
print('reconstructing audio...')
reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
melgen.save_audio('reconstructed_' + filename, reconstructed_audio)

# Driver: deconstruct the first audio file, verify and reconstruct it, then
# save both the original and the reconstructed audio for comparison.
breakdown = None
audio_files = get_audio()
for filename, wav in audio_files:
    breakdown = deconstruct_audio(wav)
    reconstruct_audio(filename, breakdown)
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    melgen.save_audio('original_' + filename, wav)
    print('')
    print('')
    # Only the first file is processed.
    break
# inference_breakdown[i][0] # save_image('tier%d_inferred_breakdown_%s.png' % (i, filename), inference_breakdown[i][0]) # save_image('final_inferred_%s.png' % filename, inferred) tier = 5 source = breakdown[tier][0] print("Source tier 5 shape: %s" % str(source.shape)) save_image('source_tier_%d_%s.png' % (tier, filename), breakdown[tier][0]) inferred_source_6, inferred_5 = run_inference_on_tier(source, tier, text, timestep) print("inferred tier 5 target shape: %s" % str(inferred_5.shape)) print("inferred tier 6 source shape: %s" % str(inferred_source_6.shape)) tier = 6 inferred_final, inferred_6 = run_inference_on_tier(inferred_source_6, tier, text, timestep) print("inferred tier 6 target shape: %s" % str(inferred_6.shape)) print("inferred final shape: %s" % str(inferred_final.shape)) print("original final shape: %s" % str(breakdown[tier+1][0].shape)) save_image('target_tier_%d_%s.png' % (tier, filename), breakdown[tier][1]) save_image('next_tier_%d_%s.png' % (tier, filename), breakdown[tier+1][0]) save_image('inferred_tier_%d_%s.png' % (tier, filename), inferred_6) save_image('inferred_next_tier_%d_%s.png' % (tier, filename), inferred_final) # Save the actual audio hp = HParam('./config/blizzard_compressed_experiments.yaml') melgen = MelGen(hp) source_wav = melgen.reconstruct_audio(breakdown[tier+1][0]) inference_wav = melgen.reconstruct_audio(inferred_final) melgen.save_audio('source_'+filename, source_wav) melgen.save_audio('inference_'+filename, inference_wav) break
# NOTE(review): fragment — the opening of parse_args() (the parser setup and
# the add_argument call these first two lines belong to) is not visible here;
# indentation is reconstructed.
        required=False,
        help="Input for conditional generation, leave empty for unconditional")
    # Parse and return the CLI namespace.
    return parser.parse_args(args)


if __name__ == '__main__':
    args = parse_args(sys.argv[1:])
    hp = HParam(args.config)
    infer_hp = HParam(args.infer_config)
    # The requested timestep must be divisible by the top tier's time divisor.
    assert args.timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)
    melgen = MelGen(hp)
    # Load all tiers of the model and run sampling without gradients.
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()
    with torch.no_grad():
        generated = model.sample(args.input)
    os.makedirs('temp', exist_ok=True)
    # Persist the raw generated tensor plus a spectrogram image for inspection.
    torch.save(generated, os.path.join('temp', args.name + '.pt'))
    spectrogram = plot_spectrogram_to_numpy(
        generated[0].cpu().detach().numpy())
    plt.imsave(os.path.join('temp', args.name + '.png'),
               spectrogram.transpose((1, 2, 0)))
    # waveform, wavespec = Reconstruct(hp).inverse(generated[0])
def save_audio(filename, final_reconstruction):
    """Synthesize audio from a reconstructed mel spectrogram and write it
    under temp/ with a 'reconstructed_' prefix."""
    params = HParam('./config/blizzard_alldata_v5.yaml')
    generator = MelGen(params)
    audio = generator.reconstruct_audio(final_reconstruction)
    generator.save_audio('temp/reconstructed_' + filename, audio)
class AudioTextDataset(Dataset):
    """Map-style dataset yielding (seq, source, target) for the KSS or
    Blizzard corpora; the text encoder is chosen per-corpus in __getitem__."""

    def __init__(self, hp, args, train):
        """Build (wav_path, text) pairs, keep clips shorter than
        hp.audio.duration, then apply a deterministic 95/5 split.

        Raises:
            NotImplementedError: if hp.data.name is neither 'KSS' nor
                'Blizzard*'.
        """
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        # this will search all files within hp.data.path
        self.root_dir = hp.data.path
        self.dataset = []
        if hp.data.name == 'KSS':
            with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'), 'r') as f:
                lines = f.read().splitlines()
                for line in tqdm(lines):
                    wav_name, _, _, text, length, _ = line.split('|')
                    wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                    # Transcript already carries the clip length; no audio load.
                    duration = float(length)
                    if duration < hp.audio.duration:
                        self.dataset.append((wav_path, text))
                    # if len(self.dataset) > 100: break
        elif hp.data.name.startswith('Blizzard'):
            with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
                lines = f.read().splitlines()
                # prompts.gui stores 3-line records: filename, sentence, metadata.
                filenames = lines[::3]
                sentences = lines[1::3]
                for filename, sentence in tqdm(zip(filenames, sentences), total=len(filenames)):
                    wav_path = os.path.join(self.root_dir, 'wavn', filename + '.wav')
                    length = get_length(wav_path, hp.audio.sr)
                    if length < hp.audio.duration:
                        self.dataset.append((wav_path, sentence))
        else:
            raise NotImplementedError
        # Fixed seed so the split is reproducible across runs.
        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        # Removed a redundant second construction of MelGen/TierUtil; also
        # fixed the 'duraton' local-variable typo.

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode the text with the corpus-appropriate encoder and split the
        clip's mel into (source, target) for the configured tier."""
        text = self.dataset[idx][1]
        if self.hp.data.name == 'KSS':
            seq = text_to_sequence(text)
        elif self.hp.data.name.startswith('Blizzard'):
            seq = process_blizzard(text)
        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        # print(text)
        return seq, source, target