def __init__(self, hp, args, train):
    """Load the Blizzard (text, wav) dataset from a pre-built CSV split.

    Args:
        hp: hyperparameter object; reads hp.data.path, hp.audio.duration,
            hp.audio.sr.
        args: command-line args; reads args.tier.
        train: if True load 'blizzard_train.csv', else 'blizzard_test.csv'.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.root_dir = hp.data.path
    self.dataset = []
    txt_path = os.path.join(
        self.root_dir,
        'blizzard_train.csv' if train else 'blizzard_test.csv')
    with open(txt_path, 'r') as read_obj:
        csv_reader = csv.reader(read_obj)
        next(csv_reader)  # skip the header row
        for row in csv_reader:
            # Columns: original sentence, parsed sentence, wav path, length (s).
            _, parsed_sentence, wav_path, wav_length = row
            # Keep only clips strictly between 0.4s and the configured maximum.
            if 0.4 < float(wav_length) < hp.audio.duration:
                self.dataset.append((wav_path, parsed_sentence))
    # Seeded shuffle so every run sees the same (reproducible) ordering.
    random.Random(123).shuffle(self.dataset)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
def __init__(self, hp, args, train):
    """Build a 95/5 train/test file list by globbing hp.data.path.

    Args:
        hp: hyperparameter object; reads hp.data.path, hp.data.extension,
            hp.audio.duration, hp.audio.sr.
        args: command-line args; reads args.tier.
        train: if True keep the first 95% of the shuffled list, else the rest.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # Recursively collect every matching audio file under hp.data.path.
    self.file_list = glob.glob(
        os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)
    # Fixed seed so the train/test split is identical across runs.
    random.seed(123)
    random.shuffle(self.file_list)
    split = int(0.95 * len(self.file_list))
    if train:
        self.file_list = self.file_list[:split]
    else:
        self.file_list = self.file_list[split:]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
def __init__(self, hp, args, train):
    """Build a 95/5 train/test file list by globbing hp.data.path.

    Args:
        hp: hyperparameter object; reads hp.data.path, hp.data.extension,
            hp.audio.duration, hp.audio.sr.
        args: command-line args; reads args.tier.
        train: if True keep the first 95% of the shuffled list, else the rest.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    # Recursively collect every matching audio file under hp.data.path.
    self.file_list = glob.glob(
        os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)
    # Fixed seed so the train/test split is identical across runs.
    random.seed(123)
    random.shuffle(self.file_list)
    split = int(0.95 * len(self.file_list))
    if train:
        self.file_list = self.file_list[:split]
    else:
        self.file_list = self.file_list[split:]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
def __init__(self, hp, args, train):
    """Load a wav-path-only Blizzard dataset from a pre-built CSV split.

    Args:
        hp: hyperparameter object; reads hp.data.path, hp.audio.duration,
            hp.audio.sr.
        args: command-line args; reads args.tier.
        train: if True load 'blizzard_train.csv', else 'blizzard_test.csv'.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.file_list = []
    self.root_dir = hp.data.path
    txt_path = os.path.join(
        self.root_dir,
        'blizzard_train.csv' if train else 'blizzard_test.csv')
    with open(txt_path, 'r') as read_obj:
        csv_reader = csv.reader(read_obj)
        next(csv_reader)  # skip the header row
        for row in csv_reader:
            # Columns: original sentence, parsed sentence, wav path, length (s).
            _, _, wav_path, wav_length = row
            # Keep only clips strictly between 0.4s and the configured maximum.
            if 0.4 < float(wav_length) < hp.audio.duration:
                self.file_list.append(wav_path)
    # Seeded shuffle: the order is randomized but reproducible across runs.
    random.Random(123).shuffle(self.file_list)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
def __init__(self, hp, args, train):
    """Load (wav_path, text) pairs for the KSS or Blizzard corpora.

    Args:
        hp: hyperparameter object; reads hp.data.path, hp.data.name,
            hp.audio.duration, hp.audio.sr.
        args: command-line args; reads args.tier.
        train: if True keep the first 95% of the shuffled dataset,
            else the remaining 5%.

    Raises:
        NotImplementedError: if hp.data.name is neither 'KSS' nor a name
            starting with 'Blizzard'.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.root_dir = hp.data.path
    self.dataset = []
    if hp.data.name == 'KSS':
        # KSS transcript format: wav|_|_|text|length|_ (pipe-separated).
        with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'),
                  'r') as f:
            lines = f.read().splitlines()
        for line in tqdm(lines):
            wav_name, _, _, text, length, _ = line.split('|')
            wav_path = os.path.join(self.root_dir, 'kss', wav_name)
            duration = float(length)
            if duration < hp.audio.duration:
                self.dataset.append((wav_path, text))
    elif hp.data.name.startswith('Blizzard'):
        # prompts.gui stores records as 3-line groups: filename, sentence, _.
        with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
            lines = f.read().splitlines()
        filenames = lines[::3]
        sentences = lines[1::3]
        for filename, sentence in tqdm(zip(filenames, sentences),
                                       total=len(filenames)):
            wav_path = os.path.join(self.root_dir, 'wavn', filename + '.wav')
            length = get_length(wav_path, hp.audio.sr)
            if length < hp.audio.duration:
                self.dataset.append((wav_path, sentence))
    else:
        raise NotImplementedError
    # Fixed seed so the train/test split is identical across runs.
    random.seed(123)
    random.shuffle(self.dataset)
    split = int(0.95 * len(self.dataset))
    if train:
        self.dataset = self.dataset[:split]
    else:
        self.dataset = self.dataset[split:]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
def __init__(self, hp, args, train):
    """Load (wav_path, sentence) pairs from a pre-filtered prompts.gui split.

    The split files store records as 2-line groups: wav path, then sentence.
    Length filtering was already applied when the split files were created.

    Args:
        hp: hyperparameter object; reads hp.data.path, hp.audio.duration,
            hp.audio.sr.
        args: command-line args; reads args.tier.
        train: if True load the train prompts file, else the test one.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.root_dir = hp.data.path
    self.dataset = []
    txt_path = os.path.join(
        self.root_dir,
        'complete_blizzard/train_prompts.gui'
        if train else 'complete_blizzard/test_prompts.gui')
    with open(txt_path, 'r') as f:
        lines = f.read().splitlines()
    wav_paths = lines[::2]
    sentences = lines[1::2]
    for wav_path, sentence in tqdm(
            zip(wav_paths, sentences),
            desc='Audio/text data loader for %s' % txt_path,
            total=len(wav_paths)):
        # No duration filter here: the split files were pre-filtered.
        self.dataset.append((wav_path, sentence))
    # Fixed seed so the ordering is reproducible across runs.
    random.seed(123)
    random.shuffle(self.dataset)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
def reconstruct_audio(filename, tier_to_breakdown):
    """Rebuild audio from a tier breakdown and save it as 'reconstructed_<filename>'.

    Walks tiers 2..6, interleaving each tier's source/target to reproduce the
    next tier's source, asserting consistency at every step, then converts the
    final full mel back to audio.

    Args:
        filename: basename used for the saved output file.
        tier_to_breakdown: dict mapping tier number -> (source, target)
            numpy arrays, as produced by deconstruct_audio.
    """
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    final_reconstruction = None
    # Verify that tier 2 is conditionally generated from just tier 1.
    # BUGFIX: the assert previously referenced an undefined name 'breakdown'
    # instead of the 'tier_to_breakdown' parameter (NameError at runtime).
    assert (tier_to_breakdown[2][0] == tier_to_breakdown[1][1]).all(), \
        "Tier 2 not created from Tier 1"
    for tier in range(2, 7):
        source = tier_to_breakdown[tier][0]
        target = tier_to_breakdown[tier][1]
        source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        reconstructed_mel_tensor = tierutil.interleave(source_tensor,
                                                       target_tensor,
                                                       tier + 1)
        reconstructed_mel = reconstructed_mel_tensor.numpy()[0]
        # Interleaving source+target of this tier must reproduce the
        # source of the next tier; the last tier yields the full mel.
        if tier < 6:
            next_tier = tier_to_breakdown[tier + 1][0]
            assert (reconstructed_mel == next_tier).all(), \
                "Tier %d not created from Tier %d" % (tier + 1, tier)
        else:
            final_reconstruction = reconstructed_mel
    print('reconstructing audio...')
    reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
    melgen.save_audio('reconstructed_' + filename, reconstructed_audio)
def deconstruct_audio(wav, config_path='./config/blizzard_compressed_experiments.yaml'):
    """Decompose a waveform into per-tier (source, target) mel breakdowns.

    Args:
        wav: raw audio samples accepted by MelGen.get_normalized_mel.
        config_path: YAML config for HParam; defaults to the original
            hard-coded path, so existing callers are unaffected.

    Returns:
        dict mapping tier 1..6 -> (source, target) arrays from
        TierUtil.cut_divide_tiers, plus tier 7 -> (mel, mel), the full mel.
    """
    hp = HParam(config_path)
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    mel = melgen.get_normalized_mel(wav)
    tier_to_breakdown = {}
    for tier in range(1, 7):
        source, target = tierutil.cut_divide_tiers(mel, tier)
        print("Tier %d has source dims: %s, target dims %s" %
              (tier, source.shape, target.shape))
        tier_to_breakdown[tier] = (source, target)
    # Tier 7 holds the complete, uncut mel for convenience.
    tier_to_breakdown[7] = (mel, mel)
    return tier_to_breakdown
def __init__(self, hp, args, train):
    """Load wav paths from a pre-filtered prompts.gui split (audio only).

    The split files store records as 2-line groups: wav path, then sentence;
    only the wav paths are kept here. Length filtering was already applied
    when the split files were created.

    Args:
        hp: hyperparameter object; reads hp.data.path, hp.audio.duration,
            hp.audio.sr.
        args: command-line args; reads args.tier.
        train: if True load the train prompts file, else the test one.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.file_list = []
    txt_path = ('datasets/complete_blizzard/train_prompts.gui'
                if train else 'datasets/complete_blizzard/test_prompts.gui')
    with open(txt_path, 'r') as f:
        lines = f.read().splitlines()
    wav_paths = lines[::2]
    for wav_path in tqdm(wav_paths,
                         desc='Audio data loader',
                         total=len(wav_paths)):
        # No duration filter here: the split files were pre-filtered.
        self.file_list.append(wav_path)
    # Fixed seed so the ordering is reproducible across runs.
    random.seed(123)
    random.shuffle(self.file_list)
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
def __init__(self, hp, args, train):
    """Load (wav_path, text) pairs from an LJSpeech-style transcript file.

    Durations are computed by reading each wav, which makes construction
    slow on large corpora.

    Args:
        hp: hyperparameter object; reads hp.data.path, hp.audio.duration,
            hp.audio.sr.
        args: command-line args; reads args.tier.
        train: if True keep the first 95% of the shuffled dataset,
            else the remaining 5%.
    """
    self.hp = hp
    self.args = args
    self.train = train
    self.data = hp.data.path
    self.melgen = MelGen(hp)
    self.tierutil = TierUtil(hp)
    self.root_dir = hp.data.path
    self.dataset = []
    with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'), 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        # Transcript format: wav|_|_|text|_ (pipe-separated).
        wav_name, _, _, text, _ = line.split('|')
        # Strip the '1/' prefix and extension, then point at wavs/.
        wav_name = wav_name[2:-4] + '.wav'
        wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
        wav = read_wav_np(wav_path)
        duration = len(wav) / hp.audio.sr
        if duration < hp.audio.duration:
            self.dataset.append((wav_path, text))
    # Fixed seed so the train/test split is identical across runs.
    random.seed(123)
    random.shuffle(self.dataset)
    split = int(0.95 * len(self.dataset))
    if train:
        self.dataset = self.dataset[:split]
    else:
        self.dataset = self.dataset[split:]
    self.wavlen = int(hp.audio.sr * hp.audio.duration)
    self.tier = self.args.tier
# inference_breakdown[i][0] # save_image('tier%d_inferred_breakdown_%s.png' % (i, filename), inference_breakdown[i][0]) # save_image('final_inferred_%s.png' % filename, inferred) tier = 5 source = breakdown[tier][0] print("Source tier 5 shape: %s" % str(source.shape)) save_image('source_tier_%d_%s.png' % (tier, filename), breakdown[tier][0]) inferred_source_6, inferred_5 = run_inference_on_tier(source, tier, text, timestep) print("inferred tier 5 target shape: %s" % str(inferred_5.shape)) print("inferred tier 6 source shape: %s" % str(inferred_source_6.shape)) tier = 6 inferred_final, inferred_6 = run_inference_on_tier(inferred_source_6, tier, text, timestep) print("inferred tier 6 target shape: %s" % str(inferred_6.shape)) print("inferred final shape: %s" % str(inferred_final.shape)) print("original final shape: %s" % str(breakdown[tier+1][0].shape)) save_image('target_tier_%d_%s.png' % (tier, filename), breakdown[tier][1]) save_image('next_tier_%d_%s.png' % (tier, filename), breakdown[tier+1][0]) save_image('inferred_tier_%d_%s.png' % (tier, filename), inferred_6) save_image('inferred_next_tier_%d_%s.png' % (tier, filename), inferred_final) # Save the actual audio hp = HParam('./config/blizzard_compressed_experiments.yaml') melgen = MelGen(hp) source_wav = melgen.reconstruct_audio(breakdown[tier+1][0]) inference_wav = melgen.reconstruct_audio(inferred_final) melgen.save_audio('source_'+filename, source_wav) melgen.save_audio('inference_'+filename, inference_wav) break
def save_audio(filename, final_reconstruction, config_path='./config/blizzard_alldata_v5.yaml'):
    """Convert a reconstructed mel to audio and save it under temp/.

    Args:
        filename: basename; output is written as 'temp/reconstructed_<filename>'.
        final_reconstruction: full mel spectrogram accepted by
            MelGen.reconstruct_audio.
        config_path: YAML config for HParam; defaults to the original
            hard-coded path, so existing callers are unaffected.
    """
    hp = HParam(config_path)
    melgen = MelGen(hp)
    reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
    melgen.save_audio('temp/reconstructed_' + filename, reconstructed_audio)