class AudioOnlyDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # this will search all files within hp.data.path
        self.file_list = glob.glob(
            os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)

        # deterministic 95/5 train/validation split
        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        wav = read_wav_np(self.file_list[idx], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        # # Reconstruct audio for testing
        # filename = os.path.basename(self.file_list[idx])
        # plt.imsave('./reconstructed_audio/original_' + filename + '.png', mel)
        # plt.imsave('./reconstructed_audio/source_' + filename + '.png', source)
        # plt.imsave('./reconstructed_audio/target_' + filename + '.png', target)
        # self.melgen.save_audio('source_' + filename, wav)
        # source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        # target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        # reconstructed_mel_tensor = self.tierutil.interleave(source_tensor, target_tensor, self.tier)
        # reconstructed_mel = reconstructed_mel_tensor.numpy()
        # print('Shapes: [mel, source, target, reconstruction], [%s, %s, %s, %s]' % (
        #     mel.shape,
        #     source.shape,
        #     target.shape,
        #     reconstructed_mel.shape,
        # ))
        # reconstructed_audio = self.melgen.reconstruct_audio(reconstructed_mel)
        # self.melgen.save_audio('reconstructed_' + filename, reconstructed_audio)

        return source, target
def deconstruct_audio(wav):
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    mel = melgen.get_normalized_mel(wav)

    tier_to_breakdown = {}
    for tier in range(1, 7):
        source, target = tierutil.cut_divide_tiers(mel, tier)
        print("Tier %d has source dims: %s, target dims: %s" % (tier, source.shape, target.shape))
        tier_to_breakdown[tier] = (source, target)
    # tier 7 keeps the full mel in both slots (no source/target split)
    tier_to_breakdown[7] = (mel, mel)

    return tier_to_breakdown
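# Sketch (not in the original code): interleave each tier's (source, target) back
# together and compare shapes against the full mel, mirroring the commented-out
# reconstruction block in AudioOnlyDataset.__getitem__ above. It assumes, as that
# block suggests, that TierUtil.interleave accepts batched torch tensors plus a
# tier index; check_tier_roundtrip itself is a hypothetical helper name.
import torch


def check_tier_roundtrip(wav):
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    tierutil = TierUtil(hp)

    breakdown = deconstruct_audio(wav)
    full_mel, _ = breakdown[7]
    for tier in range(1, 7):
        source, target = breakdown[tier]
        source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        reconstructed = tierutil.interleave(source_tensor, target_tensor, tier)
        print('Tier %d: mel dims %s, reconstruction dims %s' % (
            tier, full_mel.shape, tuple(reconstructed.shape)))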
class AudioTextDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # build (wav_path, text) pairs from the transcript file
        self.root_dir = hp.data.path
        self.dataset = []
        with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'), 'r') as f:
            lines = f.read().splitlines()
        for line in lines:
            wav_name, _, _, text, _ = line.split('|')
            wav_name = wav_name[2:-4] + '.wav'
            wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
            wav = read_wav_np(wav_path)
            duration = len(wav) / hp.audio.sr
            if duration < hp.audio.duration:
                self.dataset.append((wav_path, text))
            # if len(self.dataset) > 100: break

        # deterministic 95/5 train/validation split
        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx][1]
        seq = text_to_sequence(text)
        wav = read_wav_np(self.dataset[idx][0])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return seq, source, target
class AudioOnlyDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # this will search all files within hp.data.path
        self.file_list = []
        # for i, f in enumerate(glob.glob(os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)):
        #     wav = read_wav_np(f)
        #     duration = len(wav) / hp.audio.sr
        #     if duration < hp.audio.duration:
        #         self.file_list.append(f)
        self.file_list = glob.glob(
            os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)

        # deterministic 95/5 train/validation split
        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        wav = read_wav_np(self.file_list[idx])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return source, target
class CompleteAudioTextDatasetv3(AudioTextDataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # build (wav_path, parsed_sentence) pairs from the pre-generated split CSVs
        self.root_dir = hp.data.path
        self.dataset = []
        txt_path = os.path.join(
            self.root_dir,
            'blizzard_train.csv' if train else 'blizzard_test.csv')

        # open file in read mode
        with open(txt_path, 'r') as read_obj:
            csv_reader = csv.reader(read_obj)
            headers = next(csv_reader)
            for row in csv_reader:
                [original_sentence, parsed_sentence, wav_path, wav_length] = row
                if float(wav_length) < hp.audio.duration and float(wav_length) > 0.4:
                    self.dataset.append((wav_path, parsed_sentence))

        random.Random(123).shuffle(self.dataset)

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    # __len__ is inherited from AudioTextDataset

    def __getitem__(self, idx):
        sentence = self.dataset[idx][1]
        seq = seq_to_array(sentence)
        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return seq, source, target
class AudioTextDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # build (wav_path, text) pairs from the dataset's transcript/prompt files
        self.root_dir = hp.data.path
        self.dataset = []

        if hp.data.name == 'KSS':
            with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'), 'r') as f:
                lines = f.read().splitlines()
            for line in tqdm(lines):
                wav_name, _, _, text, length, _ = line.split('|')
                wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                duration = float(length)
                if duration < hp.audio.duration:
                    self.dataset.append((wav_path, text))
                # if len(self.dataset) > 100: break
        elif hp.data.name.startswith('Blizzard'):
            with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
                lines = f.read().splitlines()
            # prompts.gui lists utterances in groups of three lines; the first two
            # are the filename and the sentence, the third is skipped here
            filenames = lines[::3]
            sentences = lines[1::3]
            for filename, sentence in tqdm(zip(filenames, sentences), total=len(filenames)):
                wav_path = os.path.join(self.root_dir, 'wavn', filename + '.wav')
                length = get_length(wav_path, hp.audio.sr)
                if length < hp.audio.duration:
                    self.dataset.append((wav_path, sentence))
        else:
            raise NotImplementedError

        # deterministic 95/5 train/validation split
        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx][1]
        if self.hp.data.name == 'KSS':
            seq = text_to_sequence(text)
        elif self.hp.data.name.startswith('Blizzard'):
            seq = process_blizzard(text)
        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        # print(text)

        return seq, source, target
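# Sketch (not part of the original code): one way the text datasets above could be
# wired into a torch DataLoader. Because __getitem__ returns a variable-length text
# sequence plus variable-length mel tiers, batching needs a padding collate function.
# pad_collate, the batch_size/num_workers values, and the assumption that every item
# in a batch shares the same number of mel bins for a given tier are illustrative
# guesses, not the repository's actual training setup.
import torch
from torch.utils.data import DataLoader


def pad_collate(batch):
    """Zero-pad text sequences and mel tiers to the longest item in the batch."""
    seqs, sources, targets = zip(*batch)
    seq_len = max(len(s) for s in seqs)
    src_len = max(s.shape[-1] for s in sources)
    tgt_len = max(t.shape[-1] for t in targets)

    seq_batch = torch.zeros(len(batch), seq_len, dtype=torch.long)
    src_batch = torch.zeros(len(batch), sources[0].shape[0], src_len)
    tgt_batch = torch.zeros(len(batch), targets[0].shape[0], tgt_len)
    for i, (seq, src, tgt) in enumerate(batch):
        seq_batch[i, :len(seq)] = torch.as_tensor(seq, dtype=torch.long)
        src_batch[i, :, :src.shape[-1]] = torch.from_numpy(src).float()
        tgt_batch[i, :, :tgt.shape[-1]] = torch.from_numpy(tgt).float()
    return seq_batch, src_batch, tgt_batch


# Example wiring (hp and args as passed to the datasets above):
# loader = DataLoader(AudioTextDataset(hp, args, train=True),
#                     batch_size=8, shuffle=True, num_workers=4,
#                     collate_fn=pad_collate)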