import collections.abc
import os

import librosa
import numpy as np
import torch
from torch.utils.data import Dataset

# project-local helper modules (adjust import paths to your checkout)
from utils.audio import AudioProcessor
from utils.data import prepare_data, prepare_stop_target, prepare_tensor
from utils.text import text_to_sequence


class TWEBDataset(Dataset):
    def __init__(self,
                 csv_file,
                 root_dir,
                 outputs_per_step,
                 sample_rate,
                 text_cleaner,
                 num_mels,
                 min_level_db,
                 frame_shift_ms,
                 frame_length_ms,
                 preemphasis,
                 ref_level_db,
                 num_freq,
                 power,
                 min_seq_len=0):
        with open(csv_file, "r") as f:
            self.frames = [line.split('\t') for line in f]
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.min_seq_len = min_seq_len
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db, num_freq, power)
        print(" > Reading TWEB from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        self._sort_frames()

    def load_wav(self, filename):
        try:
            # librosa returns an (audio, sample_rate) tuple
            audio = librosa.core.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError:
            print(" !! Cannot read file : {}".format(filename))

    def _sort_frames(self):
        r"""Sort instances by text length, in ascending order."""
        lengths = np.array([len(ins[1]) for ins in self.frames])
        print(" | > Max length sequence {}".format(np.max(lengths)))
        print(" | > Min length sequence {}".format(np.min(lengths)))
        print(" | > Avg length sequence {}".format(np.mean(lengths)))
        idxs = np.argsort(lengths)
        new_frames = []
        ignored = []
        for idx in idxs:
            length = lengths[idx]
            if length < self.min_seq_len:
                ignored.append(idx)
            else:
                new_frames.append(self.frames[idx])
        print(" | > {} instances are ignored by min_seq_len ({})".format(
            len(ignored), self.min_seq_len))
        self.frames = new_frames

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
        text = self.frames[idx][1]
        text = np.asarray(
            text_to_sequence(text, [self.cleaners]), dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
        return sample

    def collate_fn(self, batch):
        r"""
        Perform preprocessing and create a final data batch:

        1. PAD sequences to the longest sequence in the batch.
        2. Convert audio signals to spectrograms.
        3. PAD spectrograms so their lengths are divisible by r.
        4. Convert numpy arrays to Torch tensors.
        """
        # Put each data field into a tensor with outer dimension batch size.
        if isinstance(batch[0], collections.abc.Mapping):
            wav = [d['wav'] for d in batch]
            item_idxs = [d['item_idx'] for d in batch]
            text = [d['text'] for d in batch]

            text_lengths = np.array([len(x) for x in text])

            linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
            mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
            mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame

            # compute 'stop token' targets
            stop_targets = [
                np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths
            ]

            # PAD stop targets
            stop_targets = prepare_stop_target(stop_targets,
                                               self.outputs_per_step)

            # PAD sequences to the largest length in the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

            # PAD features to the largest length + a zero frame
            linear = prepare_tensor(linear, self.outputs_per_step)
            mel = prepare_tensor(mel, self.outputs_per_step)
            assert mel.shape[2] == linear.shape[2]

            # B x T x D
            linear = linear.transpose(0, 2, 1)
            mel = mel.transpose(0, 2, 1)

            # convert numpy arrays to PyTorch tensors
            text_lengths = torch.LongTensor(text_lengths)
            text = torch.LongTensor(text)
            linear = torch.FloatTensor(linear)
            mel = torch.FloatTensor(mel)
            mel_lengths = torch.LongTensor(mel_lengths)
            stop_targets = torch.FloatTensor(stop_targets)

            return (text, text_lengths, linear, mel, mel_lengths,
                    stop_targets, item_idxs[0])

        raise TypeError("batch must contain tensors, numbers, dicts or lists;"
                        " found {}".format(type(batch[0])))
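# Example usage: a minimal sketch of driving TWEBDataset with a DataLoader.
# All file paths and audio parameter values below are hypothetical
# placeholders, not values prescribed by this project.
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    dataset = TWEBDataset(
        csv_file='transcript.tsv',   # hypothetical metadata path
        root_dir='tweb/wavs',        # hypothetical wav directory
        outputs_per_step=5,
        sample_rate=22050,
        text_cleaner='english_cleaners',
        num_mels=80,
        min_level_db=-100,
        frame_shift_ms=12.5,
        frame_length_ms=50,
        preemphasis=0.97,
        ref_level_db=20,
        num_freq=1025,
        power=1.5,
        min_seq_len=10)
    loader = DataLoader(dataset, batch_size=8, shuffle=False,
                        collate_fn=dataset.collate_fn, drop_last=True)
    # collate_fn returns a 7-tuple; spectrograms come out as B x T x D
    text, text_lengths, linear, mel, mel_lengths, stop_targets, idx = \
        next(iter(loader))
    print(text.shape, mel.shape, stop_targets.shape)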
import argparse
import os

import numpy as np
from tqdm import tqdm

# project-local imports (module paths may differ between versions)
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config


def main():
    """Run the preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features.")
    parser.add_argument("--config_path",
                        type=str,
                        required=True,
                        help="TTS config file path.")
    parser.add_argument("--out_path",
                        default=None,
                        type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the metadata of the target dataset
    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # accumulate per-bin sums for the running mean/variance
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)

    output_file_path = os.path.join(args.out_path, "scale_stats.npy")
    stats = {}
    stats['mel_mean'] = mel_mean
    stats['mel_std'] = mel_scale
    stats['linear_mean'] = linear_mean
    stats['linear_std'] = linear_scale

    print(f' > Avg mel spec mean: {mel_mean.mean()}')
    print(f' > Avg mel spec scale: {mel_scale.mean()}')
    print(f' > Avg linear spec mean: {linear_mean.mean()}')
    print(f' > Avg linear spec scale: {linear_scale.mean()}')

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True
    # remove values made redundant by mean-var scaling
    del CONFIG.audio['max_norm']
    del CONFIG.audio['min_level_db']
    del CONFIG.audio['symmetric_norm']
    del CONFIG.audio['clip_norm']
    stats['audio_config'] = CONFIG.audio

    np.save(output_file_path, stats, allow_pickle=True)
    print(f' > scale_stats.npy is saved to {output_file_path}')


if __name__ == "__main__":
    main()
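# A minimal sketch of consuming the saved statistics from another script.
# It assumes only the scale_stats.npy layout written above; np.save stores
# the dict as a 0-d object array, so .item() unwraps it again.
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(stats["mel_mean"].shape)    # one value per mel bin
print(stats["linear_std"].shape)  # one value per linear-spectrogram bin
print(stats["audio_config"]["stats_path"])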
import collections.abc
import os

import librosa
import numpy as np
import torch
from torch.utils.data import Dataset

# project-local helper modules (adjust import paths to your checkout)
from utils.audio import AudioProcessor
from utils.data import pad_per_step, prepare_data
from utils.text import text_to_sequence


class LJSpeechDataset(Dataset):
    def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
                 text_cleaner, num_mels, min_level_db, frame_shift_ms,
                 frame_length_ms, preemphasis, ref_level_db, num_freq, power):
        with open(csv_file, "r") as f:
            self.frames = [line.split('|') for line in f]
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db, num_freq, power)
        print(" > Reading LJSpeech from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        self._sort_frames()

    def load_wav(self, filename):
        try:
            # librosa returns an (audio, sample_rate) tuple
            audio = librosa.core.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError:
            print(" !! Cannot read file : {}".format(filename))

    def _sort_frames(self):
        r"""Sort instances by text length, in ascending order."""
        lengths = np.array([len(ins[1]) for ins in self.frames])
        print(" | > Max length sequence {}".format(np.max(lengths)))
        print(" | > Min length sequence {}".format(np.min(lengths)))
        print(" | > Avg length sequence {}".format(np.mean(lengths)))
        idxs = np.argsort(lengths)
        new_frames = [None] * len(lengths)
        for i, idx in enumerate(idxs):
            new_frames[i] = self.frames[idx]
        self.frames = new_frames

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
        text = self.frames[idx][1]
        text = np.asarray(
            text_to_sequence(text, [self.cleaners]), dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
        return sample

    def get_dummy_data(self):
        r"""Get a dummy input for testing."""
        return torch.ones(16, 143, dtype=torch.long)

    def collate_fn(self, batch):
        r"""
        Perform preprocessing and create a final data batch:

        1. PAD sequences to the longest sequence in the batch.
        2. Convert audio signals to spectrograms.
        3. PAD spectrograms so their lengths are divisible by r.
        4. Convert numpy arrays to Torch tensors.
        """
        # Put each data field into a tensor with outer dimension batch size.
        if isinstance(batch[0], collections.abc.Mapping):
            wav = [d['wav'] for d in batch]
            item_idxs = [d['item_idx'] for d in batch]
            text = [d['text'] for d in batch]

            text_lengths = np.array([len(x) for x in text])

            # PAD sequences to the largest length in the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

            linear = np.array(
                [self.ap.spectrogram(w).astype('float32') for w in wav])
            mel = np.array(
                [self.ap.melspectrogram(w).astype('float32') for w in wav])
            assert mel.shape[2] == linear.shape[2]
            timesteps = mel.shape[2]

            # PAD with zeros so the length is divisible by outputs_per_step
            if (timesteps + 1) % self.outputs_per_step != 0:
                pad_len = self.outputs_per_step - \
                    ((timesteps + 1) % self.outputs_per_step)
                pad_len += 1
            else:
                pad_len = 1
            linear = pad_per_step(linear, pad_len)
            mel = pad_per_step(mel, pad_len)

            # reshape to B x T x D
            linear = linear.transpose(0, 2, 1)
            mel = mel.transpose(0, 2, 1)

            # convert numpy arrays to PyTorch tensors
            text_lengths = torch.LongTensor(text_lengths)
            text = torch.LongTensor(text)
            linear = torch.FloatTensor(linear)
            mel = torch.FloatTensor(mel)

            return text, text_lengths, linear, mel, item_idxs[0]

        raise TypeError("batch must contain tensors, numbers, dicts or lists;"
                        " found {}".format(type(batch[0])))
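# Worked example of the padding arithmetic in collate_fn above: with
# outputs_per_step = 7 and a longest spectrogram of timesteps = 100 frames,
# (100 + 1) % 7 == 3, so pad_len = 7 - 3 + 1 = 5 and the padded length 105
# is divisible by 7, with one of the padded frames acting as the trailing
# zero frame.
outputs_per_step = 7
timesteps = 100
if (timesteps + 1) % outputs_per_step != 0:
    pad_len = outputs_per_step - ((timesteps + 1) % outputs_per_step) + 1
else:
    pad_len = 1
assert (timesteps + pad_len) % outputs_per_step == 0
print(pad_len)  # 5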
mel = AP.melspectrogram(wav)
print("Max:", mel.max())
print("Min:", mel.min())
print("Mean:", mel.mean())
plot_spectrogram(mel.T, AP);

wav_gen = AP.inv_mel_spectrogram(mel)
ipd.Audio(wav_gen, rate=AP.sample_rate)


# ### Generate Linear-Spectrogram and Re-synthesis with GL

# In[ ]:


spec = AP.spectrogram(wav)
print("Max:", spec.max())
print("Min:", spec.min())
print("Mean:", spec.mean())
plot_spectrogram(spec.T, AP);

wav_gen = AP.inv_spectrogram(spec)
ipd.Audio(wav_gen, rate=AP.sample_rate)


# ### Compare values for a certain parameter
#
# Optimize your parameters by comparing different values per parameter at a time.

# In[ ]:
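# A minimal sketch for the empty cell above: re-synthesize the same clip
# under a few candidate preemphasis values and listen to the differences.
# This assumes AP exposes a mutable `preemphasis` attribute, as the
# AudioProcessor used in this notebook does.
for value in [0.0, 0.97, 0.99]:
    AP.preemphasis = value
    mel = AP.melspectrogram(wav)
    wav_gen = AP.inv_mel_spectrogram(mel)
    print("preemphasis =", value)
    ipd.display(ipd.Audio(wav_gen, rate=AP.sample_rate))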
import argparse
import glob
import os

import numpy as np
from tqdm import tqdm

# project-local imports (module paths may differ between versions)
from TTS.config import load_config
from TTS.tts.datasets import load_meta_data
from TTS.utils.audio import AudioProcessor


def main():
    """Run the preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features.")
    parser.add_argument("config_path", type=str,
                        help="TTS config file path to define audio processing parameters.")
    parser.add_argument("out_path", type=str,
                        help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the metadata of the target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item if isinstance(item, str) else item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # accumulate per-bin sums for the running mean/variance
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel ** 2).sum(axis=1)
        linear_square_sum += (linear ** 2).sum(axis=1)

    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove values made redundant by mean-var scaling
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()

    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")


if __name__ == "__main__":
    main()
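# Sanity check of the running-stats formulas above on synthetic data:
# accumulating per-bin sums and squared sums gives the same mean/std as
# computing them directly over the concatenated frames.
import numpy as np

rng = np.random.default_rng(0)
specs = [rng.normal(size=(80, n)) for n in (50, 75, 100)]  # fake mel specs

total = sum(s.sum(axis=1) for s in specs)
total_sq = sum((s ** 2).sum(axis=1) for s in specs)
N = sum(s.shape[1] for s in specs)

mean = total / N
scale = np.sqrt(total_sq / N - mean ** 2)

all_frames = np.concatenate(specs, axis=1)
assert np.allclose(mean, all_frames.mean(axis=1))
assert np.allclose(scale, all_frames.std(axis=1))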
import collections.abc
import os

import librosa
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

# project-local helper modules (adjust import paths to your checkout)
from utils.audio import AudioProcessor
from utils.data import pad_per_step, prepare_data
from utils.text import text_to_sequence


class LJSpeechDataset(Dataset):
    def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
                 text_cleaner, num_mels, min_level_db, frame_shift_ms,
                 frame_length_ms, preemphasis, ref_level_db, num_freq, power):
        self.frames = pd.read_csv(csv_file, sep='|', header=None)
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db, num_freq, power)
        print(" > Reading LJSpeech from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))

    def load_wav(self, filename):
        try:
            # librosa returns an (audio, sample_rate) tuple
            audio = librosa.core.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError:
            print(" !! Cannot read file : {}".format(filename))

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        # .iloc replaces the long-deprecated DataFrame.ix accessor
        wav_name = os.path.join(self.root_dir,
                                self.frames.iloc[idx, 0]) + '.wav'
        text = self.frames.iloc[idx, 1]
        text = np.asarray(
            text_to_sequence(text, [self.cleaners]), dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {
            'text': text,
            'wav': wav,
            'item_idx': self.frames.iloc[idx, 0]
        }
        return sample

    def get_dummy_data(self):
        r"""Get a dummy input for testing."""
        return torch.ones(16, 143, dtype=torch.long)

    def collate_fn(self, batch):
        # Put each data field into a tensor with outer dimension batch size.
        if isinstance(batch[0], collections.abc.Mapping):
            wav = [d['wav'] for d in batch]
            item_idxs = [d['item_idx'] for d in batch]
            text = [d['text'] for d in batch]

            text_lengths = np.array([len(x) for x in text])

            # PAD sequences to the largest length in the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

            linear = np.array(
                [self.ap.spectrogram(w).astype('float32') for w in wav])
            mel = np.array(
                [self.ap.melspectrogram(w).astype('float32') for w in wav])
            assert mel.shape[2] == linear.shape[2]
            timesteps = mel.shape[2]

            # PAD with zeros so the length is divisible by outputs_per_step
            if (timesteps + 1) % self.outputs_per_step != 0:
                pad_len = self.outputs_per_step - \
                    ((timesteps + 1) % self.outputs_per_step)
                pad_len += 1
            else:
                pad_len = 1
            linear = pad_per_step(linear, pad_len)
            mel = pad_per_step(mel, pad_len)

            # reshape to B x T x D
            linear = linear.transpose(0, 2, 1)
            mel = mel.transpose(0, 2, 1)

            # convert numpy arrays to PyTorch tensors
            text_lengths = torch.LongTensor(text_lengths)
            text = torch.LongTensor(text)
            linear = torch.FloatTensor(linear)
            mel = torch.FloatTensor(mel)

            return text, text_lengths, linear, mel, item_idxs[0]

        raise TypeError("batch must contain tensors, numbers, dicts or lists;"
                        " found {}".format(type(batch[0])))
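# A minimal sketch of what the pandas metadata load above yields, using a
# small inline CSV instead of the real metadata file (the transcript text
# here is illustrative only).
import io

import pandas as pd

csv = io.StringIO("LJ001-0001|Printing, in the only sense with which\n"
                  "LJ001-0002|in being comparatively modern.\n")
frames = pd.read_csv(csv, sep='|', header=None)
print(frames.iloc[0, 0])  # file id, e.g. "LJ001-0001"
print(frames.iloc[0, 1])  # transcript text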