def process(args):
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
    # Extract features
    feature_root = out_root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            extract_fbank_features(
                wav, sample_rate, feature_root / f"{sample_id}.npy"
            )
    # Pack features into ZIP
    zip_path = out_root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(out_root.as_posix(), url=split)
        for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(audio_paths[sample_id])
            manifest["n_frames"].append(audio_lengths[sample_id])
            manifest["tgt_text"].append(utt.lower())
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv"
        )
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        gen_vocab(
            Path(f.name),
            out_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        out_root,
        spm_filename=spm_filename_prefix + ".model",
        specaugment_policy="ld"
    )
    # Clean up
    shutil.rmtree(feature_root)
def process(args):
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, "fbank80")
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            extract_fbank_features(
                wav, sample_rate, op.join(feature_root, f"{sample_id}.npy")
            )
    # Pack features into ZIP
    zip_filename = "fbank80.zip"
    zip_path = op.join(args.output_root, zip_filename)
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(utt)
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest),
            op.join(args.output_root, f"{split}.tsv")
        )
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        gen_vocab(
            f.name,
            op.join(args.output_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        args.output_root,
        spm_filename_prefix + ".model",
        specaugment_policy="ld"
    )
    # Clean up
    shutil.rmtree(feature_root)
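# A minimal, hypothetical CLI entry point for the preparation functions above.
# It is a sketch, not the original script's argument parser; the flag names are
# assumed from the attributes that process() reads off `args` (output_root,
# vocab_type, vocab_size) and may differ from the real command line.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--output-root", required=True,
                        help="directory for downloads, features, and manifests")
    parser.add_argument("--vocab-type", default="unigram",
                        choices=["bpe", "unigram", "char"])
    parser.add_argument("--vocab-size", default=10000, type=int)
    args = parser.parse_args()
    process(args)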
def download_dataset(folder="./data"):
    """
    Download LIBRISPEECH train-clean-100 and test-clean into the `data` folder.

    :param folder: the folder where the data is to be downloaded
    :return train_dataset: the training dataset
    :return validation_dataset: the validation dataset
    """
    if not os.path.isdir(folder):
        os.makedirs(folder)
    train_dataset = LIBRISPEECH(folder, url="train-clean-100",)  # download=True)
    validation_dataset = LIBRISPEECH(folder, url="test-clean",)  # download=True)
    return train_dataset, validation_dataset
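# A minimal usage sketch (an assumption, not part of the original file): wrap the
# returned datasets in DataLoaders. batch_size=1 avoids a custom collate_fn, since
# LibriSpeech waveforms have variable length.
from torch.utils.data import DataLoader

train_dataset, validation_dataset = download_dataset("./data")
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
for waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id in train_loader:
    print(waveform.shape, sample_rate, transcript)
    break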
def create(tag):
    if isinstance(tag, str):
        data = LIBRISPEECH(
            root, tag, folder_in_archive=folder_in_archive, download=False)
    else:
        data = torch.utils.data.ConcatDataset([
            LIBRISPEECH(
                root, t, folder_in_archive=folder_in_archive, download=False)
            for t in tag
        ])
    data = Processed(process_datapoint, data)
    data = diskcache_iterator(data)
    # data = MapMemoryCache(data)
    return data
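# Hypothetical calls to the factory above (assuming `root`, `folder_in_archive`,
# `process_datapoint`, and `diskcache_iterator` are defined in the enclosing scope,
# as the closure suggests): pass a single split name or a list of splits to concatenate.
train_data = create(["train-clean-100", "train-clean-360", "train-other-500"])
dev_data = create("dev-clean")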
def main(args):
    corpus_root = Path(args.corpus_root).absolute()
    output_root = Path(args.output_root).absolute()
    corpus_root.mkdir(exist_ok=True)
    output_root.mkdir(exist_ok=True)
    for split in SPLITS:
        logger.info(f"Preparing data for split {split}...")
        output_dir = output_root / split.replace("-", "_")
        output_dir.mkdir(exist_ok=True)
        wave_file = output_dir / "wav.txt"
        text_file = output_dir / "text.txt"
        if os.path.exists(wave_file) and os.path.exists(text_file):
            logger.info(
                f"Both {wave_file} and {text_file} exist, skip regenerating")
            continue
        dataset = LIBRISPEECH(
            corpus_root.as_posix(), url=split,
            folder_in_archive=args.folder_in_archive, download=args.download)
        with open(wave_file, "w", encoding="utf-8") as wave_f, open(
                text_file, "w", encoding="utf-8") as text_f:
            for data_tuple in tqdm(dataset):
                if len(data_tuple) == 6:
                    # torchaudio==0.7.0:
                    # (waveform, sample_rate, text, speaker_id, chapter_id, utterance_idx)
                    text, speaker_id, chapter_id, utterance_idx = \
                        data_tuple[2], data_tuple[3], data_tuple[4], data_tuple[5]
                else:
                    # torchaudio>=0.8.0:
                    # (waveform, sample_rate, original_text, normalized_text,
                    #  speaker_id, chapter_id, utterance_idx)
                    assert len(data_tuple) == 7
                    text, speaker_id, chapter_id, utterance_idx = \
                        data_tuple[3], data_tuple[4], data_tuple[5], data_tuple[6]
                utterance_idx = str(utterance_idx).zfill(4)
                utterance_id = f"{speaker_id}-{chapter_id}-{utterance_idx}"
                utterance_path = os.path.join(
                    corpus_root.as_posix(), args.folder_in_archive, split,
                    str(speaker_id), str(chapter_id), utterance_id)
                print(f"{utterance_id} {utterance_path}.flac", file=wave_f)
                print(f"{utterance_id} {text}", file=text_f)
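# A small, assumed follow-up sketch (not in the original script): read one of the
# generated manifests back into a dict keyed by utterance id. Each line of wav.txt is
# "<utterance_id> <path>.flac" and each line of text.txt is "<utterance_id> <transcript>".
# The paths below are placeholders for whatever --output-root was used.
def read_manifest(path):
    entries = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            utt_id, value = line.rstrip("\n").split(" ", 1)
            entries[utt_id] = value
    return entries

wav_entries = read_manifest("output/dev_clean/wav.txt")
text_entries = read_manifest("output/dev_clean/text.txt")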
def create(tags, cache=True):
    if isinstance(tags, str):
        tags = [tags]
    if isinstance(transforms, list):
        transform_list = transforms
    else:
        transform_list = [transforms]
    data = torch.utils.data.ConcatDataset(
        [
            Processed(
                LIBRISPEECH(
                    root,
                    tag,
                    folder_in_archive=folder_in_archive,
                    download=False,
                ),
                transform,
                language_model.encode,
            )
            for tag, transform in zip(tags, transform_list)
        ]
    )
    data = MapMemoryCache(data)
    return data
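# A hypothetical call to this variant (assuming `transforms`, `root`,
# `folder_in_archive`, `language_model`, `Processed`, and `MapMemoryCache` come from
# the enclosing scope). Note that tags and transforms are zipped positionally, so a
# single transform paired with several tags only yields one dataset.
train_data = create(["train-clean-100", "train-clean-360"])
test_data = create("test-clean")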
parser.add_argument('--folds_eval', default=[8, 9], type=int, nargs='+',
                    help='List of BIRD folds for validation')
parser.add_argument('--folds_test', default=[10], type=int, nargs='+',
                    help='List of BIRD folds for test')
args = parser.parse_args()

# This holds the RIR dataset for training, validation and testing
folder_in_archive_rir = 'Bird'
rir_train = BIRD(root=args.root, folder_in_archive=folder_in_archive_rir, folds=args.folds_train)
rir_eval = BIRD(root=args.root, folder_in_archive=folder_in_archive_rir, folds=args.folds_eval)
rir_test = BIRD(root=args.root, folder_in_archive=folder_in_archive_rir, folds=args.folds_test)

# This holds the speech dataset for training, validation and testing
folder_in_archive_speech = 'LibriSpeech'
speech_train = LIBRISPEECH(root=args.root, folder_in_archive=folder_in_archive_speech, url='train-clean-100', download=True)
speech_eval = LIBRISPEECH(root=args.root, folder_in_archive=folder_in_archive_speech, url='dev-clean', download=True)
speech_test = LIBRISPEECH(root=args.root, folder_in_archive=folder_in_archive_speech, url='test-clean', download=True)

# We can simply create augmented data with this training dataset
augmented_train = RT60(rir=rir_train, speech=speech_train, samples_count=10000)

Ys, rt60 = augmented_train[5]
Y1 = torch.squeeze(Ys[0, :, :, :], dim=0)
Y2 = torch.squeeze(Ys[1, :, :, :], dim=0)
print(rt60)

plt.subplot(2, 1, 1)
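# A hedged sketch (an assumption, not in the original script): batch the augmented
# reverberant examples with a standard DataLoader for training an RT60 estimator.
# This presumes every item of `augmented_train` has the same tensor shape.
from torch.utils.data import DataLoader

train_loader = DataLoader(augmented_train, batch_size=16, shuffle=True, num_workers=2)
for Ys_batch, rt60_batch in train_loader:
    print(Ys_batch.shape, rt60_batch.shape)
    break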
class LibriSpeech(Dataset):

    def __init__(
            self,
            root,
            train=True,
            spectral_transforms=False,
            wavform_transforms=True,
            train_urls=[
                'train-clean-100',
                'train-clean-360',
                'train-other-500',
            ],
            test_url='dev-clean',
            max_length=150526,
            input_size=112,
            normalize_mean=LIBRISPEECH_MEAN,
            normalize_stdev=LIBRISPEECH_STDEV,
        ):
        super().__init__()
        # choose to apply augmentation either at the waveform or at the spectrogram level
        assert not (spectral_transforms and wavform_transforms)

        if train:
            datasets = []
            for train_url in train_urls:
                dataset = LIBRISPEECH(root, url=train_url, download=True,
                                      folder_in_archive='LibriSpeech')
                datasets.append(dataset)
            self.dataset = ConcatDatasets(datasets)
        else:
            self.dataset = LIBRISPEECH(root, url=test_url, download=True,
                                       folder_in_archive='LibriSpeech')

        self.wavform_transforms = wavform_transforms
        self.spectral_transforms = spectral_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers

    def get_speaker_ids(self):
        if self.train:
            speaker_ids = []
            for dataset in self.dataset.datasets:
                speaker_ids_i = self._get_speaker_ids(dataset)
                speaker_ids.append(speaker_ids_i)
            return np.concatenate(speaker_ids)
        else:
            return self._get_speaker_ids(self.dataset)

    def _get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        if index in BAD_LIBRISPEECH_INDICES:
            index = index + 1
        wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(index)
        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fix if validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length] if flip
                      else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        spectrum = librosa.feature.melspectrogram(
            padded,
            sample_rate,
            hop_length=LIBRISPEECH_HOP_LENGTH_DICT[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectrum
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        return len(self.dataset)
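# A hedged usage sketch for the class above (assumes ConcatDatasets, the
# augmentation/normalization helpers, and module-level constants like
# LIBRISPEECH_MEAN are defined alongside it): the evaluation view only needs dev-clean.
dev_set = LibriSpeech('./data', train=False, wavform_transforms=False)
index, spectrum, speaker_id = dev_set[0]
print(spectrum.shape, speaker_id)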
class LibriSpeechTransfer(Dataset):
    """
    Divide the dev-clean split of LibriSpeech into train and test splits
    by speaker so we can train a logreg fairly.
    """

    def __init__(
            self,
            root,
            train=True,
            spectral_transforms=False,
            wavform_transforms=False,
            max_length=150526,
            input_size=112,
            normalize_mean=LIBRISPEECH_MEAN,
            normalize_stdev=LIBRISPEECH_STDEV,
        ):
        super().__init__()
        assert not (spectral_transforms and wavform_transforms)
        self.dataset = LIBRISPEECH(root, url='dev-clean', download=True,
                                   folder_in_archive='LibriSpeech')

        all_speaker_ids = self.get_speaker_ids(self.dataset)
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers

        self.indices = self.train_test_split(self.dataset, all_speaker_ids,
                                             train=train)
        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

    def get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def train_test_split(self, dataset, speaker_ids, train=True):
        rs = np.random.RandomState(42)  # fix seed so splitting is reproducible
        unique_speaker_ids = sorted(set(speaker_ids))
        unique_speaker_ids = np.array(unique_speaker_ids)

        # split each speaker's utterances 80/20 between train and test
        train_indices, test_indices = [], []
        for speaker_id in unique_speaker_ids:
            speaker_indices = np.where(speaker_ids == speaker_id)[0]
            size = len(speaker_indices)
            rs.shuffle(speaker_indices)
            train_size = int(0.8 * size)
            train_indices.extend(speaker_indices[:train_size].tolist())
            test_indices.extend(speaker_indices[train_size:].tolist())

        return train_indices if train else test_indices

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        # NOTE: overwrite index with our custom indices mapping examples
        # to the training and test splits
        index = self.indices[index]
        try:
            wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(index)
        except Exception:
            index2 = (index + 1) % len(self.dataset)
            wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(index2)
        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fix if validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length] if flip
                      else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            padded,
            sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectrum
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        return len(self.indices)
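# A hedged usage sketch (assumes the module-level constants and transform classes
# referenced above are importable from the same module): the train and test views
# share the same speaker_id_map, so logistic-regression labels stay consistent.
train_set = LibriSpeechTransfer('./data', train=True)
test_set = LibriSpeechTransfer('./data', train=False)
print(train_set.num_labels, len(train_set), len(test_set))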
def __init__(self, path, subset, percent):
    self.libri_dataset = LIBRISPEECH(path, url=subset, download=False)
    if percent != 1.0:
        self.libri_dataset = get_subset(self.libri_dataset, percent)
    self.path = path
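# `get_subset` is not shown in this snippet. A plausible sketch (an assumption, not
# the original helper) would keep a random fraction of the indices:
import torch
from torch.utils.data import Subset

def get_subset(dataset, percent, seed=0):
    # hypothetical helper: sample `percent` of the dataset reproducibly
    generator = torch.Generator().manual_seed(seed)
    num_kept = int(len(dataset) * percent)
    indices = torch.randperm(len(dataset), generator=generator)[:num_kept]
    return Subset(dataset, indices.tolist())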
class LibriSpeech(Dataset):

    def __init__(
            self,
            root=DATA_ROOTS['librispeech'],
            train=True,
            small=False,
            spectral_transforms=False,
            wavform_transforms=True,
            test_url='dev-clean',
            max_length=150526,
            input_size=224,
            normalize_mean=LIBRISPEECH_MEAN,
            normalize_stdev=LIBRISPEECH_STDEV,
        ):
        super().__init__()
        # choose to apply augmentation either at the waveform or at the spectrogram level
        assert not (spectral_transforms and wavform_transforms)

        if train:
            if small:
                self.dataset = LIBRISPEECH(root, url='train-clean-100', download=True,
                                           folder_in_archive='LibriSpeech')
            else:
                self.dataset1 = LIBRISPEECH(root, url='train-clean-100', download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset2 = LIBRISPEECH(root, url='train-clean-360', download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset3 = LIBRISPEECH(root, url='train-other-500', download=True,
                                            folder_in_archive='LibriSpeech')
        else:
            self.dataset = LIBRISPEECH(root, url=test_url, download=True,
                                       folder_in_archive='LibriSpeech')

        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.small = small

        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers
        self.input_size = input_size
        self.FILTER_SIZE = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

    def get_speaker_ids(self):
        if self.train and not self.small:
            speaker_ids_1 = self._get_speaker_ids(self.dataset1)
            speaker_ids_2 = self._get_speaker_ids(self.dataset2)
            speaker_ids_3 = self._get_speaker_ids(self.dataset3)
            return np.concatenate(
                [speaker_ids_1, speaker_ids_2, speaker_ids_3])
        else:
            return self._get_speaker_ids(self.dataset)

    def _get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        if self.train and not self.small:
            if index >= (len(self.dataset1) + len(self.dataset2)):
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset3.__getitem__(
                            index - len(self.dataset1) - len(self.dataset2))
                except Exception:
                    index2 = (index - len(self.dataset1) - len(self.dataset2)
                              + 1) % len(self.dataset3)
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset3.__getitem__(index2)
            elif index >= len(self.dataset1):
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset2.__getitem__(index - len(self.dataset1))
                except Exception:
                    index2 = (index - len(self.dataset1) + 1) % len(self.dataset2)
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset2.__getitem__(index2)
            else:
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset1.__getitem__(index)
                except Exception:
                    index2 = (index + 1) % len(self.dataset1)
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset1.__getitem__(index2)
        else:
            try:
                wavform, sample_rate, _, speaker_id, _, _ = \
                    self.dataset.__getitem__(index)
            except Exception:
                index2 = (index + 1) % len(self.dataset)
                wavform, sample_rate, _, speaker_id, _, _ = \
                    self.dataset.__getitem__(index2)

        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fix if validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length] if flip
                      else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            padded,
            sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectrum
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        if self.train and not self.small:
            return len(self.dataset1) + len(self.dataset2) + len(self.dataset3)
        else:
            return len(self.dataset)
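# A hedged usage sketch (assumes DATA_ROOTS, LIBRISPEECH_MEAN, LIBRISPEECH_STDEV,
# and the augmentation/normalization helpers referenced above are defined in the
# surrounding module): build the small training view and batch it.
from torch.utils.data import DataLoader

train_set = LibriSpeech(train=True, small=True, wavform_transforms=True)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=4)
indices, spectra, speaker_ids = next(iter(train_loader))
print(spectra.shape)  # expected (32, 1, input_size, time_frames)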
def get_dataset(datadir, data_url):
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    dataset = LIBRISPEECH(root=datadir, url=data_url, download=True)
    return dataset
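# Minimal usage sketch (example values assumed, not part of the original snippet):
dataset = get_dataset("./data", "dev-clean")
waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[0]
print(sample_rate, transcript)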