class LibriSpeechTransfer(Dataset):
    """
    Divide the dev-clean split of LibriSpeech into train and test splits
    by speaker so we can train a logreg fairly.
    """

    def __init__(
            self,
            root,
            train=True,
            spectral_transforms=False,
            wavform_transforms=False,
            max_length=150526,
            input_size=112,
            normalize_mean=LIBRISPEECH_MEAN,
            normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # Augmentation is applied either on the raw waveform or on the
        # spectrogram, never both.
        assert not (spectral_transforms and wavform_transforms)
        self.dataset = LIBRISPEECH(root, url='dev-clean', download=True,
                                   folder_in_archive='LibriSpeech')
        all_speaker_ids = self.get_speaker_ids(self.dataset)
        unique_speaker_ids = sorted(set(all_speaker_ids))
        num_unique_speakers = len(unique_speaker_ids)
        # Map raw LibriSpeech speaker ids to contiguous labels [0, N).
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers
        self.indices = self.train_test_split(self.dataset, all_speaker_ids,
                                             train=train)
        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

    def get_speaker_ids(self, dataset):
        """Return a numpy array with the speaker id of every utterance."""
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def train_test_split(self, dataset, speaker_ids, train=True):
        """Split utterance indices 80/20 *within each speaker* so both
        splits contain every speaker; returns the requested side."""
        rs = np.random.RandomState(42)  # fix seed so reproducible splitting
        unique_speaker_ids = sorted(set(speaker_ids))
        unique_speaker_ids = np.array(unique_speaker_ids)

        # train test split to ensure the 80/20 splits
        train_indices, test_indices = [], []
        for speaker_id in unique_speaker_ids:
            speaker_indices = np.where(speaker_ids == speaker_id)[0]
            size = len(speaker_indices)
            rs.shuffle(speaker_indices)
            train_size = int(0.8 * size)
            train_indices.extend(speaker_indices[:train_size].tolist())
            test_indices.extend(speaker_indices[train_size:].tolist())

        return train_indices if train else test_indices

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        """Parse the speaker id from a "speaker-chapter-utterance" file id."""
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        # NOTE: overwrite index with our custom indices mapping examples
        # to the training and test splits
        index = self.indices[index]
        try:
            wavform, sample_rate, _, speaker_id, _, _ = \
                self.dataset.__getitem__(index)
        except Exception:
            # (bug fix) was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. A few files fail to load;
            # fall back to the next example, wrapping at the end.
            index2 = (index + 1) % len(self.dataset)
            wavform, sample_rate, _, speaker_id, _, _ = \
                self.dataset.__getitem__(index2)
        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fix if validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length] if flip else
                      wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        # hop length chosen so the spectrogram is input_size frames wide
        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            padded,
            sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectral
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        return len(self.indices)
class LibriSpeech(Dataset):
    """LibriSpeech dataset that concatenates the three training splits for
    pretraining, or serves a single test split for evaluation.

    Returns (index, log-mel-spectrogram, speaker label) triples.
    """

    def __init__(
            self,
            root,
            train=True,
            spectral_transforms=False,
            wavform_transforms=True,
            # (fix) default was a mutable list shared across all calls;
            # a tuple is safe and backward-compatible (it is only iterated).
            train_urls=(
                'train-clean-100',
                'train-clean-360',
                'train-other-500',
            ),
            test_url='dev-clean',
            max_length=150526,
            input_size=112,
            normalize_mean=LIBRISPEECH_MEAN,
            normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # choose to either apply augmentation at wavform or at augmentation level
        assert not (spectral_transforms and wavform_transforms)
        if train:
            datasets = []
            for train_url in train_urls:
                dataset = LIBRISPEECH(root, url=train_url, download=True,
                                      folder_in_archive='LibriSpeech')
                datasets.append(dataset)
            self.dataset = ConcatDatasets(datasets)
        else:
            self.dataset = LIBRISPEECH(root, url=test_url, download=True,
                                       folder_in_archive='LibriSpeech')
        self.wavform_transforms = wavform_transforms
        self.spectral_transforms = spectral_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(set(all_speaker_ids))
        num_unique_speakers = len(unique_speaker_ids)
        # Map raw LibriSpeech speaker ids to contiguous labels [0, N).
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers

    def get_speaker_ids(self):
        """Collect speaker ids across all wrapped datasets (train) or the
        single dataset (eval)."""
        if self.train:
            speaker_ids = []
            for dataset in self.dataset.datasets:
                speaker_ids_i = self._get_speaker_ids(dataset)
                speaker_ids.append(speaker_ids_i)
            return np.concatenate(speaker_ids)
        else:
            return self._get_speaker_ids(self.dataset)

    def _get_speaker_ids(self, dataset):
        """Return a numpy array with the speaker id of every utterance."""
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        """Parse the speaker id from a "speaker-chapter-utterance" file id."""
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        # (fix) was `if`, which only skipped one bad index: two bad indices
        # in a row, or a bad last index, would still break. Wrap around so
        # the replacement index is always valid.
        while index in BAD_LIBRISPEECH_INDICES:
            index = (index + 1) % len(self.dataset)
        wavform, sample_rate, _, speaker_id, _, _ = \
            self.dataset.__getitem__(index)
        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fix if validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length] if flip else
                      wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        spectrum = librosa.feature.melspectrogram(
            padded,
            sample_rate,
            hop_length=LIBRISPEECH_HOP_LENGTH_DICT[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectral
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        return len(self.dataset)
class LibriSpeech(Dataset):
    """LibriSpeech dataset: all three training splits (or just
    train-clean-100 when ``small=True``) for training, a single test split
    otherwise. Returns (index, log-mel-spectrogram, speaker label) triples.

    NOTE(review): this shadows an earlier ``LibriSpeech`` class in this
    module — confirm which definition callers intend to use.
    """

    def __init__(
            self,
            root=DATA_ROOTS['librispeech'],
            train=True,
            small=False,
            spectral_transforms=False,
            wavform_transforms=True,
            test_url='dev-clean',
            max_length=150526,
            input_size=224,
            normalize_mean=LIBRISPEECH_MEAN,
            normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # choose to either apply augmentation at wavform or at augmentation level
        assert not (spectral_transforms and wavform_transforms)
        if train:
            if small:
                self.dataset = LIBRISPEECH(root, url='train-clean-100',
                                           download=True,
                                           folder_in_archive='LibriSpeech')
            else:
                self.dataset1 = LIBRISPEECH(root, url='train-clean-100',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset2 = LIBRISPEECH(root, url='train-clean-360',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset3 = LIBRISPEECH(root, url='train-other-500',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
        else:
            self.dataset = LIBRISPEECH(root, url=test_url, download=True,
                                       folder_in_archive='LibriSpeech')
        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.small = small
        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(set(all_speaker_ids))
        num_unique_speakers = len(unique_speaker_ids)
        # Map raw LibriSpeech speaker ids to contiguous labels [0, N).
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers
        self.input_size = input_size
        self.FILTER_SIZE = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

    def get_speaker_ids(self):
        """Collect speaker ids across all training splits (train, not small)
        or the single dataset otherwise."""
        if self.train and not self.small:
            speaker_ids_1 = self._get_speaker_ids(self.dataset1)
            speaker_ids_2 = self._get_speaker_ids(self.dataset2)
            speaker_ids_3 = self._get_speaker_ids(self.dataset3)
            return np.concatenate(
                [speaker_ids_1, speaker_ids_2, speaker_ids_3])
        else:
            return self._get_speaker_ids(self.dataset)

    def _get_speaker_ids(self, dataset):
        """Return a numpy array with the speaker id of every utterance."""
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        """Parse the speaker id from a "speaker-chapter-utterance" file id."""
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def _safe_getitem(self, dataset, index):
        """Load example `index` from `dataset`; if loading fails, retry with
        the next example (wrapping) rather than crashing the loader.

        (bug fixes) The original inline versions of this logic used bare
        `except:` clauses, called `self.dataset3(index2)` instead of its
        `__getitem__`, and wrapped the dataset1 fallback with
        `len(self.dataset)` — an attribute that does not exist in
        train/not-small mode.
        """
        try:
            return dataset.__getitem__(index)
        except Exception:
            return dataset.__getitem__((index + 1) % len(dataset))

    def __getitem__(self, index):
        if self.train and not self.small:
            # Route the flat index into the virtual concatenation
            # [dataset1 | dataset2 | dataset3].
            size1, size2 = len(self.dataset1), len(self.dataset2)
            if index >= size1 + size2:
                wavform, sample_rate, _, speaker_id, _, _ = \
                    self._safe_getitem(self.dataset3, index - size1 - size2)
            elif index >= size1:
                wavform, sample_rate, _, speaker_id, _, _ = \
                    self._safe_getitem(self.dataset2, index - size1)
            else:
                wavform, sample_rate, _, speaker_id, _, _ = \
                    self._safe_getitem(self.dataset1, index)
        else:
            wavform, sample_rate, _, speaker_id, _, _ = \
                self._safe_getitem(self.dataset, index)
        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fix if validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length] if flip else
                      wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        # hop length chosen so the spectrogram is input_size frames wide
        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            padded,
            sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectral
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        if self.train and not self.small:
            return len(self.dataset1) + len(self.dataset2) + len(self.dataset3)
        else:
            return len(self.dataset)