Example #1
    def __init__(
        self,
        root=DATA_ROOTS['librispeech'],
        train=True,
        small=False,
        spectral_transforms=False,
        wavform_transforms=True,
        test_url='dev-clean',
        max_length=150526,
        input_size=224,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # choose to apply augmentation at either the waveform level or the spectral level
        assert not (spectral_transforms and wavform_transforms)
        if train:
            if small:
                self.dataset = LIBRISPEECH(root,
                                           url='train-clean-100',
                                           download=True,
                                           folder_in_archive='LibriSpeech')
            else:
                self.dataset1 = LIBRISPEECH(root,
                                            url='train-clean-100',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset2 = LIBRISPEECH(root,
                                            url='train-clean-360',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset3 = LIBRISPEECH(root,
                                            url='train-other-500',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
        else:
            self.dataset = LIBRISPEECH(root,
                                       url=test_url,
                                       download=True,
                                       folder_in_archive='LibriSpeech')

        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.small = small
        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers
        self.input_size = input_size
        self.FILTER_SIZE = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev
def process(args):
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
    # Extract features
    feature_root = out_root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            extract_fbank_features(
                wav, sample_rate, feature_root / f"{sample_id}.npy"
            )
    # Pack features into ZIP
    zip_path = out_root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(out_root.as_posix(), url=split)
        for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(audio_paths[sample_id])
            manifest["n_frames"].append(audio_lengths[sample_id])
            manifest["tgt_text"].append(utt.lower())
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv"
        )
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        gen_vocab(
            Path(f.name),
            out_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        out_root,
        spm_filename=spm_filename_prefix + ".model",
        specaugment_policy="ld"
    )
    # Clean up
    shutil.rmtree(feature_root)
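For reference, a minimal, hypothetical command-line driver for the process() function above; the only attributes process() reads from args are output_root, vocab_type and vocab_size, so a matching parser could look like this (the real script defines its own arguments):

import argparse


def main():
    # Hypothetical parser; names mirror the attributes process() reads from args.
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-root", "-o", required=True, type=str)
    parser.add_argument("--vocab-type", default="unigram", type=str,
                        choices=["bpe", "unigram", "char"])
    parser.add_argument("--vocab-size", default=10000, type=int)
    process(parser.parse_args())


if __name__ == "__main__":
    main()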
def process(args):
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, "fbank80")
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            extract_fbank_features(wav, sample_rate,
                                   op.join(feature_root, f"{sample_id}.npy"))
    # Pack features into ZIP
    zip_filename = "fbank80.zip"
    zip_path = op.join(args.output_root, zip_filename)
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(utt)
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f"{split}.tsv"))
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        gen_vocab(
            f.name,
            op.join(args.output_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(args.output_root,
                    spm_filename_prefix + ".model",
                    specaugment_policy="ld")
    # Clean up
    shutil.rmtree(feature_root)
Example #4
def download_dataset(folder="./data"):
	"""
	Load LIBRISPEECH train-clean-100 and test-clean from the `folder` directory
	(uncomment download=True below to fetch them if they are not already present)
	:param folder: the folder where the data is stored
	:return train_dataset: the training dataset
	:return validation_dataset: the validation dataset
	"""
	if not os.path.isdir(folder):
		os.makedirs(folder)
	train_dataset = LIBRISPEECH(folder, url="train-clean-100",) #download=True)
	validation_dataset = LIBRISPEECH(folder, url="test-clean",) #download=True)
	return train_dataset, validation_dataset
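A brief usage sketch for the helper above (assuming the LibriSpeech archives are already extracted under ./data, since download=True is commented out); items are unpacked the same way as in the other examples on this page:

# './data' matches the default folder argument; the archives must already exist there.
train_dataset, validation_dataset = download_dataset("./data")
# Inspect one utterance from the training split.
waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = train_dataset[0]
print(waveform.shape, sample_rate)  # e.g. torch.Size([1, N]) at 16000 Hz
print(speaker_id, chapter_id, utterance_id, transcript[:60])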
Example #5
def process(args):
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f'Fetching split {split}...')
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print('Extracting log mel filter bank features...')
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            extract_fbank_features(wav, sample_rate,
                                   op.join(feature_root, f'{sample_id}.npy'))
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(args.output_root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            manifest['id'].append(sample_id)
            manifest['audio'].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(utt)
            manifest['speaker'].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f'{split}.tsv'))
        if split.startswith('train'):
            train_text.extend(manifest['tgt_text'])
    # Generate vocab
    vocab_size = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        gen_vocab(f.name, op.join(args.output_root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(args.output_root,
                    spm_filename_prefix + '.model',
                    specaugment_policy='ld')
    # Clean up
    shutil.rmtree(feature_root)
    def create(tag):

        if isinstance(tag, str):
            data = LIBRISPEECH(
                root, tag, folder_in_archive=folder_in_archive, download=False)
        else:
            data = torch.utils.data.ConcatDataset([LIBRISPEECH(
                root, t, folder_in_archive=folder_in_archive, download=False) for t in tag])

        data = Processed(process_datapoint, data)
        data = diskcache_iterator(data)
        # data = MapMemoryCache(data)
        return data
    def __init__(
        self,
        root,
        train=True,
        spectral_transforms=False,
        wavform_transforms=True,
        train_urls=[
            'train-clean-100',
            'train-clean-360',
            'train-other-500',
        ],
        test_url='dev-clean',
        max_length=150526,
        input_size=112,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # choose to apply augmentation at either the waveform level or the spectral level
        assert not (spectral_transforms and wavform_transforms)

        if train:
            datasets = []
            for train_url in train_urls:
                dataset = LIBRISPEECH(root,
                                      url=train_url,
                                      download=True,
                                      folder_in_archive='LibriSpeech')
                datasets.append(dataset)
            self.dataset = ConcatDatasets(datasets)
        else:
            self.dataset = LIBRISPEECH(root,
                                       url=test_url,
                                       download=True,
                                       folder_in_archive='LibriSpeech')

        self.wavform_transforms = wavform_transforms
        self.spectral_transforms = spectral_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev
        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
    def __init__(
        self,
        root,
        train=True,
        spectral_transforms=False,
        wavform_transforms=False,
        max_length=150526,
        input_size=112,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        assert not (spectral_transforms and wavform_transforms)
        self.dataset = LIBRISPEECH(root,
                                   url='dev-clean',
                                   download=True,
                                   folder_in_archive='LibriSpeech')

        all_speaker_ids = self.get_speaker_ids(self.dataset)
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers

        self.indices = self.train_test_split(self.dataset,
                                             all_speaker_ids,
                                             train=train)
        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev
Example #9
def main(args):
    corpus_root = Path(args.corpus_root).absolute()
    output_root = Path(args.output_root).absolute()
    corpus_root.mkdir(exist_ok=True)
    output_root.mkdir(exist_ok=True)
    for split in SPLITS:
        logger.info(f"Preparing data for split {split}...")
        output_dir = output_root / split.replace("-", "_")
        output_dir.mkdir(exist_ok=True)
        wave_file = output_dir / "wav.txt"
        text_file = output_dir / "text.txt"
        if os.path.exists(wave_file) and os.path.exists(text_file):
            logger.info(
                f"Both {wave_file} and {text_file} exist, skip regenerating")
            continue
        dataset = LIBRISPEECH(corpus_root.as_posix(),
                              url=split,
                              folder_in_archive=args.folder_in_archive,
                              download=args.download)
        with open(wave_file, "w", encoding="utf-8") as wave_f, open(
                text_file, "w", encoding="utf-8") as text_f:
            for data_tuple in tqdm(dataset):
                if len(data_tuple) == 6:  # torchaudio=0.7.0
                    # (waveform, sample_rate, text, speaker_id, chapter_id, utterance_idx)
                    text, speaker_id, chapter_id, utterance_idx = data_tuple[
                        2], data_tuple[3], data_tuple[4], data_tuple[5]
                else:  # torchaudio>=0.8.0
                    # (waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_idx)
                    assert len(data_tuple) == 7
                    text, speaker_id, chapter_id, utterance_idx = data_tuple[
                        3], data_tuple[4], data_tuple[5], data_tuple[6]

                utterance_idx = str(utterance_idx).zfill(4)
                utterance_id = f"{speaker_id}-{chapter_id}-{utterance_idx}"
                utterance_path = os.path.join(corpus_root.as_posix(),
                                              args.folder_in_archive, split,
                                              str(speaker_id), str(chapter_id),
                                              utterance_id)

                print(f"{utterance_id} {utterance_path}.flac", file=wave_f)
                print(f"{utterance_id} {text}", file=text_f)
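A hypothetical entry point for main() above; the parser below only defines the attributes main() reads from args (corpus_root, output_root, folder_in_archive, download):

import argparse


def parse_args():
    # Hypothetical parser; argparse maps --corpus-root to args.corpus_root, etc.
    parser = argparse.ArgumentParser()
    parser.add_argument("--corpus-root", required=True, type=str)
    parser.add_argument("--output-root", required=True, type=str)
    parser.add_argument("--folder-in-archive", default="LibriSpeech", type=str)
    parser.add_argument("--download", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    main(parse_args())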
Example #10
    def create(tags, cache=True):

        if isinstance(tags, str):
            tags = [tags]
        if isinstance(transforms, list):
            transform_list = transforms
        else:
            transform_list = [transforms]

        data = torch.utils.data.ConcatDataset(
            [
                Processed(
                    LIBRISPEECH(
                        root, tag, folder_in_archive=folder_in_archive, download=False,
                    ),
                    transform,
                    language_model.encode,
                )
                for tag, transform in zip(tags, transform_list)
            ]
        )

        data = MapMemoryCache(data)
        return data
parser.add_argument('--folds_eval', default=[8,9], type=int, nargs='+', help='List of BIRD folds for validation')
parser.add_argument('--folds_test', default=[10], type=int, nargs='+', help='List of BIRD folds for test')

args = parser.parse_args()

# This holds the RIR dataset for training, validation and testing

folder_in_archive_rir = 'Bird'
rir_train = BIRD(root=args.root, folder_in_archive=folder_in_archive_rir, folds=args.folds_train)
rir_eval = BIRD(root=args.root, folder_in_archive=folder_in_archive_rir, folds=args.folds_eval)
rir_test = BIRD(root=args.root, folder_in_archive=folder_in_archive_rir, folds=args.folds_test)

# This holds the speech dataset for training, validation and testing

folder_in_archive_speech = 'LibriSpeech'
speech_train = LIBRISPEECH(root=args.root, folder_in_archive=folder_in_archive_speech, url='train-clean-100', download=True)
speech_eval = LIBRISPEECH(root=args.root, folder_in_archive=folder_in_archive_speech, url='dev-clean', download=True)
speech_test = LIBRISPEECH(root=args.root, folder_in_archive=folder_in_archive_speech, url='test-clean', download=True)

# We can simply create augmented data with this training dataset

augmented_train = RT60(rir=rir_train, speech=speech_train, samples_count=10000)

Ys, rt60 = augmented_train[5]

Y1 = torch.squeeze(Ys[0,:,:,:], dim=0)
Y2 = torch.squeeze(Ys[1,:,:,:], dim=0)

print(rt60)

plt.subplot(2,1,1)
class LibriSpeech(Dataset):
    def __init__(
        self,
        root,
        train=True,
        spectral_transforms=False,
        wavform_transforms=True,
        train_urls=[
            'train-clean-100',
            'train-clean-360',
            'train-other-500',
        ],
        test_url='dev-clean',
        max_length=150526,
        input_size=112,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # choose to apply augmentation at either the waveform level or the spectral level
        assert not (spectral_transforms and wavform_transforms)

        if train:
            datasets = []
            for train_url in train_urls:
                dataset = LIBRISPEECH(root,
                                      url=train_url,
                                      download=True,
                                      folder_in_archive='LibriSpeech')
                datasets.append(dataset)
            self.dataset = ConcatDatasets(datasets)
        else:
            self.dataset = LIBRISPEECH(root,
                                       url=test_url,
                                       download=True,
                                       folder_in_archive='LibriSpeech')

        self.wavform_transforms = wavform_transforms
        self.spectral_transforms = spectral_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev
        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers

    def get_speaker_ids(self):
        if self.train:
            speaker_ids = []
            for dataset in self.dataset.datasets:
                speaker_ids_i = self._get_speaker_ids(dataset)
                speaker_ids.append(speaker_ids_i)
            return np.concatenate(speaker_ids)
        else:
            return self._get_speaker_ids(self.dataset)

    def _get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        if index in BAD_LIBRISPEECH_INDICES:
            index = index + 1

        wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
            index)

        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fixed choice when not training)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length]
                      if flip else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        spectrum = librosa.feature.melspectrogram(
            y=padded,
            sr=sample_rate,
            hop_length=LIBRISPEECH_HOP_LENGTH_DICT[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectral
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        return len(self.dataset)
class LibriSpeechTransfer(Dataset):
    """
    Divide the dev-clean split of LibriSpeech into train and 
    test splits by speaker so we can train a logreg fairly.
    """
    def __init__(
        self,
        root,
        train=True,
        spectral_transforms=False,
        wavform_transforms=False,
        max_length=150526,
        input_size=112,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        assert not (spectral_transforms and wavform_transforms)
        self.dataset = LIBRISPEECH(root,
                                   url='dev-clean',
                                   download=True,
                                   folder_in_archive='LibriSpeech')

        all_speaker_ids = self.get_speaker_ids(self.dataset)
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers

        self.indices = self.train_test_split(self.dataset,
                                             all_speaker_ids,
                                             train=train)
        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

    def get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def train_test_split(self, dataset, speaker_ids, train=True):
        rs = np.random.RandomState(42)  # fix seed so reproducible splitting

        unique_speaker_ids = sorted(set(speaker_ids))
        unique_speaker_ids = np.array(unique_speaker_ids)

        # train test split to ensure the 80/20 splits
        train_indices, test_indices = [], []
        for speaker_id in unique_speaker_ids:
            speaker_indices = np.where(speaker_ids == speaker_id)[0]
            size = len(speaker_indices)
            rs.shuffle(speaker_indices)
            train_size = int(0.8 * size)
            train_indices.extend(speaker_indices[:train_size].tolist())
            test_indices.extend(speaker_indices[train_size:].tolist())

        return train_indices if train else test_indices

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        # NOTE: overwrite index with our custom indices mapping examples
        #       to the training and test splits
        index = self.indices[index]

        try:
            wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
                index)
        except:
            index2 = (index + 1) % len(self.dataset)
            wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
                index2)

        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fix if validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length]
                      if flip else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            y=padded,
            sr=sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectral
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        return len(self.indices)
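A short, hypothetical usage sketch for LibriSpeechTransfer: because train_test_split seeds its RandomState with 42, the train=True and train=False instances carve dev-clean into complementary utterance sets that share the same speaker label space, which is what makes the linear-probe evaluation fair.

# './data' is an assumed root; dev-clean is downloaded there on first use.
train_set = LibriSpeechTransfer(root='./data', train=True)
test_set = LibriSpeechTransfer(root='./data', train=False)

assert set(train_set.indices).isdisjoint(test_set.indices)  # no shared utterances
assert train_set.num_labels == test_set.num_labels          # same speaker labels

_, spectrum, speaker_label = train_set[0]
print(spectrum.shape, speaker_label)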
Example #14
 def __init__(self, path, subset, percent):
     self.libri_dataset = LIBRISPEECH(path, url=subset, download=False)
     if percent != 1.0:
         self.libri_dataset = get_subset(self.libri_dataset, percent)
     self.path = path
Example #15
class LibriSpeech(Dataset):
    def __init__(
        self,
        root=DATA_ROOTS['librispeech'],
        train=True,
        small=False,
        spectral_transforms=False,
        wavform_transforms=True,
        test_url='dev-clean',
        max_length=150526,
        input_size=224,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # choose to apply augmentation at either the waveform level or the spectral level
        assert not (spectral_transforms and wavform_transforms)
        if train:
            if small:
                self.dataset = LIBRISPEECH(root,
                                           url='train-clean-100',
                                           download=True,
                                           folder_in_archive='LibriSpeech')
            else:
                self.dataset1 = LIBRISPEECH(root,
                                            url='train-clean-100',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset2 = LIBRISPEECH(root,
                                            url='train-clean-360',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset3 = LIBRISPEECH(root,
                                            url='train-other-500',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
        else:
            self.dataset = LIBRISPEECH(root,
                                       url=test_url,
                                       download=True,
                                       folder_in_archive='LibriSpeech')

        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.small = small
        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers
        self.input_size = input_size
        self.FILTER_SIZE = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

    def get_speaker_ids(self):
        if self.train and not self.small:
            speaker_ids_1 = self._get_speaker_ids(self.dataset1)
            speaker_ids_2 = self._get_speaker_ids(self.dataset2)
            speaker_ids_3 = self._get_speaker_ids(self.dataset3)
            return np.concatenate(
                [speaker_ids_1, speaker_ids_2, speaker_ids_3])
        else:
            return self._get_speaker_ids(self.dataset)

    def _get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):

        if self.train and not self.small:
            if index >= (len(self.dataset1) + len(self.dataset2)):
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset3.__getitem__(index - len(self.dataset1) - len(self.dataset2))
                except:
                    index2 = (index - len(self.dataset1) - len(self.dataset2) +
                              1) % len(self.dataset3)
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset3.__getitem__(index2)
            elif index >= len(self.dataset1):
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset2.__getitem__(index - len(self.dataset1))
                except:
                    index2 = (index - len(self.dataset1) + 1) % len(
                        self.dataset2)
                    wavform, sample_rate, _, speaker_id, _, _ = self.dataset2.__getitem__(
                        index2)
            else:
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = self.dataset1.__getitem__(
                        index)
                except:
                    index2 = (index + 1) % len(self.dataset1)
                    wavform, sample_rate, _, speaker_id, _, _ = self.dataset1.__getitem__(
                        index2)
        else:
            try:
                wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
                    index)
            except:
                index2 = (index + 1) % len(self.dataset)
                wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
                    index2)

        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # pad to 150k frames
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fixed choice when not training)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length]
                      if flip else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            y=padded,
            sr=sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )
        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectral
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        if self.train and not self.small:
            return len(self.dataset1) + len(self.dataset2) + len(self.dataset3)
        else:
            return len(self.dataset)
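Since every item from the class above is an (index, log-mel spectrum, contiguous speaker label) triple and all spectra share one shape (waveforms are cut or zero-padded to max_length), the default DataLoader collation works; a small, hypothetical sketch:

from torch.utils.data import DataLoader

# small=True restricts training data to train-clean-100; root defaults to DATA_ROOTS['librispeech'].
dataset = LibriSpeech(train=True, small=True)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=2)

indices, spectra, speaker_labels = next(iter(loader))
print(spectra.shape)       # (8, 1, input_size, n_frames)
print(speaker_labels[:4])  # labels remapped through speaker_id_map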
Example #16
def get_dataset(datadir, data_url):
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    dataset = LIBRISPEECH(root=datadir, url=data_url, download=True)
    return dataset
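Example usage of the helper above; the split name is any of torchaudio's LIBRISPEECH URLs (the ./data path is hypothetical):

train_data = get_dataset("./data", "train-clean-100")
dev_data = get_dataset("./data", "dev-clean")
print(len(train_data), len(dev_data))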