def test_feature_set_builder(storage_fn):
    """
    Store features for the fixture recordings and validate every stored item.

    For each item the metadata, the shape of the loaded array, and the
    agreement with a fresh extraction (to 2 decimals) are checked.
    """
    recordings: RecordingSet = RecordingSet.from_json("test/fixtures/audio.json")
    extractor = Fbank(FbankConfig(sampling_rate=8000))

    # NOTE(review): the checks below are assumed to run after the storage
    # context manager has exited -- confirm against the intended layout.
    with storage_fn() as storage:
        builder = FeatureSetBuilder(feature_extractor=extractor, storage=storage)
        feature_set = builder.process_and_store_recordings(recordings=recordings)

    assert len(feature_set) == 6

    manifests = list(feature_set)

    # Properties shared by all stored items
    for item in manifests:
        # the feature type is the kaldi fbank
        assert item.type == "kaldi-fbank"
        # the frame count follows from duration and frame shift
        expected_frame_count = round(item.duration / item.frame_shift)
        assert item.num_frames == expected_frame_count
        # the feature dimension equals the configured number of filters
        assert item.num_features == builder.feature_extractor.config.num_filters
        # the storage type metadata matches the storage that was used
        assert item.storage_type == storage.name
        # the metadata agrees with the shape of the stored array
        stored = item.load()
        assert stored.shape[0] == item.num_frames
        assert stored.shape[1] == item.num_features
        # the stored values agree with features extracted on the spot
        source_recording = recordings[item.recording_id]
        samples = source_recording.load_audio(channels=item.channels)
        fresh = extractor.extract(
            samples=samples,
            sampling_rate=source_recording.sampling_rate,
        )
        np.testing.assert_almost_equal(stored, fresh, decimal=2)

    # The first two items come from 0.5 second recordings, the rest from 1.0 second ones
    for frame_count, duration, group in (
        (50, 0.5, manifests[:2]),
        (100, 1.0, manifests[2:]),
    ):
        for item in group:
            assert item.num_frames == frame_count
            assert item.duration == duration
def extract(
        recording_manifest: Pathlike,
        output_dir: Pathlike,
        feature_manifest: Optional[Pathlike],
        storage_type: str,
        lilcom_tick_power: int,
        root_dir: Optional[Pathlike],
        num_jobs: int
):
    """
    Extract features for recordings in a given AUDIO_MANIFEST.
    The features are stored in OUTPUT_DIR, with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    # Without a feature manifest, a default-configured Fbank is used.
    if feature_manifest is None:
        feature_extractor = Fbank()
    else:
        feature_extractor = FeatureExtractor.from_yaml(feature_manifest)

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Storage types with 'hdf5' in their name write to one file; others to a sub-path.
    if 'hdf5' in storage_type:
        storage_path = output_dir / 'feats.h5'
    else:
        storage_path = output_dir / 'storage'

    writer_type = get_writer(storage_type)
    with writer_type(storage_path, tick_power=lilcom_tick_power) as storage:
        builder = FeatureSetBuilder(feature_extractor=feature_extractor, storage=storage)
        builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs,
        )
def extract_cuts(
    cutset: Pathlike,
    output_cutset: Pathlike,
    storage_path: Pathlike,
    feature_manifest: Optional[Pathlike],
    storage_type: str,
    num_jobs: int,
):
    """
    Extract features for cuts in a given CUTSET manifest.
    The features are stored in STORAGE_PATH, and the output manifest
    with features is stored in OUTPUT_CUTSET.
    """
    from lhotse import CutSet

    cuts: CutSet = CutSet.from_file(cutset)

    # Without a feature manifest, a default-configured Fbank is used.
    if feature_manifest is None:
        feature_extractor = Fbank()
    else:
        feature_extractor = FeatureExtractor.from_yaml(feature_manifest)

    writer_type = get_writer(storage_type)
    cuts = cuts.compute_and_store_features(
        extractor=feature_extractor,
        storage_path=storage_path,
        num_jobs=num_jobs,
        storage_type=writer_type,
    )

    # Make sure the destination directory exists before writing the manifest.
    destination = Path(output_cutset)
    destination.parent.mkdir(parents=True, exist_ok=True)
    cuts.to_file(output_cutset)
def test_feature_mixer_handles_empty_array():
    """Adding an empty array to the mix leaves the base features unchanged."""
    # Treat it more like a test of "it runs" rather than "it works"
    sampling_rate = 16000
    timeline = np.linspace(0, 1, sampling_rate, dtype=np.float32)
    tone = np.sin(440.0 * timeline).reshape(1, -1)

    extractor = Fbank()
    base_feats = extractor.extract(tone, sampling_rate)

    mixer = FeatureMixer(
        feature_extractor=extractor,
        base_feats=base_feats,
        frame_shift=extractor.frame_shift,
    )
    mixer.add_to_mix(np.array([]), sampling_rate=sampling_rate)

    np.testing.assert_equal(mixer.mixed_feats, base_feats)
def test_on_the_fly_feature_extraction_unsupervised_dataset_with_augmentation(libri_cut_set):
    """Fetching the first item of an augmented on-the-fly dataset does not raise."""
    extractor = Fbank()
    reverb = WavAugmenter.create_predefined('reverb', sampling_rate=16000)
    dataset = DynamicUnsupervisedDataset(
        feature_extractor=extractor,
        cuts=libri_cut_set,
        augmenter=reverb,
    )
    # Just test that it runs
    dataset[0]
def feature_extractor() -> TorchaudioFeatureExtractor:
    """
    Build the feature extractor used for the TTS task.

    :return: An Fbank extractor whose config has ``num_mel_bins`` set to 80.
    """
    fbank = Fbank()
    # Override one field on the default config; the rest stays untouched.
    fbank.config.num_mel_bins = 80
    return fbank
def test_feature_mixer_handles_empty_array_with_offset():
    """Adding an empty array at an offset keeps the base features and pads the remainder."""
    # Treat it more like a test of "it runs" rather than "it works"
    sampling_rate = 16000
    timeline = np.linspace(0, 1, sampling_rate, dtype=np.float32)
    tone = np.sin(440.0 * timeline).reshape(1, -1)

    extractor = Fbank()
    base_feats = extractor.extract(tone, sampling_rate)

    mixer = FeatureMixer(
        feature_extractor=extractor,
        base_feats=base_feats,
        frame_shift=extractor.frame_shift,
    )
    mixer.add_to_mix(np.array([]), sampling_rate=sampling_rate, offset=0.5)

    mixed = mixer.mixed_feats
    head, tail = mixed[:100], mixed[100:]
    # time 0s - 1s: identical values
    np.testing.assert_equal(head, base_feats)
    # time 1s - 1.5s: padding
    np.testing.assert_equal(tail, -1000)
def test_on_the_fly_feature_extraction_unsupervised_dataset(libri_cut_set):
    """On-the-fly features for the first cut roughly match the reference dataset's features."""
    expected = UnsupervisedDataset(libri_cut_set)[0]
    on_the_fly_dataset = DynamicUnsupervisedDataset(feature_extractor=Fbank(), cuts=libri_cut_set)
    actual = on_the_fly_dataset[0]

    # Note: comparison to 1 decimal fails.
    # I'm assuming this is due to lilcom's compression.
    # Pytest outputs looks like the following:
    # E       Mismatched elements: 4 / 23000 (0.0174%)
    # E       Max absolute difference: 0.46469784
    # E       Max relative difference: 0.6171043
    # E        x: array([[-11.5, -11.4,  -9.9, ...,  -5.5,  -6.5,  -7.4],
    # E              [-13.2, -11.2,  -9.6, ...,  -5.6,  -6.5,  -7.6],
    # E              [-12. , -10.1, -10.1, ...,  -5.8,  -7. ,  -7.8],...
    # E        y: array([[-11.5, -11.4,  -9.9, ...,  -5.5,  -6.5,  -7.4],
    # E              [-13.2, -11.2,  -9.6, ...,  -5.6,  -6.5,  -7.6],
    # E              [-12. , -10.1, -10.1, ...,  -5.8,  -7. ,  -7.8],...
    np.testing.assert_array_almost_equal(expected, actual, decimal=0)
def test_feature_set_builder_with_augmentation():
    """
    Store augmented features for the fixture recordings and validate each stored item.

    The metadata and the shape of the loaded array are checked; the values are not
    compared with a fresh extraction.
    """
    recordings: RecordingSet = RecordingSet.from_json('test/fixtures/audio.json')
    augment_fn = WavAugmenter.create_predefined('pitch_reverb_tdrop', sampling_rate=8000)
    extractor = Fbank()

    with TemporaryDirectory() as tmp_dir:
        with LilcomFilesWriter(tmp_dir) as storage:
            builder = FeatureSetBuilder(
                feature_extractor=extractor,
                storage=storage,
                augment_fn=augment_fn,
            )
            feature_set = builder.process_and_store_recordings(recordings=recordings)

            assert len(feature_set) == 6

            manifests = list(feature_set)

            # Properties shared by all stored items
            for item in manifests:
                # the feature type is fbank
                assert item.type == 'fbank'
                # the frame count follows from duration and frame shift
                expected_frame_count = round(item.duration / item.frame_shift)
                assert item.num_frames == expected_frame_count
                # the feature dimension equals the configured number of mel bins
                assert item.num_features == builder.feature_extractor.config.num_mel_bins
                # the storage type metadata matches the storage that was used
                assert item.storage_type == storage.name
                # the metadata agrees with the shape of the stored array
                stored = item.load()
                assert stored.shape[0] == item.num_frames
                assert stored.shape[1] == item.num_features

            # The first two items come from 0.5 second recordings, the rest from 1.0 second ones
            for frame_count, duration, group in (
                (50, 0.5, manifests[:2]),
                (100, 1.0, manifests[2:]),
            ):
                for item in group:
                    assert item.num_frames == frame_count
                    assert item.duration == duration
def extract_cuts_batch(
    cutset: Pathlike,
    output_cutset: Pathlike,
    storage_path: Pathlike,
    feature_manifest: Optional[Pathlike],
    storage_type: str,
    num_jobs: int,
    batch_duration: Seconds,
):
    """
    Extract features for cuts in a given CUTSET manifest.
    The features are stored in STORAGE_PATH, and the output manifest
    with features is stored in OUTPUT_CUTSET.

    This version enables CUDA acceleration for feature extractors
    that support it (e.g., kaldifeat extractors).

    \b
    Example usage of kaldifeat fbank with CUDA:
        $ pip install kaldifeat  # note: ensure it's compiled with CUDA
        $ lhotse feat write-default-config -f kaldifeat-fbank feat.yml
        $ sed 's/device: cpu/device: cuda/' feat.yml > feat-cuda.yml
        $ lhotse feat extract-cuts-batch -f feat-cuda.yml cuts.jsonl cuts_with_feats.jsonl feats.h5
    """
    # NOTE: the sed example redirects its output into feat-cuda.yml; without the
    # redirect, sed would treat feat-cuda.yml as a second (missing) input file.
    from lhotse import CutSet

    cuts: CutSet = CutSet.from_file(cutset)

    # Without a feature manifest, a default-configured Fbank is used.
    if feature_manifest is None:
        feature_extractor = Fbank()
    else:
        feature_extractor = FeatureExtractor.from_yaml(feature_manifest)

    # ``num_jobs`` is forwarded as the ``num_workers`` argument of the batch API.
    cuts = cuts.compute_and_store_features_batch(
        extractor=feature_extractor,
        storage_path=storage_path,
        batch_duration=batch_duration,
        num_workers=num_jobs,
        storage_type=get_writer(storage_type),
    )

    # Make sure the destination directory exists before writing the manifest.
    Path(output_cutset).parent.mkdir(parents=True, exist_ok=True)
    cuts.to_file(output_cutset)
def extract(
        audio_manifest: Pathlike,
        output_dir: Pathlike,
        segmentation_manifest: Optional[Pathlike],
        augmentation: str,
        feature_manifest: Optional[Pathlike],
        compressed: bool,
        lilcom_tick_power: int,
        root_dir: Optional[Pathlike],
        num_jobs: int
):
    """
    Extract features for recordings in a given AUDIO_MANIFEST.
    The features are stored in OUTPUT_DIR, with one file per recording (or segment).
    """
    audio_set = RecordingSet.from_json(audio_manifest)

    # Without a feature manifest, a default-configured Fbank is used.
    if feature_manifest is None:
        feature_extractor = Fbank()
    else:
        feature_extractor = FeatureExtractor.from_yaml(feature_manifest)

    # TODO: to be used (actually, only the segmentation info will be used, and all supervision info will be ignored)
    supervision_set = None
    if segmentation_manifest is not None:
        supervision_set = SupervisionSet.from_json(segmentation_manifest)

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    augmenter = None
    if augmentation is not None:
        # The sampling rate of the first recording is the reference for the whole set.
        sampling_rate = next(iter(audio_set)).sampling_rate
        assert not any(rec.sampling_rate != sampling_rate for rec in audio_set), \
            "Wav augmentation effect chains expect all the recordings to have the same sampling rate at this time."
        augmenter = WavAugmenter.create_predefined(name=augmentation, sampling_rate=sampling_rate)

    builder = FeatureSetBuilder(
        feature_extractor=feature_extractor,
        output_dir=output_dir,
        root_dir=root_dir,
        augmenter=augmenter,
    )
    builder.process_and_store_recordings(
        recordings=audio_set,
        segmentation=None,  # TODO: implement and use
        compressed=compressed,
        lilcom_tick_power=lilcom_tick_power,
        num_jobs=num_jobs,
    )
def extract(
        recording_manifest: Pathlike,
        output_dir: Pathlike,
        augmentation: str,
        feature_manifest: Optional[Pathlike],
        storage_type: str,
        lilcom_tick_power: int,
        root_dir: Optional[Pathlike],
        num_jobs: int
):
    """
    Extract features for recordings in a given AUDIO_MANIFEST.
    The features are stored in OUTPUT_DIR, with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    # Without a feature manifest, a default-configured Fbank is used.
    if feature_manifest is None:
        feature_extractor = Fbank()
    else:
        feature_extractor = FeatureExtractor.from_yaml(feature_manifest)

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Storage types with 'hdf5' in their name write to one file; others to a sub-path.
    if 'hdf5' in storage_type:
        storage_path = output_dir / 'feats.h5'
    else:
        storage_path = output_dir / 'storage'

    augmenter = None
    if augmentation is not None:
        # The sampling rate of the first recording is the reference for the whole set.
        sampling_rate = next(iter(recordings)).sampling_rate
        assert not any(rec.sampling_rate != sampling_rate for rec in recordings), \
            "Wav augmentation effect chains expect all the recordings to have the same sampling rate at this time."
        augmenter = WavAugmenter.create_predefined(name=augmentation, sampling_rate=sampling_rate)

    writer_type = get_writer(storage_type)
    with writer_type(storage_path, tick_power=lilcom_tick_power) as storage:
        builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
            augmenter=augmenter,
        )
        builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs,
        )
def test_feature_set_builder(augmentation):
    """
    Store features for the fixture recordings and validate the resulting metadata.

    ``augmentation`` names a predefined augmentation chain, or is ``None``
    to run without an augmenter.
    """
    audio_set = RecordingSet.from_json('test/fixtures/audio.json')

    augmenter = None
    if augmentation is not None:
        augmenter = WavAugmenter.create_predefined(augmentation, sampling_rate=8000)

    with TemporaryDirectory() as output_dir:
        builder = FeatureSetBuilder(
            feature_extractor=Fbank(),
            output_dir=output_dir,
            augmenter=augmenter,
        )
        feature_set = builder.process_and_store_recordings(recordings=audio_set)

        assert len(feature_set) == 6

        manifests = list(feature_set)

        # Properties shared by all stored items
        for item in manifests:
            # fbank is the default feature type
            assert item.type == 'fbank'
            # the frame count follows from duration and frame shift
            expected_frame_count = round(item.duration / item.frame_shift)
            assert item.num_frames == expected_frame_count
            # the feature dimension equals the configured number of mel bins
            assert item.num_features == builder.feature_extractor.config.num_mel_bins
            # lilcom is the default storage type
            assert item.storage_type == 'lilcom'

        # The first two items come from 0.5 second recordings, the rest from 1.0 second ones
        for frame_count, duration, group in (
            (50, 0.5, manifests[:2]),
            (100, 1.0, manifests[2:]),
        ):
            for item in group:
                assert item.num_frames == frame_count
                assert item.duration == duration
frame_length=frame_length, frame_shift=frame_shift) == expected_num_frames) def test_add_feature_sets(): expected = DummyManifest(FeatureSet, begin_id=0, end_id=10) feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5) feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10) combined = feature_set_1 + feature_set_2 assert combined == expected @pytest.mark.parametrize( ["feature_extractor", "decimal", "exception_expectation"], [ (Fbank(FbankConfig(num_mel_bins=40)), 0, does_not_raise()), (Spectrogram(), -1, does_not_raise()), (Mfcc(), None, raises(ValueError)), ], ) def test_mixer(feature_extractor, decimal, exception_expectation): # Treat it more like a test of "it runs" rather than "it works" sr = 8000 t = np.linspace(0, 1, 8000, dtype=np.float32) x1 = np.sin(440.0 * t).reshape(1, -1) x2 = np.sin(55.0 * t).reshape(1, -1) f1 = feature_extractor.extract(x1, sr) f2 = feature_extractor.extract(x2, sr) with exception_expectation: mixer = FeatureMixer(
frame_length=frame_length, frame_shift=frame_shift) == expected_num_frames) def test_add_feature_sets(): expected = DummyManifest(FeatureSet, begin_id=0, end_id=10) feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5) feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10) combined = feature_set_1 + feature_set_2 assert combined == expected @pytest.mark.parametrize( ["feature_extractor", "decimal", "exception_expectation"], [ (Fbank(FbankConfig(num_filters=40, sampling_rate=8000)), 0, does_not_raise()), (Spectrogram(), -1, does_not_raise()), (Mfcc(MfccConfig(sampling_rate=8000)), None, raises(ValueError)), ], ) def test_mixer(feature_extractor, decimal, exception_expectation): # Treat it more like a test of "it runs" rather than "it works" sr = 8000 t = np.linspace(0, 1, 8000, dtype=np.float32) x1 = np.sin(440.0 * t).reshape(1, -1) x2 = np.sin(55.0 * t).reshape(1, -1) f1 = feature_extractor.extract(x1, sr) f2 = feature_extractor.extract(x2, sr) with exception_expectation: mixer = FeatureMixer(
def test_feature_extractor_generic_deserialization():
    """An extractor written to YAML and read back via the base class keeps its config."""
    original = Fbank()
    with NamedTemporaryFile() as yaml_file:
        yaml_path = yaml_file.name
        original.to_yaml(yaml_path)
        restored = FeatureExtractor.from_yaml(yaml_path)
        assert restored.config == original.config
time_diff=time_diff, frame_length=frame_length, frame_shift=frame_shift) == expected_num_frames def test_add_feature_sets(): expected = DummyManifest(FeatureSet, begin_id=0, end_id=10) feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5) feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10) combined = feature_set_1 + feature_set_2 assert combined == expected @pytest.mark.parametrize( ['feature_extractor', 'decimal', 'exception_expectation'], [ (Fbank(), 0, does_not_raise()), (Spectrogram(), -1, does_not_raise()), (Mfcc(), None, raises(ValueError)), ]) def test_mixer(feature_extractor, decimal, exception_expectation): # Treat it more like a test of "it runs" rather than "it works" t = np.linspace(0, 1, 8000, dtype=np.float32) x1 = np.sin(440.0 * t).reshape(1, -1) x2 = np.sin(55.0 * t).reshape(1, -1) f1 = feature_extractor.extract(x1, 8000) f2 = feature_extractor.extract(x2, 8000) with exception_expectation: mixer = FeatureMixer( feature_extractor=feature_extractor, base_feats=f1,