Example 1
def test_feature_set_builder(storage_fn):
    recordings: RecordingSet = RecordingSet.from_json(
        "test/fixtures/audio.json")
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with storage_fn() as storage:
        builder = FeatureSetBuilder(
            feature_extractor=extractor,
            storage=storage,
        )
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

    assert len(feature_set) == 6

    feature_infos = list(feature_set)

    # Assert the properties shared by all features
    for features in feature_infos:
        # assert that fbank is the default feature type
        assert features.type == "kaldi-fbank"
        # assert that num_frames is consistent with duration and frame_shift
        assert features.num_frames == round(features.duration /
                                            features.frame_shift)
        # assert that num_features is preserved
        assert features.num_features == builder.feature_extractor.config.num_filters
        # assert that the storage type metadata matches
        assert features.storage_type == storage.name
        # assert that the metadata is consistent with the data shapes
        arr = features.load()
        assert arr.shape[0] == features.num_frames
        assert arr.shape[1] == features.num_features
        # assert that the stored features are the same as the "freshly extracted" features
        recording = recordings[features.recording_id]
        expected = extractor.extract(
            samples=recording.load_audio(channels=features.channels),
            sampling_rate=recording.sampling_rate,
        )
        np.testing.assert_almost_equal(arr, expected, decimal=2)

    # Assert the properties for recordings of duration 0.5 seconds
    for features in feature_infos[:2]:
        assert features.num_frames == 50
        assert features.duration == 0.5

    # Assert the properties for recordings of duration 1.0 seconds
    for features in feature_infos[2:]:
        assert features.num_frames == 100
        assert features.duration == 1.0
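
The test above receives a storage_fn fixture. Below is a minimal sketch of such a fixture, assuming lhotse's LilcomFilesWriter and a temporary directory; the real suite presumably parametrizes this over several storage writers, which is why the test checks storage.name rather than a concrete writer type.

# Hypothetical storage_fn fixture compatible with the test above (a sketch, not
# the actual fixture): yields a lilcom files writer rooted in a temporary directory.
from contextlib import contextmanager
from tempfile import TemporaryDirectory

import pytest
from lhotse import LilcomFilesWriter


@pytest.fixture
def storage_fn():
    @contextmanager
    def _storage():
        with TemporaryDirectory() as d, LilcomFilesWriter(d) as writer:
            yield writer

    return _storage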
Example 2
def extract(recording_manifest: Pathlike, output_dir: Pathlike,
            feature_manifest: Optional[Pathlike], storage_type: str,
            lilcom_tick_power: int, root_dir: Optional[Pathlike],
            num_jobs: int):
    """
    Extract features for recordings in a given RECORDING_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    storage_path = output_dir / 'feats.h5' if 'hdf5' in storage_type else output_dir / 'storage'

    with get_writer(storage_type)(storage_path,
                                  tick_power=lilcom_tick_power) as storage:
        feature_set_builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
        )
        feature_set_builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs)
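
For reference, the same pipeline can be driven directly from Python without the CLI wrapper. The sketch below mirrors the function body above; the paths, the lilcom_hdf5 writer name, and the tick power value are illustrative assumptions.

# Minimal sketch of the equivalent direct API usage (illustrative paths and values).
from lhotse import Fbank, FeatureSetBuilder, RecordingSet
from lhotse.features.io import get_writer

recordings = RecordingSet.from_json('data/recordings.json')
extractor = Fbank()
with get_writer('lilcom_hdf5')('exp/feats.h5', tick_power=-5) as storage:
    builder = FeatureSetBuilder(feature_extractor=extractor, storage=storage)
    builder.process_and_store_recordings(
        recordings=recordings,
        output_manifest='exp/feature_manifest.json.gz',
        num_jobs=4,
    )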
Example 3
def extract_cuts(
    cutset: Pathlike,
    output_cutset: Pathlike,
    storage_path: Pathlike,
    feature_manifest: Optional[Pathlike],
    storage_type: str,
    num_jobs: int,
):
    """
    Extract features for cuts in a given CUTSET manifest.
    The features are stored in STORAGE_PATH, and the output manifest
    with features is stored in OUTPUT_CUTSET.
    """
    from lhotse import CutSet

    cuts: CutSet = CutSet.from_file(cutset)
    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())
    cuts = cuts.compute_and_store_features(
        extractor=feature_extractor,
        storage_path=storage_path,
        num_jobs=num_jobs,
        storage_type=get_writer(storage_type),
    )
    Path(output_cutset).parent.mkdir(parents=True, exist_ok=True)
    cuts.to_file(output_cutset)
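
A short follow-up that might be used to sanity-check the result; the path is illustrative, and has_features is the per-cut flag lhotse sets once features are attached.

# Illustrative check of the output manifest written above (path is made up).
from lhotse import CutSet

cuts_with_feats = CutSet.from_file('exp/cuts_with_feats.jsonl.gz')
assert all(cut.has_features for cut in cuts_with_feats)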
Example 4
def test_feature_mixer_handles_empty_array():
    # Treat it more like a test of "it runs" rather than "it works"
    sr = 16000
    t = np.linspace(0, 1, sr, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)

    fe = Fbank()
    f1 = fe.extract(x1, sr)
    mixer = FeatureMixer(
        feature_extractor=fe,
        base_feats=f1,
        frame_shift=fe.frame_shift,
    )
    mixer.add_to_mix(np.array([]), sampling_rate=sr)

    fmix_feat = mixer.mixed_feats
    np.testing.assert_equal(fmix_feat, f1)
Example 5
def test_on_the_fly_feature_extraction_unsupervised_dataset_with_augmentation(libri_cut_set):
    tested_dataset = DynamicUnsupervisedDataset(
        feature_extractor=Fbank(),
        cuts=libri_cut_set,
        augmenter=WavAugmenter.create_predefined('reverb', sampling_rate=16000)
    )
    # Just test that it runs
    tested_feats = tested_dataset[0]
Example 6
def feature_extractor() -> TorchaudioFeatureExtractor:
    """
    Set up the feature extractor for TTS task.
    :return: A feature extractor with custom parameters.
    """
    feature_extractor = Fbank()
    feature_extractor.config.num_mel_bins = 80

    return feature_extractor
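
The same setting can presumably be passed at construction time instead of mutating the config afterwards. Note that the field name differs across lhotse versions: the torchaudio-based Fbank used here calls it num_mel_bins, while the kaldi-based extractor seen in Example 1 and Example 15 calls it num_filters.

# Hedged sketch: configuring the filter-bank size up front.
from lhotse import Fbank, FbankConfig

feature_extractor = Fbank(FbankConfig(num_mel_bins=80))   # older, torchaudio-based API
# feature_extractor = Fbank(FbankConfig(num_filters=80))  # newer, kaldi-based API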
Example 7
def test_feature_mixer_handles_empty_array_with_offset():
    # Treat it more like a test of "it runs" rather than "it works"
    sr = 16000
    t = np.linspace(0, 1, sr, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)

    fe = Fbank()
    f1 = fe.extract(x1, sr)
    mixer = FeatureMixer(
        feature_extractor=fe,
        base_feats=f1,
        frame_shift=fe.frame_shift,
    )
    mixer.add_to_mix(np.array([]), sampling_rate=sr, offset=0.5)

    fmix_feat = mixer.mixed_feats
    # time 0s - 1s: identical values
    np.testing.assert_equal(fmix_feat[:100], f1)
    # time 1s - 1.5s: padding
    np.testing.assert_equal(fmix_feat[100:], -1000)
Example 8
def test_on_the_fly_feature_extraction_unsupervised_dataset(libri_cut_set):
    ref_dataset = UnsupervisedDataset(libri_cut_set)
    tested_dataset = DynamicUnsupervisedDataset(feature_extractor=Fbank(),
                                                cuts=libri_cut_set)
    ref_feats = ref_dataset[0]
    tested_feats = tested_dataset[0]
    # Note: comparison to 1 decimal fails.
    #       I'm assuming this is due to lilcom's compression.
    #       The pytest output looks like the following:
    # E       Mismatched elements: 4 / 23000 (0.0174%)
    # E       Max absolute difference: 0.46469784
    # E       Max relative difference: 0.6171043
    # E        x: array([[-11.5, -11.4,  -9.9, ...,  -5.5,  -6.5,  -7.4],
    # E              [-13.2, -11.2,  -9.6, ...,  -5.6,  -6.5,  -7.6],
    # E              [-12. , -10.1, -10.1, ...,  -5.8,  -7. ,  -7.8],...
    # E        y: array([[-11.5, -11.4,  -9.9, ...,  -5.5,  -6.5,  -7.4],
    # E              [-13.2, -11.2,  -9.6, ...,  -5.6,  -6.5,  -7.6],
    # E              [-12. , -10.1, -10.1, ...,  -5.8,  -7. ,  -7.8],...
    np.testing.assert_array_almost_equal(ref_feats, tested_feats, decimal=0)
Example 9
def test_feature_set_builder_with_augmentation():
    recordings: RecordingSet = RecordingSet.from_json(
        'test/fixtures/audio.json')
    augment_fn = WavAugmenter.create_predefined('pitch_reverb_tdrop',
                                                sampling_rate=8000)
    extractor = Fbank()
    with TemporaryDirectory() as d, LilcomFilesWriter(d) as storage:
        builder = FeatureSetBuilder(feature_extractor=extractor,
                                    storage=storage,
                                    augment_fn=augment_fn)
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

        assert len(feature_set) == 6

        feature_infos = list(feature_set)

        # Assert the properties shared by all features
        for features in feature_infos:
            # assert that fbank is the default feature type
            assert features.type == 'fbank'
            # assert that num_frames is consistent with duration and frame_shift
            assert features.num_frames == round(features.duration /
                                                features.frame_shift)
            # assert that num_features is preserved
            assert features.num_features == builder.feature_extractor.config.num_mel_bins
            # assert that the storage type metadata matches
            assert features.storage_type == storage.name
            # assert that the metadata is consistent with the data shapes
            arr = features.load()
            assert arr.shape[0] == features.num_frames
            assert arr.shape[1] == features.num_features

        # Assert the properties for recordings of duration 0.5 seconds
        for features in feature_infos[:2]:
            assert features.num_frames == 50
            assert features.duration == 0.5

        # Assert the properties for recordings of duration 1.0 seconds
        for features in feature_infos[2:]:
            assert features.num_frames == 100
            assert features.duration == 1.0
Example 10
def extract_cuts_batch(
    cutset: Pathlike,
    output_cutset: Pathlike,
    storage_path: Pathlike,
    feature_manifest: Optional[Pathlike],
    storage_type: str,
    num_jobs: int,
    batch_duration: Seconds,
):
    """
    Extract features for cuts in a given CUTSET manifest.
    The features are stored in STORAGE_PATH, and the output manifest
    with features is stored in OUTPUT_CUTSET.

    This version enables CUDA acceleration for feature extractors
    that support it (e.g., kaldifeat extractors).

    \b
    Example usage of kaldifeat fbank with CUDA:

        $ pip install kaldifeat  # note: ensure it's compiled with CUDA

        $ lhotse feat write-default-config -f kaldifeat-fbank feat.yml

        $ sed 's/device: cpu/device: cuda/' feat.yml > feat-cuda.yml

        $ lhotse feat extract-cuts-batch -f feat-cuda.yml cuts.jsonl cuts_with_feats.jsonl feats.h5
    """
    from lhotse import CutSet

    cuts: CutSet = CutSet.from_file(cutset)
    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())
    cuts = cuts.compute_and_store_features_batch(
        extractor=feature_extractor,
        storage_path=storage_path,
        batch_duration=batch_duration,
        num_workers=num_jobs,
        storage_type=get_writer(storage_type),
    )
    Path(output_cutset).parent.mkdir(parents=True, exist_ok=True)
    cuts.to_file(output_cutset)
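
The batch-oriented pipeline can likewise be driven directly from Python. The sketch below mirrors the function body above; the paths, the writer name, and the batch_duration value are illustrative assumptions, and GPU use depends on the extractor config (e.g. a kaldifeat config with device: cuda).

# Minimal sketch of the equivalent direct API usage (illustrative paths and values).
from lhotse import CutSet, FeatureExtractor
from lhotse.features.io import get_writer

cuts = CutSet.from_file('data/cuts.jsonl.gz')
extractor = FeatureExtractor.from_yaml('feat-cuda.yml')  # e.g. the CUDA config from the docstring
cuts = cuts.compute_and_store_features_batch(
    extractor=extractor,
    storage_path='exp/feats.h5',
    batch_duration=600.0,   # max seconds of audio processed per batch
    num_workers=4,
    storage_type=get_writer('lilcom_hdf5'),
)
cuts.to_file('exp/cuts_with_feats.jsonl.gz')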
Example 11
def extract(audio_manifest: Pathlike, output_dir: Pathlike,
            segmentation_manifest: Optional[Pathlike], augmentation: str,
            feature_manifest: Optional[Pathlike], compressed: bool,
            lilcom_tick_power: int, root_dir: Optional[Pathlike],
            num_jobs: int):
    """
    Extract features for recordings in a given AUDIO_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    audio_set = RecordingSet.from_json(audio_manifest)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())

    # TODO: to be used (actually, only the segmentation info will be used, and all supervision info will be ignored)
    supervision_set = (SupervisionSet.from_json(segmentation_manifest)
                       if segmentation_manifest is not None else None)

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    augmenter = None
    if augmentation is not None:
        sampling_rate = next(iter(audio_set)).sampling_rate
        assert all(rec.sampling_rate == sampling_rate for rec in audio_set), \
            "Wav augmentation effect chains expect all the recordings to have the same sampling rate at this time."
        augmenter = WavAugmenter.create_predefined(name=augmentation,
                                                   sampling_rate=sampling_rate)

    feature_set_builder = FeatureSetBuilder(
        feature_extractor=feature_extractor,
        output_dir=output_dir,
        root_dir=root_dir,
        augmenter=augmenter)
    feature_set_builder.process_and_store_recordings(
        recordings=audio_set,
        segmentation=None,  # TODO: implement and use
        compressed=compressed,
        lilcom_tick_power=lilcom_tick_power,
        num_jobs=num_jobs)
Example 12
def extract(recording_manifest: Pathlike, output_dir: Pathlike,
            augmentation: str, feature_manifest: Optional[Pathlike],
            storage_type: str, lilcom_tick_power: int,
            root_dir: Optional[Pathlike], num_jobs: int):
    """
    Extract features for recordings in a given RECORDING_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    storage_path = output_dir / 'feats.h5' if 'hdf5' in storage_type else output_dir / 'storage'

    augmenter = None
    if augmentation is not None:
        sampling_rate = next(iter(recordings)).sampling_rate
        assert all(rec.sampling_rate == sampling_rate for rec in recordings), \
            "Wav augmentation effect chains expect all the recordings to have the same sampling rate at this time."
        augmenter = WavAugmenter.create_predefined(name=augmentation,
                                                   sampling_rate=sampling_rate)

    with get_writer(storage_type)(storage_path,
                                  tick_power=lilcom_tick_power) as storage:
        feature_set_builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
            augmenter=augmenter)
        feature_set_builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs)
Example 13
def test_feature_set_builder(augmentation):
    audio_set = RecordingSet.from_json('test/fixtures/audio.json')
    augmenter = WavAugmenter.create_predefined(
        augmentation, sampling_rate=8000) if augmentation is not None else None
    with TemporaryDirectory() as output_dir:
        builder = FeatureSetBuilder(feature_extractor=Fbank(),
                                    output_dir=output_dir,
                                    augmenter=augmenter)
        feature_set = builder.process_and_store_recordings(
            recordings=audio_set)

    assert len(feature_set) == 6

    feature_infos = list(feature_set)

    # Assert the properties shared by all features
    for features in feature_infos:
        # assert that fbank is the default feature type
        assert features.type == 'fbank'
        # assert that num_frames is consistent with duration and frame_shift
        assert features.num_frames == round(features.duration /
                                            features.frame_shift)
        # assert that num_features is preserved
        assert features.num_features == builder.feature_extractor.config.num_mel_bins
        # assert that lilcom is the default storage type
        assert features.storage_type == 'lilcom'

    # Assert the properties for recordings of duration 0.5 seconds
    for features in feature_infos[:2]:
        assert features.num_frames == 50
        assert features.duration == 0.5

    # Assert the properties for recordings of duration 1.0 seconds
    for features in feature_infos[2:]:
        assert features.num_frames == 100
        assert features.duration == 1.0
Example 14
def test_add_feature_sets():
    expected = DummyManifest(FeatureSet, begin_id=0, end_id=10)
    feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5)
    feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10)
    combined = feature_set_1 + feature_set_2
    assert combined == expected


@pytest.mark.parametrize(
    ["feature_extractor", "decimal", "exception_expectation"],
    [
        (Fbank(FbankConfig(num_mel_bins=40)), 0, does_not_raise()),
        (Spectrogram(), -1, does_not_raise()),
        (Mfcc(), None, raises(ValueError)),
    ],
)
def test_mixer(feature_extractor, decimal, exception_expectation):
    # Treat it more like a test of "it runs" rather than "it works"
    sr = 8000
    t = np.linspace(0, 1, 8000, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)
    x2 = np.sin(55.0 * t).reshape(1, -1)

    f1 = feature_extractor.extract(x1, sr)
    f2 = feature_extractor.extract(x2, sr)
    with exception_expectation:
        mixer = FeatureMixer(
Example 15
def test_add_feature_sets():
    expected = DummyManifest(FeatureSet, begin_id=0, end_id=10)
    feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5)
    feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10)
    combined = feature_set_1 + feature_set_2
    assert combined == expected


@pytest.mark.parametrize(
    ["feature_extractor", "decimal", "exception_expectation"],
    [
        (Fbank(FbankConfig(num_filters=40,
                           sampling_rate=8000)), 0, does_not_raise()),
        (Spectrogram(), -1, does_not_raise()),
        (Mfcc(MfccConfig(sampling_rate=8000)), None, raises(ValueError)),
    ],
)
def test_mixer(feature_extractor, decimal, exception_expectation):
    # Treat it more like a test of "it runs" rather than "it works"
    sr = 8000
    t = np.linspace(0, 1, 8000, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)
    x2 = np.sin(55.0 * t).reshape(1, -1)

    f1 = feature_extractor.extract(x1, sr)
    f2 = feature_extractor.extract(x2, sr)
    with exception_expectation:
        mixer = FeatureMixer(
Example 16
def test_feature_extractor_generic_deserialization():
    fe = Fbank()
    with NamedTemporaryFile() as f:
        fe.to_yaml(f.name)
        fe_deserialized = FeatureExtractor.from_yaml(f.name)
    assert fe_deserialized.config == fe.config
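
A similar round trip presumably also works in memory, assuming FeatureExtractor exposes to_dict/from_dict alongside the to_yaml/from_yaml methods used above.

# Hedged sketch: dict-based round trip (assumes to_dict/from_dict are available).
from lhotse import Fbank, FeatureExtractor

fe = Fbank()
fe_restored = FeatureExtractor.from_dict(fe.to_dict())
assert fe_restored.config == fe.config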
Example 17
def test_add_feature_sets():
    expected = DummyManifest(FeatureSet, begin_id=0, end_id=10)
    feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5)
    feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10)
    combined = feature_set_1 + feature_set_2
    assert combined == expected


@pytest.mark.parametrize(
    ['feature_extractor', 'decimal', 'exception_expectation'], [
        (Fbank(), 0, does_not_raise()),
        (Spectrogram(), -1, does_not_raise()),
        (Mfcc(), None, raises(ValueError)),
    ])
def test_mixer(feature_extractor, decimal, exception_expectation):
    # Treat it more like a test of "it runs" rather than "it works"
    t = np.linspace(0, 1, 8000, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)
    x2 = np.sin(55.0 * t).reshape(1, -1)

    f1 = feature_extractor.extract(x1, 8000)
    f2 = feature_extractor.extract(x2, 8000)
    with exception_expectation:
        mixer = FeatureMixer(
            feature_extractor=feature_extractor,
            base_feats=f1,