def test_feature_set_serialization(format, compressed):
    """Round-trip a single-item FeatureSet through JSON or YAML and compare.

    ``format`` selects the serializer ('json' or 'yaml'); ``compressed``
    makes the temporary file name end with '.gz'.
    """
    feature_set = FeatureSet(features=[
        Features(
            recording_id='irrelevant',
            channels=0,
            start=0.0,
            duration=20.0,
            type='fbank',
            num_frames=2000,
            num_features=20,
            sampling_rate=16000,
            storage_type='lilcom',
            storage_path='/irrelevant/',
            storage_key='path.llc',
        ),
    ])
    file_suffix = '.gz' if compressed else ''
    with NamedTemporaryFile(suffix=file_suffix) as tmp:
        manifest_path = tmp.name
        if format == 'json':
            feature_set.to_json(manifest_path)
            feature_set_deserialized = FeatureSet.from_json(manifest_path)
        elif format == 'yaml':
            feature_set.to_yaml(manifest_path)
            feature_set_deserialized = FeatureSet.from_yaml(manifest_path)
        # The restored manifest must compare equal to the one we wrote.
        assert feature_set_deserialized == feature_set
def random_mixed(
        supervision_manifest: Pathlike,
        feature_manifest: Pathlike,
        output_cut_manifest: Pathlike,
        snr_range: Tuple[float, float],
        offset_range: Tuple[float, float],
):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST that contains supervision regions from SUPERVISION_MANIFEST
    and features supplied by FEATURE_MANIFEST. It first creates a trivial CutSet, splits it into two equal, randomized
    parts and mixes their features.
    The parameters of the mix are controlled via SNR_RANGE and OFFSET_RANGE.
    """
    # Load both input manifests (supervisions first, then features).
    supervisions = SupervisionSet.from_json(supervision_manifest)
    features = FeatureSet.from_json(feature_manifest)

    # Build the source cuts and divide them into two shuffled halves.
    all_cuts = CutSet.from_manifests(supervisions=supervisions, features=features)
    left_cuts, right_cuts = all_cuts.split(num_splits=2, shuffle=True)

    # One SNR and one relative offset per left cut; SNRs are drawn first.
    num_left = len(left_cuts)
    snrs = np.random.uniform(*snr_range, size=num_left).tolist()
    relative_offsets = np.random.uniform(*offset_range, size=num_left).tolist()

    def mixed_pairs():
        # Lazily mix each left cut with its right counterpart; the offset is a
        # fraction of the left cut's duration.
        for left, right, snr, rel_offset in zip(left_cuts, right_cuts, snrs, relative_offsets):
            yield left.mix(right, offset_other_by=left.duration * rel_offset, snr=snr)

    CutSet.from_cuts(mixed_pairs()).to_json(output_cut_manifest)
def test_feature_set_serialization():
    """Write a FeatureSet to YAML and check that reading it back gives an equal object."""
    extractor = FeatureExtractor()
    sole_item = Features(
        recording_id='irrelevant',
        channel_id=0,
        start=0.0,
        duration=20.0,
        type='fbank',
        num_frames=2000,
        num_features=20,
        storage_type='lilcom',
        storage_path='/irrelevant/path.llc',
    )
    feature_set = FeatureSet(feature_extractor=extractor, features=[sole_item])
    with NamedTemporaryFile() as tmp:
        yaml_path = tmp.name
        feature_set.to_yaml(yaml_path)
        restored = FeatureSet.from_yaml(yaml_path)
        assert restored == feature_set
def DummyManifest(type_: Type, *, begin_id: int, end_id: int) -> Manifest:
    """
    Build a manifest of class ``type_`` filled with dummy items.

    One dummy item is created for every index in ``range(begin_id, end_id)``
    (``end_id`` is exclusive), using ``dummy_recording``, ``dummy_supervision``
    or ``dummy_features`` depending on the requested manifest class.

    :param type_: the manifest class to build: ``RecordingSet``,
        ``SupervisionSet`` or ``FeatureSet``.
    :param begin_id: first index passed to the dummy item factory.
    :param end_id: index at which item generation stops (exclusive).
    :return: the manifest populated with the dummy items.
    :raises ValueError: when ``type_`` is none of the supported manifest classes
        (previously the function fell through and silently returned ``None``).
    """
    indices = range(begin_id, end_id)
    if type_ == RecordingSet:
        return RecordingSet.from_recordings(dummy_recording(idx) for idx in indices)
    if type_ == SupervisionSet:
        return SupervisionSet.from_segments(dummy_supervision(idx) for idx in indices)
    if type_ == FeatureSet:
        # noinspection PyTypeChecker
        return FeatureSet.from_features(dummy_features(idx) for idx in indices)
    # Fail loudly instead of handing ``None`` to a caller that expects a Manifest.
    raise ValueError(f"Unknown type of manifest: {type_}")
def dummy_feature_set_lazy():
    """Yield a FeatureSet opened lazily from a temporary ``.jsonl.gz`` manifest.

    The temporary file stays open while the generator is suspended at the
    ``yield`` and is cleaned up when the generator is resumed or closed.
    """
    with NamedTemporaryFile(suffix=".jsonl.gz") as tmp:
        single_item = Features(
            recording_id="rec1",
            channels=0,
            start=0,
            duration=10,
            type="fbank",
            num_frames=1000,
            num_features=23,
            sampling_rate=16000,
            storage_type="lilcom_files",
            storage_path="feats",
            storage_key="dummy.llc",
            frame_shift=0.01,
        )
        eager_set = FeatureSet.from_features([single_item])
        manifest_path = tmp.name
        eager_set.to_file(manifest_path)
        tmp.flush()
        yield FeatureSet.from_jsonl_lazy(manifest_path)
def dummy_feature_set():
    """Return a FeatureSet holding one 10-second 'fbank' Features entry for 'rec1'."""
    only_item = Features(
        recording_id='rec1',
        channels=0,
        start=0,
        duration=10,
        type='fbank',
        num_frames=1000,
        num_features=23,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='dummy.llc',
    )
    return FeatureSet.from_features([only_item])
def test_feature_set_prefix_path():
    """Check that ``with_path_prefix`` prepends the prefix to each item's ``storage_path``."""
    item = Features(
        type='fbank',
        num_frames=1000,
        num_features=40,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='feats/12345.llc',
        start=0,
        duration=10,
    )
    features = FeatureSet.from_features([item])
    prefixed = features.with_path_prefix('/data')
    for prefixed_item in prefixed:
        assert prefixed_item.storage_path == '/data/feats/12345.llc'
def windowed(
        feature_manifest: Pathlike,
        output_cut_manifest: Pathlike,
        cut_duration: float,
        cut_shift: Optional[float],
        keep_shorter_windows: bool,
):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST from feature regions in FEATURE_MANIFEST.
    The feature matrices are traversed in windows with CUT_SHIFT increments, creating cuts of constant CUT_DURATION.
    """
    # Read the features, cut them into windows, and write the resulting cuts out.
    windowing_options = dict(
        cut_duration=cut_duration,
        cut_shift=cut_shift,
        keep_shorter_windows=keep_shorter_windows,
    )
    loaded_features = FeatureSet.from_json(feature_manifest)
    windowed_cuts = make_windowed_cuts_from_features(feature_set=loaded_features, **windowing_options)
    windowed_cuts.to_json(output_cut_manifest)
def split(manifest: Manifest, num_splits: int, randomize: bool = False) -> List[Manifest]:
    """
    Split a manifest into ``num_splits`` parts of (nearly) equal size.

    Every returned part is non-empty and part sizes differ by at most one item:
    when the number of items is not divisible by ``num_splits``, the first
    ``num_items % num_splits`` parts receive one extra item. (The previous
    ceil-based chunking could return empty trailing parts, e.g. 5 items split
    into 4 parts produced sizes 2, 2, 1, 0.) When the item count is divisible
    by ``num_splits`` the result is the same as before.

    :param manifest: a ``RecordingSet``, ``SupervisionSet``, ``FeatureSet`` or ``CutSet``.
    :param num_splits: how many parts to produce; must be between 1 and ``len(manifest)``.
    :param randomize: when ``True``, the items are shuffled with ``random.shuffle``
        before being divided into parts.
    :return: a list of ``num_splits`` manifests of the same class as ``manifest``.
    :raises ValueError: when ``num_splits`` is smaller than 1 or larger than the
        number of items, or when ``manifest`` is of an unknown type.
    """
    num_items = len(manifest)
    if num_splits < 1:
        raise ValueError(f"Cannot split manifest into {num_splits} chunks: num_splits must be at least 1")
    if num_splits > num_items:
        raise ValueError(
            f"Cannot split manifest into more chunks ({num_splits}) than its number of items {num_items}"
        )

    # Spread the remainder over the leading parts so that no part ends up empty.
    base_size, remainder = divmod(num_items, num_splits)
    split_indices = []
    begin = 0
    for part_idx in range(num_splits):
        end = begin + base_size + (1 if part_idx < remainder else 0)
        split_indices.append((begin, end))
        begin = end

    def maybe_randomize(items: Iterable[Any]) -> List[Any]:
        # Materialize the items so they can be shuffled in place and sliced.
        items = list(items)
        if randomize:
            random.shuffle(items)
        return items

    if isinstance(manifest, RecordingSet):
        contents = maybe_randomize(manifest.recordings.items())
        return [
            RecordingSet(recordings=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    if isinstance(manifest, SupervisionSet):
        contents = maybe_randomize(manifest.segments.items())
        return [
            SupervisionSet(segments=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    if isinstance(manifest, FeatureSet):
        contents = maybe_randomize(manifest.features)
        # Every part shares the feature extractor of the source manifest.
        return [
            FeatureSet(features=contents[begin:end], feature_extractor=manifest.feature_extractor)
            for begin, end in split_indices
        ]

    if isinstance(manifest, CutSet):
        contents = maybe_randomize(manifest.cuts.items())
        return [
            CutSet(cuts=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    raise ValueError(f"Unknown type of manifest: {type(manifest)}")
def test_load_features(
        recording_id: str,
        channel: int,
        start: float,
        duration: float,
        exception_expectation,
        expected_num_frames: Optional[float],
):
    """Load a feature region from the JSON fixture and check the returned matrix.

    ``exception_expectation`` is a context manager wrapping the load and the
    shape checks, so the checks only run when the load itself succeeds.
    """
    manifest_path = 'test/fixtures/dummy_feats/feature_manifest.json'
    feature_set = FeatureSet.from_json(manifest_path)
    with exception_expectation:
        features = feature_set.load(
            recording_id,
            channel_id=channel,
            start=start,
            duration=duration,
        )
        # The result has to be two-dimensional...
        num_dims = len(features.shape)
        assert num_dims == 2
        # ...with time (frames) along the first axis.
        num_frames = features.shape[0]
        assert num_frames == expected_num_frames
def simple(
        feature_manifest: Pathlike,
        output_cut_manifest: Pathlike,
        supervision_manifest: Optional[Pathlike],
):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST that contains the regions and features supplied by FEATURE_MANIFEST.
    Optionally it can use a SUPERVISION_MANIFEST to select the regions and attach the corresponding supervisions to
    the cuts. This is the simplest way to create Cuts.
    """
    loaded_features = FeatureSet.from_yaml(feature_manifest)
    if supervision_manifest is not None:
        # Supervisions given: they select the regions and get attached to the cuts.
        loaded_supervisions = SupervisionSet.from_yaml(supervision_manifest)
        cuts = make_cuts_from_supervisions(
            feature_set=loaded_features,
            supervision_set=loaded_supervisions,
        )
    else:
        # No supervisions: build the cuts from the feature regions alone.
        cuts = make_cuts_from_features(loaded_features)
    cuts.to_yaml(output_cut_manifest)
def dummy_feature_set():
    """Return a FeatureSet with one 'fbank' Features entry for 'rec1' stored under 'feats'."""
    item_fields = dict(
        recording_id="rec1",
        channels=0,
        start=0,
        duration=10,
        type="fbank",
        num_frames=1000,
        num_features=23,
        sampling_rate=16000,
        storage_type="lilcom_files",
        storage_path="feats",
        storage_key="dummy.llc",
        frame_shift=0.01,
    )
    return FeatureSet.from_features([Features(**item_fields)])
def test_feature_set_prefix_path():
    """Check the ``storage_path`` of every item after applying ``with_path_prefix("/data")``."""
    item = Features(
        type="fbank",
        num_frames=1000,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        storage_type="lilcom",
        storage_path="feats/",
        storage_key="12345.llc",
        start=0,
        duration=10,
    )
    features = FeatureSet.from_features([item])
    prefixed = features.with_path_prefix("/data")
    for prefixed_item in prefixed:
        # The expected path carries the prefix and no trailing slash.
        assert prefixed_item.storage_path == "/data/feats"
def make_cuts_from_supervisions(supervision_set: SupervisionSet, feature_set: FeatureSet) -> CutSet:
    """
    Utility that converts a SupervisionSet to a CutSet without any adjustment of the segment boundaries.
    It attaches the relevant features from the corresponding FeatureSet.

    One cut is created per supervision. Each cut gets a freshly generated
    ``uuid4`` string as its id, takes ``start`` and ``duration`` from the
    supervision, and carries that supervision as its only one. The features are
    looked up with ``feature_set.find`` using the supervision's recording id,
    channel id, start and duration.

    :param supervision_set: the supervisions to turn into cuts.
    :param feature_set: the FeatureSet queried for the matching features.
    :return: a CutSet built from the generated cuts.
    """
    # Iterate the supervisions directly; the index that ``enumerate`` used to
    # supply here was never used.
    return CutSet.from_cuts(
        Cut(
            id=str(uuid4()),
            start=supervision.start,
            duration=supervision.duration,
            features=feature_set.find(
                recording_id=supervision.recording_id,
                channel_id=supervision.channel_id,
                start=supervision.start,
                duration=supervision.duration,
            ),
            supervisions=[supervision],
        )
        for supervision in supervision_set
    )
def test_compute_global_stats():
    """Check ``FeatureSet.compute_global_stats`` against a direct NumPy computation.

    Verifies the shape of the returned means/stds, their agreement with
    ``np.mean``/``np.std`` over all concatenated feature matrices, and that the
    stats pickled to ``storage_path`` equal the returned ones.
    """
    feature_set = FeatureSet.from_json('test/fixtures/dummy_feats/feature_manifest.json')
    with NamedTemporaryFile() as f:
        stats = feature_set.compute_global_stats(storage_path=f.name)
        f.flush()
        # The stats were written by this very test, so unpickling them is safe here.
        read_stats = pickle.load(f)
    # Post-condition 1: feature dim is consistent
    assert stats['norm_means'].shape == (feature_set[0].num_features,)
    assert stats['norm_stds'].shape == (feature_set[0].num_features,)
    # Post-condition 2: the iterative method yields very close results to
    # the "standard" method.
    # Load and concatenate the matrices once and reuse them for both statistics;
    # the loop variable is named so it does not shadow the temp-file handle ``f``.
    all_frames = np.concatenate([feats.load() for feats in feature_set])
    true_means = np.mean(all_frames, axis=0)
    true_stds = np.std(all_frames, axis=0)
    np.testing.assert_almost_equal(stats['norm_means'], true_means, decimal=5)
    np.testing.assert_almost_equal(stats['norm_stds'], true_stds, decimal=5)
    # Post-condition 3: the serialization works correctly
    assert (stats['norm_means'] == read_stats['norm_means']).all()
    assert (stats['norm_stds'] == read_stats['norm_stds']).all()
def test_load_features(recording_id: str, channel: int, start: float, duration: float,
                       exception_expectation):
    """Load a feature region from the YAML fixture and sanity-check its shape.

    ``exception_expectation`` is a context manager wrapping the load and all
    the checks, so the checks only run when the load itself succeeds. The
    number of frames is compared with ``duration / frame_shift`` within a
    tolerance of two frames; the second dimension must equal the extractor's
    configured number of mel bins.
    """
    # just test that it loads
    feature_set = FeatureSet.from_yaml('test/fixtures/dummy_feats/feature_manifest.yml')
    with exception_expectation:
        features = feature_set.load(recording_id, channel_id=channel, start=start, duration=duration)
        # expect a matrix
        assert len(features.shape) == 2
        # expect time as the first dimension
        frame_shift = feature_set.feature_extractor.spectrogram_config.frame_shift
        if duration is not None:
            # left-hand expression ignores the frame_length - "maximize" the number of frames retained
            # also, allow a lee-way of +/- 2 frames (an exact float comparison
            # would contradict this and break on any rounding difference)
            expected_num_frames = duration / frame_shift
            assert abs(expected_num_frames - features.shape[0]) <= 2
        # expect frequency as the second dimension
        assert feature_set.feature_extractor.mfcc_fbank_common_config.num_mel_bins == features.shape[1]
def test_load_features_with_default_arguments():
    """Load 'recording-1' from the JSON fixture with defaults and check the matrix shape."""
    manifest_path = "test/fixtures/dummy_feats/feature_manifest.json"
    loaded = FeatureSet.from_json(manifest_path).load("recording-1")
    expected_shape = (50, 23)
    assert loaded.shape == expected_shape
def test_load_features_with_default_arguments():
    """Smoke test: loading 'recording-1' from the YAML fixture with defaults must not raise."""
    manifest_path = 'test/fixtures/dummy_feats/feature_manifest.yml'
    manifest = FeatureSet.from_yaml(manifest_path)
    # Only the call itself is exercised; the result is not inspected.
    manifest.load('recording-1')
def libri_features_set():
    """Return the FeatureSet read from the gzipped JSON fixture under test/fixtures/libri."""
    manifest_path = 'test/fixtures/libri/feature_manifest.json.gz'
    loaded = FeatureSet.from_json(manifest_path)
    return loaded
def feature_set():
    """Return a FeatureSet with entries for 'rec-1' (0.0, 600.0) and 'rec-2' (0.0, 357.0)."""
    first = features("rec-1", 0.0, 600.0)
    second = features("rec-2", 0.0, 357.0)
    return FeatureSet(features=[first, second])
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to Lhotse manifests: a RecordingSet
    and, when possible, a SupervisionSet and a FeatureSet.
    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    FeatureSet is created only when a feats.scp file exists, the ``kaldi_native_io``
    module is available and ``frame_shift`` is provided; when only ``frame_shift``
    is missing, a warning is issued and the feature import is skipped.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet.

    :param path: path to the Kaldi data directory; it is asserted to be a directory.
    :param sampling_rate: the sampling rate assigned to every created Recording and
        Features item; it is also passed to ``compute_num_samples`` and ``add_durations``.
    :param frame_shift: frame shift assigned to the imported features; the duration of
        each Features item is computed as ``num_rows * frame_shift``. When ``None``,
        feats.scp is not imported.
    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisonSegment IDs to underscores.
        This is to help with handling underscores in Kaldi (see :func:`.export_to_kaldi`).
        This is also done for speaker IDs.
    :param num_jobs: passed to ``ProcessPoolExecutor``, which runs ``get_duration``
        over the wav.scp entries.
    :return: a tuple of ``(recording_set, supervision_set, feature_set)``; the last
        two elements are ``None`` when they could not be created.
    """
    path = Path(path)
    # NOTE(review): ``assert`` statements are skipped when Python runs with ``-O``.
    assert path.is_dir()

    # Applies the optional ``map_string_to_underscores`` replacement to an ID.
    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    # Compute the duration of every wav.scp entry using ``num_jobs`` processes.
    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    # A wav.scp value ending with "|" is stored as a "command" source with the
    # trailing "|" removed; any other value is stored as a "file" source.
    # Every recording gets a single source with channel 0.
    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith("|") else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        # Each line is split on whitespace and later unpacked as
        # (segment_id, recording_id, start, end).
        with segments.open() as f:
            supervision_segments = [
                sup_string.strip().split() for sup_string in f
            ]

        # NOTE(review): these mappings are indexed directly below for every segment;
        # presumably load_kaldi_text_mapping returns a mapping with a default value
        # when the file is missing - confirm against its implementation.
        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        # Segment and speaker IDs go through fix_id; the text/language/speaker/gender
        # lookups use the original (unfixed) IDs as keys.
        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                # end and -start are combined via add_durations to get the duration
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            # Imported locally, only after the availability check above has passed.
            import kaldi_native_io
            from lhotse.features.io import KaldiReader
            # One Features item per (utt_id, mat_shape) pair yielded by the reader.
            # The recording_id is taken from the matching supervision (looked up by
            # the fixed utterance ID) when supervisions exist, otherwise the
            # utterance ID itself is used.
            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[fix_id(utt_id)].recording_id
                    if supervision_set is not None else utt_id,
                    channels=0,
                )
                for utt_id, mat_shape in kaldi_native_io.SequentialMatrixShapeReader(f"scp:{feats_scp}"))
        else:
            # feats.scp is present but cannot be imported without a frame shift.
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must be not None. "
                          "Feature import omitted.")

    return recording_set, supervision_set, feature_set
def feature_set():
    """Return a FeatureSet with entries for 'rec-1' (0.0, 600.0) and 'rec-2' (0.0, 357.0)."""
    rec1_features = features('rec-1', 0.0, 600.0)
    rec2_features = features('rec-2', 0.0, 357.0)
    items = [rec1_features, rec2_features]
    return FeatureSet(features=items)
def feature_set():
    """Return a FeatureSet with a default FeatureExtractor and entries for 'rec-1' and 'rec-2'."""
    # Build the extractor first, then the items, keeping the original evaluation order.
    extractor = FeatureExtractor()
    items = [
        features('rec-1', 0.0, 600.0),
        features('rec-2', 0.0, 357.0),
    ]
    return FeatureSet(feature_extractor=extractor, features=items)