def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import Features, MonoCut, Recording, SupervisionSegment
    from lhotse.array import deserialize_array
    from lhotse.cut import MixedCut

    if "shape" in data or "array" in data:
        return deserialize_array(data)
    if "sources" in data:
        return Recording.from_dict(data)
    if "num_features" in data:
        return Features.from_dict(data)
    if "type" not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop("type")
    if cut_type == "MonoCut":
        return MonoCut.from_dict(data)
    if cut_type == "Cut":
        warnings.warn(
            "Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. "
            "Please re-generate it with Lhotse v0.8 as it might stop working in a future version "
            "(using manifest.from_file() and then manifest.to_file() should be sufficient)."
        )
        return MonoCut.from_dict(data)
    if cut_type == "MixedCut":
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")

def cut_with_relative_paths():
    return Cut(
        'cut', 0, 10, 0,
        features=Features(
            type='fbank',
            num_frames=1000,
            num_features=40,
            sampling_rate=8000,
            storage_type='lilcom_files',
            storage_path='storage_dir',
            storage_key='feats.llc',
            start=0,
            duration=10,
        ),
        recording=Recording('rec', [AudioSource('file', [0], 'audio.wav')], 8000, 80000, 10.0),
    )

def _upload_one(item: Features, url: str) -> Features:
    # Load the feature matrix into memory, re-write it to URL-backed lilcom storage,
    # and return an updated manifest entry pointing at the new location.
    feats_mtx = item.load()
    feats_writer = LilcomURLWriter(url)
    new_key = feats_writer.write(key=item.storage_key, value=feats_mtx)
    return fastcopy(
        item,
        storage_path=url,
        storage_key=new_key,
        storage_type=feats_writer.name,
    )

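# A minimal usage sketch (not from the original source): the helper name
# `upload_feature_set`, the `feats` manifest, and `target_url` are hypothetical,
# assuming `_upload_one` above and a writable lilcom URL storage location.
from concurrent.futures import ProcessPoolExecutor

from lhotse import FeatureSet


def upload_feature_set(feats: FeatureSet, target_url: str, num_jobs: int = 4) -> FeatureSet:
    # Re-write every feature matrix to URL-backed storage in parallel and
    # collect the updated manifest entries into a new FeatureSet.
    with ProcessPoolExecutor(num_jobs) as ex:
        items = list(ex.map(_upload_one, feats, [target_url] * len(feats)))
    return FeatureSet.from_features(items)
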
def test_validate_features_consistent_num_frames_does_not_raise():
    manifest = Features(
        type='irrelevant',
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=1.0,
        storage_type='irrelevant',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    validate_features(manifest)

def test_validate_features_inconsistent_num_frames_raises():
    manifest = Features(
        type='irrelevant',
        num_frames=101,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=1.0,
        storage_type='irrelevant',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    with pytest.raises(AssertionError):
        validate_features(manifest)

def feature_set():
    return FeatureSet(features=[
        Features(recording_id='irrelevant', channels=0, start=0.0, duration=20.0,
                 type='fbank', num_frames=2000, num_features=20, frame_shift=0.01,
                 sampling_rate=16000, storage_type='lilcom',
                 storage_path='/irrelevant/', storage_key='path.llc')
    ])

def cut_set():
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type="fbank",
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type="lilcom",
            storage_path="irrelevant",
            storage_key="irrelevant",
        ),
        recording=Recording(
            id="rec-1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[
                AudioSource(type="file", channels=[0], source="irrelevant")
            ],
        ),
        supervisions=[
            SupervisionSegment(id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0),
            SupervisionSegment(id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0),
        ],
    )
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id="cut-nosup", supervisions=[]),
        fastcopy(cut, id="cut-norec", recording=None),
        fastcopy(cut, id="cut-nofeat", features=None),
        cut.pad(duration=30.0, direction="left"),
        cut.pad(duration=30.0, direction="right"),
        cut.pad(duration=30.0, direction="both"),
        cut.mix(cut, offset_other_by=5.0, snr=8),
    ])

def cut_set():
    cut = Cut(
        id='cut-1',
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type='fbank',
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type='lilcom',
            storage_path='irrelevant',
            storage_key='irrelevant',
        ),
        recording=Recording(
            id='rec-1',
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[
                AudioSource(type='file', channels=[0], source='irrelevant')
            ],
        ),
        supervisions=[
            SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
            SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
        ],
    )
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id='cut-nosup', supervisions=[]),
        fastcopy(cut, id='cut-norec', recording=None),
        fastcopy(cut, id='cut-nofeat', features=None),
        cut.pad(duration=30.0, direction='left'),
        cut.pad(duration=30.0, direction='right'),
        cut.pad(duration=30.0, direction='both'),
        cut.mix(cut, offset_other_by=5.0, snr=8),
    ])

def feature_set():
    return FeatureSet(features=[
        Features(
            recording_id="irrelevant",
            channels=0,
            start=0.0,
            duration=20.0,
            type="fbank",
            num_frames=2000,
            num_features=20,
            frame_shift=0.01,
            sampling_rate=16000,
            storage_type="lilcom",
            storage_path="/irrelevant/",
            storage_key="path.llc",
        )
    ])

def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import Cut, Features, Recording, SupervisionSegment
    from lhotse.cut import MixedCut

    data = arr2list_recursive(data)
    if 'sources' in data:
        return Recording.from_dict(data)
    if 'num_features' in data:
        return Features.from_dict(data)
    if 'type' not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop('type')
    if cut_type == 'Cut':
        return Cut.from_dict(data)
    if cut_type == 'MixedCut':
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")

def cut_with_relative_paths():
    return MonoCut(
        "cut", 0, 10, 0,
        features=Features(
            type="fbank",
            num_frames=1000,
            num_features=40,
            sampling_rate=8000,
            storage_type="lilcom_files",
            storage_path="storage_dir",
            storage_key="feats.llc",
            start=0,
            duration=10,
            frame_shift=0.01,
        ),
        recording=Recording("rec", [AudioSource("file", [0], "audio.wav")], 8000, 80000, 10.0),
    )

def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import MonoCut, Features, Recording, SupervisionSegment
    from lhotse.cut import MixedCut

    data = arr2list_recursive(data)
    if 'sources' in data:
        return Recording.from_dict(data)
    if 'num_features' in data:
        return Features.from_dict(data)
    if 'type' not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop('type')
    if cut_type == 'MonoCut':
        return MonoCut.from_dict(data)
    if cut_type == 'Cut':
        warnings.warn(
            'Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. '
            'Please re-generate it with Lhotse v0.8 as it might stop working in a future version '
            '(using manifest.from_file() and then manifest.to_file() should be sufficient).'
        )
        return MonoCut.from_dict(data)
    if cut_type == 'MixedCut':
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")

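# A minimal round-trip sketch (not from the original source; the objects below are
# illustrative only), assuming it runs in the module that defines deserialize_item
# and arr2list_recursive above: manifests serialized with to_dict() are mapped back
# to their original types by the key heuristics.
from lhotse import Recording, SupervisionSegment
from lhotse.audio import AudioSource

recording = Recording(
    id="rec-1",
    sources=[AudioSource(type="file", channels=[0], source="audio.wav")],
    sampling_rate=16000,
    num_samples=160000,
    duration=10.0,
)
supervision = SupervisionSegment(id="sup-1", recording_id="rec-1", start=0.0, duration=10.0)

# 'sources' in the dict -> Recording; no 'type'/'sources'/'num_features' -> SupervisionSegment.
assert isinstance(deserialize_item(recording.to_dict()), Recording)
assert isinstance(deserialize_item(supervision.to_dict()), SupervisionSegment)
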
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp is imported only when frame_shift
    is provided and the kaldiio package is installed.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=int(durations[recording_id] * sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=segment_id,
                recording_id=recording_id,
                start=float(start),
                duration=float(end) - float(start),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=speakers[segment_id],
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type='kaldiio',
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None
                    else utt_id,
                    channels=0,
                )
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must be not None. "
                "Feature import omitted."
            )

    return recording_set, supervision_set, feature_set

def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp is imported only when frame_shift
    is provided and the kaldi_native_io package is installed.

    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisionSegment IDs with underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    :param num_jobs: number of parallel jobs used to compute the recording durations.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith("|") else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [sup_string.strip().split() for sup_string in f]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate
                ),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            import kaldi_native_io
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None
                    else utt_id,
                    channels=0,
                )
                for utt_id, mat_shape in kaldi_native_io.SequentialMatrixShapeReader(
                    f"scp:{feats_scp}"
                )
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must be not None. "
                "Feature import omitted."
            )

    return recording_set, supervision_set, feature_set

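# A minimal usage sketch (not from the original source): the data dir path and output
# file names below are hypothetical; adjust the sampling rate and frame shift to your setup.
from lhotse import CutSet

recordings, supervisions, features = load_kaldi_data_dir(
    "data/train", sampling_rate=16000, frame_shift=0.01, num_jobs=4
)
recordings.to_file("recordings.jsonl.gz")
if supervisions is not None:
    supervisions.to_file("supervisions.jsonl.gz")
    # Optionally bind recordings and supervisions into cuts for downstream processing.
    cuts = CutSet.from_manifests(recordings=recordings, supervisions=supervisions)
    cuts.to_file("cuts.jsonl.gz")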