def prepare_musan(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ("music", "speech", "noise"),
    use_vocals: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the MUSAN corpus.

    :param corpus_dir: Root directory of the extracted MUSAN corpus.
    :param output_dir: If given, each manifest is also written to
        ``musan_{key}_{part}.jsonl.gz`` inside this directory.
    :param parts: Which corpus parts to prepare; any subset of
        ("music", "speech", "noise"). A single string is also accepted.
    :param use_vocals: Forwarded to the music part preparation.
    :return: Nested dict mapping part name -> manifest kind -> manifest.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if not parts:
        raise ValueError("No MUSAN parts specified for manifest preparation.")
    if isinstance(parts, str):
        # Allow passing a single part name for convenience.
        parts = [parts]

    manifests = {}
    if "music" in parts:
        manifests["music"] = prepare_music(corpus_dir, use_vocals=use_vocals)
        validate_recordings_and_supervisions(**manifests["music"])
    if "speech" in parts:
        manifests["speech"] = {"recordings": scan_recordings(corpus_dir / "speech")}
        validate(manifests["speech"]["recordings"])
    if "noise" in parts:
        manifests["noise"] = {"recordings": scan_recordings(corpus_dir / "noise")}
        validate(manifests["noise"]["recordings"])

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, part_manifests in manifests.items():
            for key, manifest in part_manifests.items():
                manifest.to_file(output_dir / f"musan_{key}_{part}.jsonl.gz")
    return manifests
def _validate(self) -> None:
    """Assert that every supervision lies within its cut (1e-5 s slack)."""
    validate(self.cuts)
    eps = 1e-5
    for cut in self.cuts:
        for supervision in cut.supervisions:
            # A supervision may not begin before the cut nor end after it.
            in_bounds = (
                (cut.start - eps)
                <= supervision.start
                <= supervision.end
                <= (cut.end + eps)
            )
            assert in_bounds, \
                f"Cutting in the middle of a supervision is currently not supported for the ASR task. " \
                f"Cut ID violating the pre-condition: '{cut.id}'"
def prepare_musan(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ('music', 'speech', 'noise'),
    use_vocals: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare MUSAN manifests, optionally persisting them as JSON files.

    :param corpus_dir: Root of the extracted MUSAN corpus.
    :param output_dir: If provided, each manifest is written to
        ``{key}_{part}.json`` in this directory.
    :param parts: Subset of ("music", "speech", "noise") to prepare;
        a single string is also accepted.
    :param use_vocals: Forwarded to the music part preparation.
    :return: Nested dict: part -> manifest kind -> manifest.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if not parts:
        raise ValueError("No MUSAN parts specified for manifest preparation.")
    if isinstance(parts, str):
        parts = [parts]

    manifests = {}
    if 'music' in parts:
        music = prepare_music(corpus_dir, use_vocals=use_vocals)
        validate_recordings_and_supervisions(**music)
        manifests['music'] = music
    if 'speech' in parts:
        speech_recordings = scan_recordings(corpus_dir / 'speech')
        validate(speech_recordings)
        manifests['speech'] = {'recordings': speech_recordings}
    if 'noise' in parts:
        noise_recordings = scan_recordings(corpus_dir / 'noise')
        validate(noise_recordings)
        manifests['noise'] = {'recordings': noise_recordings}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, part_manifests in manifests.items():
            for key, manifest in part_manifests.items():
                manifest.to_json(output_dir / f'{key}_{part}.json')
    return manifests
def __init__(
    self,
    cuts: CutSet,
    cut_transforms: List[Callable[[CutSet], CutSet]] = None,
    feature_input_strategy: InputStrategy = PrecomputedFeatures(),
    feature_transforms: Union[Sequence[Callable], Callable] = None,
    add_eos: bool = True,
    add_bos: bool = True,
) -> None:
    """
    Validate the cuts, build the token collater, and normalize the
    transform arguments into lists.

    :param cuts: CutSet whose cuts must each carry exactly one supervision.
    :param cut_transforms: optional list of CutSet -> CutSet callables.
    :param feature_input_strategy: how features are obtained for each batch.
    :param feature_transforms: one callable or a sequence of callables
        applied to the extracted features.
    :param add_eos: forwarded to the token collater.
    :param add_bos: forwarded to the token collater.
    """
    super().__init__()
    validate(cuts)
    for cut in cuts:
        assert (
            len(cut.supervisions) == 1
        ), "Only the Cuts with single supervision are supported."
    self.cuts = cuts
    self.token_collater = TokenCollater(cuts, add_eos=add_eos, add_bos=add_bos)
    self.cut_transforms = ifnone(cut_transforms, [])
    self.feature_input_strategy = feature_input_strategy
    # Accept None, a single callable, or a sequence of callables.
    if feature_transforms is None:
        feature_transforms = []
    elif not isinstance(feature_transforms, Sequence):
        feature_transforms = [feature_transforms]
    assert all(
        isinstance(transform, Callable) for transform in feature_transforms
    ), "Feature transforms must be Callable"
    self.feature_transforms = feature_transforms
def test_cut_set_batch_feature_extraction_resume(cut_set, overwrite):
    # This test checks that we can keep writing to the same file
    # and the previously written results are not lost.
    # Since we don't have an easy way to interrupt the execution in a test,
    # we just write another CutSet to the same file.
    # The effect is the same.
    extractor = Fbank()
    cut_set = cut_set.resample(16000)
    subsets = cut_set.split(num_splits=2)
    processed = []
    with NamedTemporaryFile() as feat_f, \
            NamedTemporaryFile(suffix=".jsonl.gz") as manifest_f:
        for cuts in subsets:
            result = cuts.compute_and_store_features_batch(
                extractor=extractor,
                storage_path=feat_f.name,
                manifest_path=manifest_f.name,
                num_workers=0,
                overwrite=overwrite,
            )
            processed.append(result)
        feat_f.flush()
        manifest_f.flush()
        merged = load_manifest(manifest_f.name)
        # With overwrite, only the last written subset survives;
        # without it, the manifest accumulates every subset.
        if overwrite:
            assert list(merged.ids) == list(subsets[-1].ids)
        else:
            assert list(merged.ids) == list(cut_set.ids)
        validate(merged, read_data=True)
def validate(self):
    """Validate both manifests and check every mixture resolves its sources."""
    validate(self.sources_set)
    validate(self.mixtures_set)
    # Make sure it's possible to iterate through the whole dataset
    # and resolve the sources for each mixture.
    for cut in self.mixtures_set.mixed_cuts.values():
        _, source_cuts = self._obtain_mixture(cut.id)
        assert len(source_cuts) > 1
def __init__(
    self,
    cuts: CutSet,
):
    """Validate and store the cut set; cache the list of cut IDs."""
    super().__init__()
    validate(cuts)
    self.cuts = cuts
    self.cut_ids = list(cuts.ids)
def test_cut_set_batch_feature_extraction(cut_set, extractor_type):
    # Batch feature extraction into a temporary storage file should
    # produce a cut set that passes full validation (including data reads).
    extractor = extractor_type()
    resampled = cut_set.resample(16000)
    with NamedTemporaryFile() as tmpf:
        cuts_with_feats = resampled.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=tmpf.name,
            num_workers=0,
        )
        validate(cuts_with_feats, read_data=True)
def _validate(self) -> None:
    """Reject cuts whose supervisions extend beyond the cut (ASR precondition)."""
    validate(self.cuts)
    tol = 1e-3  # 1ms
    for cut in self.cuts:
        for supervision in cut.supervisions:
            assert supervision.start >= -tol, (
                f"Supervisions starting before the cut are not supported for ASR"
                f" (sup id: {supervision.id}, cut id: {cut.id})"
            )
            assert supervision.duration <= cut.duration + tol, (
                f"Supervisions ending after the cut "
                f"are not supported for ASR"
                f" (sup id: {supervision.id}, cut id: {cut.id})"
            )
def test_extract_and_store_features_from_mixed_cut(cut, mix_eagerly):
    # Appending a cut to itself yields a mixed cut twice as long;
    # feature extraction on it must validate and have the expected shape.
    mixed_cut = cut.append(cut)
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        cut_with_feats = mixed_cut.compute_and_store_features(
            extractor=extractor,
            storage=storage,
            mix_eagerly=mix_eagerly,
        )
        validate(cut_with_feats)
        arr = cut_with_feats.load_features()
        assert arr.shape[0] == 200
        assert arr.shape[1] == extractor.feature_dim(mixed_cut.sampling_rate)
def validate_for_asr(cuts: CutSet) -> None:
    """
    Check that ``cuts`` are usable for ASR training.

    Runs the generic manifest validation, then asserts that no supervision
    starts before its cut or ends after it (beyond a small tolerance).

    :param cuts: the CutSet to check.
    :raises AssertionError: when a supervision violates the cut bounds.
    """
    validate(cuts)
    tol = 2e-3  # 2ms tolerance (the previous comment said "1ms", which contradicted the value)
    for cut in cuts:
        for supervision in cut.supervisions:
            assert supervision.start >= -tol, (
                f"Supervisions starting before the cut are not supported for ASR"
                f" (sup id: {supervision.id}, cut id: {cut.id})")
            assert supervision.duration <= cut.duration + tol, (
                f"Supervisions ending after the cut "
                f"are not supported for ASR"
                f" (sup id: {supervision.id}, cut id: {cut.id})")
def __init__(
    self,
    cuts: CutSet,
    input_strategy: InputStrategy = PrecomputedFeatures(),
    cut_transforms: Sequence[Callable[[CutSet], CutSet]] = None,
    input_transforms: Sequence[Callable[[torch.Tensor], torch.Tensor]] = None
) -> None:
    """
    Validate and store the cuts, the input strategy, and the (possibly
    empty) transform pipelines.
    """
    super().__init__()
    validate(cuts)
    self.cuts = cuts
    self.input_strategy = input_strategy
    # ``ifnone`` substitutes an empty pipeline when no transforms are given.
    self.cut_transforms = ifnone(cut_transforms, [])
    self.input_transforms = ifnone(input_transforms, [])
def __getitem__(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
    """
    Build one batch from ``cuts``: sort by duration, apply the cut and
    input transform pipelines, and return inputs plus voice-activity masks.
    """
    validate(cuts)
    cuts = cuts.sort_by_duration()
    for transform in self.cut_transforms:
        cuts = transform(cuts)
    inputs, input_lens = self.input_strategy(cuts)
    for transform in self.input_transforms:
        inputs = transform(inputs)
    batch = {
        "inputs": inputs,
        "input_lens": input_lens,
        "is_voice": self.input_strategy.supervision_masks(cuts),
        "cut": cuts,
    }
    return batch
def __init__(
    self,
    cuts: CutSet,
    min_speaker_dim: Optional[int] = None,
    global_speaker_ids: bool = False,
) -> None:
    """
    Store the cuts and, when ``global_speaker_ids`` is set, build a
    corpus-wide speaker -> index mapping; otherwise leave it as None.
    """
    super().__init__()
    validate(cuts)
    self.cuts = cuts
    if global_speaker_ids:
        self.speakers = {
            spk: idx for idx, spk in enumerate(self.cuts.speakers)
        }
    else:
        self.speakers = None
    self.min_speaker_dim = min_speaker_dim
def test_cut_set_batch_feature_extraction_manifest_path(
        cut_set, suffix, exception_expectation):
    # Depending on the manifest suffix, extraction either succeeds or
    # raises — ``exception_expectation`` encodes which outcome is expected.
    extractor = Fbank()
    cut_set = cut_set.resample(16000)
    with NamedTemporaryFile() as feat_f, \
            NamedTemporaryFile(suffix=suffix) as manifest_f:
        with exception_expectation:
            cut_set_with_feats = cut_set.compute_and_store_features_batch(
                extractor=extractor,
                storage_path=feat_f.name,
                manifest_path=manifest_f.name,
                num_workers=0,
            )
            validate(cut_set_with_feats, read_data=True)
def __init__(self, cuts: CutSet, root_dir: Optional[Pathlike] = None):
    """
    Validate the cuts and build a character-level token inventory.

    :param cuts: CutSet where each cut must carry exactly one supervision;
        the supervision text is split into characters to form tokens.
    :param root_dir: optional base directory (stored as a Path when given).
    :raises AssertionError: if any cut has more than one supervision.
    """
    super().__init__()
    validate(cuts)
    self.cuts = cuts
    self.root_dir = Path(root_dir) if root_dir else None
    self.cut_ids = list(self.cuts.ids)
    # generate tokens from text
    self.id_to_token = {}
    self.token_set = set()
    for cut in cuts:
        assert len(
            cut.supervisions
        ) == 1, 'Only the Cuts with single supervision are supported.'
        characters = list(cut.supervisions[0].text)
        self.token_set.update(set(characters))
        self.id_to_token[cut.id] = characters
    # BUG FIX: the original read ``self.tokens``, an attribute that was
    # never assigned, so this line raised AttributeError. The collected
    # characters live in ``self.token_set``; freeze it as a sorted list.
    self.token_set = sorted(self.token_set)
def __init__(
    self,
    cuts: CutSet,
    uem: Optional[SupervisionSet] = None,
    min_speaker_dim: Optional[int] = None,
    global_speaker_ids: bool = False,
) -> None:
    """
    Store the cuts, optionally restricted to UEM-scored regions.

    :param cuts: input CutSet.
    :param uem: optional "un-partitioned evaluation map" supervisions;
        when given, each cut's supervisions are clipped to the regions
        covered by the UEM segments for that cut.
    :param min_speaker_dim: stored as-is for later use by the dataset.
    :param global_speaker_ids: when True, build a corpus-wide
        speaker -> index mapping over ``self.cuts``.
    """
    super().__init__()
    validate(cuts)
    if not uem:
        self.cuts = cuts
    else:
        # We use the `overlap` method in intervaltree to get overlapping regions
        # between the supervision segments and the UEM segments
        recordings = RecordingSet(
            {c.recording.id: c.recording for c in cuts if c.has_recording})
        # Index the UEM segments per cut by wrapping them in a temporary CutSet.
        uem_intervals = CutSet.from_manifests(
            recordings=recordings,
            supervisions=uem,
        ).index_supervisions()
        supervisions = []
        for cut_id, tree in cuts.index_supervisions().items():
            if cut_id not in uem_intervals:
                # No UEM entry for this cut: keep all its supervisions as-is.
                supervisions += [it.data for it in tree]
                continue
            # A set comprehension deduplicates supervisions that overlap
            # multiple UEM segments.
            # NOTE(review): ``trim(it.end, start=it.begin)`` appears to clip
            # the supervision to its own interval rather than to the UEM
            # segment's bounds — confirm against SupervisionSegment.trim.
            supervisions += {
                it.data.trim(it.end, start=it.begin)
                for uem_it in uem_intervals[cut_id]
                for it in tree.overlap(begin=uem_it.begin, end=uem_it.end)
            }
        self.cuts = CutSet.from_manifests(
            recordings=recordings,
            supervisions=SupervisionSet.from_segments(supervisions),
        )
    self.speakers = ({
        spk: idx for idx, spk in enumerate(self.cuts.speakers)
    } if global_speaker_ids else None)
    self.min_speaker_dim = min_speaker_dim
def test_validate_cut_with_temporal_array(caplog):
    # Note: "caplog" is a special variable in pytest that captures logs.
    caplog.set_level(logging.WARNING)
    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(f.name) as writer:
        cut = MonoCut(
            id="cut1",
            start=0,
            duration=4.9,
            channel=0,
            recording=dummy_recording(1),
        )
        # 131 frames * 0.4s frame shift = 52.4s, which clashes with the
        # 4.9s cut duration — validation should warn about the mismatch.
        alignment = np.random.randint(500, size=131)
        cut.alignment = writer.store_array(
            key="utt1",
            value=alignment,
            frame_shift=0.4,
            temporal_dim=0,
        )
        validate(cut)
        assert (
            "MonoCut cut1: possibly mismatched duration between cut (4.9s) "
            "and temporal array in custom field 'alignment' (num_frames=131 "
            "* frame_shift=0.4 == duration=52.400000000000006)" in caplog.text
        )
def validate(self):
    """Run the parent validation, then validate the non-sources set too."""
    super().validate()
    validate(self.nonsources_set)
def validate_(manifest: Pathlike, read_data: bool):
    """Validate a Lhotse manifest file."""
    # Imported lazily so the CLI starts fast when this command is unused.
    from lhotse import load_manifest, validate

    loaded = load_manifest(manifest)
    validate(loaded, read_data=read_data)
def validate_(manifest: Pathlike, read_data: bool):
    """Validate a Lhotse manifest file."""
    loaded = load_manifest(manifest)
    validate(loaded, read_data=read_data)
def _validate(self):
    """Validate the manifest and require precomputed features on every cut."""
    validate(self.cuts)
    assert all(c.has_features for c in self.cuts)
def validate_for_tts(cuts: CutSet) -> None:
    """Check that each cut carries exactly one supervision (TTS precondition)."""
    validate(cuts)
    for cut in cuts:
        assert (
            len(cut.supervisions) == 1
        ), "Only the Cuts with single supervision are supported."
def _validate(self):
    """Validate the manifest and require an audio recording on every cut."""
    validate(self.cuts)
    assert all(c.has_recording for c in self.cuts)
def test_validate_feature_set_runs():
    # Smoke test: validation of a dummy FeatureSet must not raise.
    feature_set = DummyManifest(FeatureSet, begin_id=0, end_id=100)
    validate(feature_set)
def _validate(self, cuts: CutSet) -> None:
    """Validate the given cuts and require precomputed features on each."""
    validate(cuts)
    assert all(c.has_features for c in cuts)
def _validate(self, cuts: CutSet) -> None:
    """Validate the given cuts and require an audio recording on each."""
    validate(cuts)
    assert all(c.has_recording for c in cuts)
def _validate(self) -> None:
    """Run the generic manifest validation over the stored cuts."""
    validate(self.cuts)