Exemple #1
0
def prepare_musan(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ("music", "speech", "noise"),
    use_vocals: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build (and optionally store) the manifests for the requested MUSAN parts.

    :param corpus_dir: root directory of the corpus; it must already exist.
    :param output_dir: when given, the directory is created if needed and every
        manifest is written to ``musan_{key}_{part}.jsonl.gz`` inside it.
    :param parts: the parts to prepare ("music", "speech", "noise");
        a single string is treated as a one-element list.
    :param use_vocals: forwarded to ``prepare_music``.
    :return: a dict mapping each prepared part to its dict of manifests.
    :raises ValueError: when ``parts`` is empty.
    """
    corpus_root = Path(corpus_dir)
    assert corpus_root.is_dir(), f"No such directory: {corpus_root}"
    if not parts:
        raise ValueError("No MUSAN parts specified for manifest preparation.")
    selected = [parts] if isinstance(parts, str) else parts

    manifests = {}
    if "music" in selected:
        manifests["music"] = prepare_music(corpus_root, use_vocals=use_vocals)
        validate_recordings_and_supervisions(**manifests["music"])
    # "speech" and "noise" only provide recordings and are handled identically.
    for name in ("speech", "noise"):
        if name in selected:
            manifests[name] = {"recordings": scan_recordings(corpus_root / name)}
            validate(manifests[name]["recordings"])

    if output_dir is None:
        return manifests

    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)
    for part, part_manifests in manifests.items():
        for key, manifest in part_manifests.items():
            manifest.to_file(out_root / f"musan_{key}_{part}.jsonl.gz")
    return manifests
Exemple #2
0
 def _validate(self) -> None:
     """
     Validate the cuts and require every supervision to lie inside its cut.

     Each supervision's ``start`` and ``end`` are compared against the cut's
     ``start`` and ``end``, with a slack of 1e-5 on both sides.
     """
     validate(self.cuts)
     slack = 1e-5
     for cut in self.cuts:
         for sup in cut.supervisions:
             assert (cut.start - slack) <= sup.start <= sup.end <= (cut.end + slack), (
                 f"Cutting in the middle of a supervision is currently not supported for the ASR task. "
                 f"Cut ID violating the pre-condition: '{cut.id}'"
             )
Exemple #3
0
def prepare_musan(
        corpus_dir: Pathlike,
        output_dir: Optional[Pathlike] = None,
        parts: Sequence[str] = ('music', 'speech', 'noise'),
        use_vocals: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build (and optionally store) the manifests for the requested MUSAN parts.

    :param corpus_dir: root directory of the corpus; it must already exist.
    :param output_dir: when given, the directory is created if needed and every
        manifest is written to ``{key}_{part}.json`` inside it.
    :param parts: the parts to prepare ('music', 'speech', 'noise');
        a single string is treated as a one-element list.
    :param use_vocals: forwarded to ``prepare_music``.
    :return: a dict mapping each prepared part to its dict of manifests.
    :raises ValueError: when ``parts`` is empty.
    """
    musan_root = Path(corpus_dir)
    assert musan_root.is_dir(), f'No such directory: {musan_root}'
    if not parts:
        raise ValueError("No MUSAN parts specified for manifest preparation.")
    wanted = [parts] if isinstance(parts, str) else parts

    manifests = {}
    if 'music' in wanted:
        manifests['music'] = prepare_music(musan_root, use_vocals=use_vocals)
        validate_recordings_and_supervisions(**manifests['music'])
    # 'speech' and 'noise' only provide recordings and are handled identically.
    for name in ('speech', 'noise'):
        if name in wanted:
            manifests[name] = {'recordings': scan_recordings(musan_root / name)}
            validate(manifests[name]['recordings'])

    if output_dir is None:
        return manifests

    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)
    for part, part_manifests in manifests.items():
        for key, manifest in part_manifests.items():
            manifest.to_json(destination / f'{key}_{part}.json')
    return manifests
Exemple #4
0
    def __init__(
        self,
        cuts: CutSet,
        cut_transforms: List[Callable[[CutSet], CutSet]] = None,
        feature_input_strategy: InputStrategy = PrecomputedFeatures(),
        feature_transforms: Union[Sequence[Callable], Callable] = None,
        add_eos: bool = True,
        add_bos: bool = True,
    ) -> None:
        """
        Validate the cuts and store the collater, input strategy and transforms.

        :param cuts: the cuts to serve; each must hold exactly one supervision.
        :param cut_transforms: optional cut transforms (defaulted via ``ifnone(..., [])``).
        :param feature_input_strategy: stored as-is on the instance.
        :param feature_transforms: ``None``, a single callable, or a sequence of
            callables; a single callable is wrapped in a list.
        :param add_eos: forwarded to ``TokenCollater``.
        :param add_bos: forwarded to ``TokenCollater``.
        """
        super().__init__()

        validate(cuts)
        assert all(
            len(cut.supervisions) == 1 for cut in cuts
        ), "Only the Cuts with single supervision are supported."

        self.cuts = cuts
        self.token_collater = TokenCollater(cuts, add_eos=add_eos, add_bos=add_bos)
        self.cut_transforms = ifnone(cut_transforms, [])
        self.feature_input_strategy = feature_input_strategy

        # Normalize the transforms argument to a sequence.
        if feature_transforms is None:
            transforms = []
        elif isinstance(feature_transforms, Sequence):
            transforms = feature_transforms
        else:
            transforms = [feature_transforms]

        for transform in transforms:
            assert isinstance(transform, Callable), "Feature transforms must be Callable"
        self.feature_transforms = transforms
Exemple #5
0
def test_cut_set_batch_feature_extraction_resume(cut_set, overwrite):
    """
    Check that writing to the same files again keeps previously written results.

    There is no easy way to interrupt the execution inside a test, so the test
    writes a second CutSet to the same files instead; the effect is the same.
    """
    extractor = Fbank()
    cut_set = cut_set.resample(16000)
    subsets = cut_set.split(num_splits=2)
    with NamedTemporaryFile() as feat_f:
        with NamedTemporaryFile(suffix=".jsonl.gz") as manifest_f:
            processed = [
                cuts.compute_and_store_features_batch(
                    extractor=extractor,
                    storage_path=feat_f.name,
                    manifest_path=manifest_f.name,
                    num_workers=0,
                    overwrite=overwrite,
                )
                for cuts in subsets
            ]
            feat_f.flush()
            manifest_f.flush()
            merged = load_manifest(manifest_f.name)
            merged_ids = list(merged.ids)
            # With overwrite only the last subset is expected in the manifest;
            # otherwise the whole cut set is.
            expected_source = subsets[-1] if overwrite else cut_set
            assert merged_ids == list(expected_source.ids)
            validate(merged, read_data=True)
Exemple #6
0
 def validate(self):
     """Validate both cut sets and check that each mixture resolves to more than one source cut."""
     validate(self.sources_set)
     validate(self.mixtures_set)
     # Walk the whole dataset to make sure the sources of every mixture can be resolved.
     mixed_cuts = self.mixtures_set.mixed_cuts
     for mixture in mixed_cuts.values():
         _mixture, source_cuts = self._obtain_mixture(mixture.id)
         assert len(source_cuts) > 1
Exemple #7
0
 def __init__(self, cuts: CutSet):
     """Validate ``cuts``, then keep them together with a list of their IDs."""
     super().__init__()
     validate(cuts)
     self.cuts = cuts
     self.cut_ids = list(cuts.ids)
Exemple #8
0
def test_cut_set_batch_feature_extraction(cut_set, extractor_type):
    """Run batch feature extraction into a temporary file and validate the result, reading the data."""
    extractor = extractor_type()
    resampled = cut_set.resample(16000)
    with NamedTemporaryFile() as storage_file:
        with_feats = resampled.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=storage_file.name,
            num_workers=0,
        )
        validate(with_feats, read_data=True)
Exemple #9
0
 def _validate(self) -> None:
     """
     Validate the cuts and check that every supervision fits inside its cut.

     Supervision times are checked against the range ``[0, cut.duration]``
     with a tolerance of 1 ms on each side.

     :raises AssertionError: when a supervision starts before, or ends after, its cut.
     """
     validate(self.cuts)
     tol = 1e-3  # 1ms
     for cut in self.cuts:
         for supervision in cut.supervisions:
             assert supervision.start >= -tol, (
                 f"Supervisions starting before the cut are not supported for ASR"
                 f" (sup id: {supervision.id}, cut id: {cut.id})"
             )
             # Check the supervision's end rather than its duration: a supervision
             # shorter than the cut can still run past the cut's end if it starts late.
             assert supervision.end <= cut.duration + tol, (
                 f"Supervisions ending after the cut "
                 f"are not supported for ASR"
                 f" (sup id: {supervision.id}, cut id: {cut.id})"
             )
def test_extract_and_store_features_from_mixed_cut(cut, mix_eagerly):
    """Extract and store features for a cut appended to itself, then check the loaded array's shape."""
    mixed_cut = cut.append(cut)
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir:
        with LilcomFilesWriter(tmpdir) as storage:
            cut_with_feats = mixed_cut.compute_and_store_features(
                extractor=extractor,
                storage=storage,
                mix_eagerly=mix_eagerly,
            )
            validate(cut_with_feats)
            arr = cut_with_feats.load_features()
    assert arr.shape[0] == 200
    assert arr.shape[1] == extractor.feature_dim(mixed_cut.sampling_rate)
Exemple #11
0
def validate_for_asr(cuts: CutSet) -> None:
    """
    Validate the cuts and check that every supervision fits inside its cut.

    Supervision times are checked against the range ``[0, cut.duration]``
    with a tolerance of 2 ms on each side.

    :param cuts: the cuts to check.
    :raises AssertionError: when a supervision starts before, or ends after, its cut.
    """
    validate(cuts)
    tol = 2e-3  # 2ms
    for cut in cuts:
        for supervision in cut.supervisions:
            assert supervision.start >= -tol, (
                f"Supervisions starting before the cut are not supported for ASR"
                f" (sup id: {supervision.id}, cut id: {cut.id})")
            # Check the supervision's end rather than its duration: a supervision
            # shorter than the cut can still run past the cut's end if it starts late.
            assert supervision.end <= cut.duration + tol, (
                f"Supervisions ending after the cut "
                f"are not supported for ASR"
                f" (sup id: {supervision.id}, cut id: {cut.id})")
Exemple #12
0
 def __init__(
     self,
     cuts: CutSet,
     input_strategy: InputStrategy = PrecomputedFeatures(),
     cut_transforms: Sequence[Callable[[CutSet], CutSet]] = None,
     input_transforms: Sequence[Callable[[torch.Tensor], torch.Tensor]] = None,
 ) -> None:
     """
     Validate the cuts and store them with the input strategy and transforms.

     :param cuts: the cuts to serve.
     :param input_strategy: stored as-is on the instance.
     :param cut_transforms: optional cut transforms (defaulted via ``ifnone(..., [])``).
     :param input_transforms: optional input transforms (defaulted via ``ifnone(..., [])``).
     """
     super().__init__()
     validate(cuts)
     self.cuts, self.input_strategy = cuts, input_strategy
     self.cut_transforms = ifnone(cut_transforms, [])
     self.input_transforms = ifnone(input_transforms, [])
Exemple #13
0
 def __getitem__(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
     """
     Turn a batch of cuts into a dict of inputs, input lengths, masks and the cuts.

     The cuts are validated, sorted with ``sort_by_duration()`` and passed through
     the cut transforms; the input strategy then produces the inputs, which go
     through the input transforms.
     """
     validate(cuts)
     cuts = cuts.sort_by_duration()
     for cut_transform in self.cut_transforms:
         cuts = cut_transform(cuts)
     inputs, input_lens = self.input_strategy(cuts)
     for input_transform in self.input_transforms:
         inputs = input_transform(inputs)
     batch = {"inputs": inputs, "input_lens": input_lens}
     batch["is_voice"] = self.input_strategy.supervision_masks(cuts)
     batch["cut"] = cuts
     return batch
Exemple #14
0
 def __init__(
     self,
     cuts: CutSet,
     min_speaker_dim: Optional[int] = None,
     global_speaker_ids: bool = False,
 ) -> None:
     """
     Validate and store the cuts, optionally building a speaker-to-index mapping.

     :param cuts: the cuts to serve.
     :param min_speaker_dim: stored as-is on the instance.
     :param global_speaker_ids: when true, ``self.speakers`` maps each entry of
         ``cuts.speakers`` to its enumeration index; otherwise it is ``None``.
     """
     super().__init__()
     validate(cuts)
     self.cuts = cuts
     if global_speaker_ids:
         self.speakers = {
             speaker: index for index, speaker in enumerate(self.cuts.speakers)
         }
     else:
         self.speakers = None
     self.min_speaker_dim = min_speaker_dim
Exemple #15
0
def test_cut_set_batch_feature_extraction_manifest_path(
        cut_set, suffix, exception_expectation):
    """Run batch feature extraction with a manifest path using ``suffix`` under ``exception_expectation``."""
    extractor = Fbank()
    cut_set = cut_set.resample(16000)
    with NamedTemporaryFile() as feat_f:
        with NamedTemporaryFile(suffix=suffix) as manifest_f:
            with exception_expectation:
                with_feats = cut_set.compute_and_store_features_batch(
                    extractor=extractor,
                    storage_path=feat_f.name,
                    manifest_path=manifest_f.name,
                    num_workers=0,
                )
                validate(with_feats, read_data=True)
Exemple #16
0
    def __init__(self, cuts: CutSet, root_dir: Optional[Pathlike] = None):
        """
        Validate the cuts and derive character-level tokens from their text.

        :param cuts: the cuts to serve; each must hold exactly one supervision.
        :param root_dir: optional directory, stored as a ``Path`` (``None`` when falsy).
        :raises AssertionError: when a cut does not have exactly one supervision.
        """
        super().__init__()
        validate(cuts)
        self.cuts = cuts
        self.root_dir = Path(root_dir) if root_dir else None
        self.cut_ids = list(self.cuts.ids)

        # generate tokens from text
        self.id_to_token = {}
        self.token_set = set()
        for cut in cuts:
            assert len(
                cut.supervisions
            ) == 1, 'Only the Cuts with single supervision are supported.'
            characters = list(cut.supervisions[0].text)
            self.token_set.update(characters)
            self.id_to_token[cut.id] = characters
        # Sort the characters gathered above; the set lives in ``self.token_set``
        # (``self.tokens`` is never assigned in this initializer).
        self.token_set = sorted(self.token_set)
Exemple #17
0
 def __init__(
     self,
     cuts: CutSet,
     uem: Optional[SupervisionSet] = None,
     min_speaker_dim: Optional[int] = None,
     global_speaker_ids: bool = False,
 ) -> None:
     """
     Validate the cuts and, when ``uem`` is given, rebuild them from trimmed supervisions.

     :param cuts: the cuts to serve.
     :param uem: optional supervision set; when truthy, supervisions of cuts that
         have UEM intervals are replaced by the result of ``trim`` on each
         overlapping interval, and the cuts are rebuilt from the recordings.
     :param min_speaker_dim: stored as-is on the instance.
     :param global_speaker_ids: when true, ``self.speakers`` maps each entry of
         ``self.cuts.speakers`` to its enumeration index; otherwise it is ``None``.
     """
     super().__init__()
     validate(cuts)
     if uem:
         # We use the `overlap` method in intervaltree to get overlapping regions
         # between the supervision segments and the UEM segments
         recordings = RecordingSet(
             {cut.recording.id: cut.recording for cut in cuts if cut.has_recording}
         )
         uem_cuts = CutSet.from_manifests(recordings=recordings, supervisions=uem)
         uem_intervals = uem_cuts.index_supervisions()
         supervisions = []
         for cut_id, tree in cuts.index_supervisions().items():
             if cut_id in uem_intervals:
                 # Collected in a set, so equal trimmed segments are kept only once.
                 trimmed = {
                     it.data.trim(it.end, start=it.begin)
                     for uem_it in uem_intervals[cut_id]
                     for it in tree.overlap(begin=uem_it.begin, end=uem_it.end)
                 }
                 supervisions.extend(trimmed)
             else:
                 # No UEM intervals for this cut: keep its supervisions untouched.
                 supervisions.extend([it.data for it in tree])
         self.cuts = CutSet.from_manifests(
             recordings=recordings,
             supervisions=SupervisionSet.from_segments(supervisions),
         )
     else:
         self.cuts = cuts

     if global_speaker_ids:
         self.speakers = {
             speaker: index for index, speaker in enumerate(self.cuts.speakers)
         }
     else:
         self.speakers = None
     self.min_speaker_dim = min_speaker_dim
Exemple #18
0
def test_validate_cut_with_temporal_array(caplog):
    """Check that validating a cut with a mismatched temporal array logs the expected warning text."""
    # Note: "caplog" is a special variable in pytest that captures logs.
    caplog.set_level(logging.WARNING)
    with NamedTemporaryFile(suffix=".h5") as f:
        with NumpyHdf5Writer(f.name) as writer:
            cut = MonoCut(
                id="cut1",
                start=0,
                duration=4.9,
                channel=0,
                recording=dummy_recording(1),
            )
            alignment = np.random.randint(500, size=131)
            cut.alignment = writer.store_array(
                key="utt1",
                value=alignment,
                frame_shift=0.4,
                temporal_dim=0,
            )
            validate(cut)

    expected_message = (
        "MonoCut cut1: possibly mismatched duration between cut (4.9s) "
        "and temporal array in custom field 'alignment' (num_frames=131 "
        "* frame_shift=0.4 == duration=52.400000000000006)"
    )
    assert expected_message in caplog.text
Exemple #19
0
 def validate(self):
     """Run the parent class's ``validate()``, then validate ``self.nonsources_set``."""
     super().validate()
     validate(self.nonsources_set)
Exemple #20
0
def validate_(manifest: Pathlike, read_data: bool):
    """Validate a Lhotse manifest file."""
    # Imported inside the function, so lhotse is only loaded when this runs.
    from lhotse import load_manifest, validate

    loaded = load_manifest(manifest)
    validate(loaded, read_data=read_data)
Exemple #21
0
def validate_(manifest: Pathlike, read_data: bool):
    """Validate a Lhotse manifest file."""
    # Load the manifest from the given path and validate it in one step.
    validate(load_manifest(manifest), read_data=read_data)
Exemple #22
0
 def _validate(self):
     """Validate ``self.cuts`` and require every cut to have features."""
     validate(self.cuts)
     for cut in self.cuts:
         assert cut.has_features
def validate_for_tts(cuts: CutSet) -> None:
    """Validate the cuts and require each of them to hold exactly one supervision."""
    validate(cuts)
    assert all(
        len(cut.supervisions) == 1 for cut in cuts
    ), "Only the Cuts with single supervision are supported."
Exemple #24
0
 def _validate(self):
     """Validate ``self.cuts`` and require every cut to have a recording."""
     validate(self.cuts)
     for cut in self.cuts:
         assert cut.has_recording
Exemple #25
0
def test_validate_feature_set_runs():
    """Smoke test: ``validate`` runs on a dummy ``FeatureSet`` manifest built for ids 0 to 100."""
    validate(DummyManifest(FeatureSet, begin_id=0, end_id=100))
Exemple #26
0
 def _validate(self, cuts: CutSet) -> None:
     """Validate ``cuts`` and require every cut to have features."""
     validate(cuts)
     for cut in cuts:
         assert cut.has_features
Exemple #27
0
 def _validate(self, cuts: CutSet) -> None:
     """Validate ``cuts`` and require every cut to have a recording."""
     validate(cuts)
     for cut in cuts:
         assert cut.has_recording
Exemple #28
0
 def _validate(self) -> None:
     """Run ``validate`` on ``self.cuts``."""
     validate(self.cuts)