def test_make_cuts_from_recordings_features_supervisions(
    self,
    dummy_recording_set,
    dummy_feature_set,
    dummy_supervision_set,
):
    """Check the first cut built from recording, feature and supervision manifests."""
    cuts = CutSet.from_manifests(
        recordings=dummy_recording_set,
        supervisions=dummy_supervision_set,
        features=dummy_feature_set,
    )
    cut = cuts[0]

    # Time span and channel of the cut itself.
    assert cut.start == 0
    assert cut.duration == 10.0
    assert cut.end == 10.0
    assert cut.channel == 0

    # Exactly one supervision is expected to be attached to the cut.
    assert len(cut.supervisions) == 1
    supervision = cut.supervisions[0]
    assert supervision.id == 'sup1'
    assert supervision.recording_id == 'rec1'
    assert supervision.start == 3.0
    assert supervision.end == 7.0
    assert supervision.channel == 0
    assert supervision.text == 'dummy text'

    # Recording-derived attributes.
    assert cut.has_recording
    assert cut.recording == dummy_recording_set.recordings['rec1']
    assert cut.sampling_rate == 16000
    assert cut.recording_id == 'rec1'
    assert cut.num_samples == 160000

    # Feature-derived attributes.
    assert cut.has_features
    assert cut.features == dummy_feature_set.features[0]
    assert cut.frame_shift == 0.01
    assert cut.num_frames == 1000
    assert cut.num_features == 23
    assert cut.features_type == 'fbank'
def random_mixed(
    supervision_manifest: Pathlike,
    feature_manifest: Pathlike,
    output_cut_manifest: Pathlike,
    snr_range: Tuple[float, float],
    offset_range: Tuple[float, float],
):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST that contains supervision regions from SUPERVISION_MANIFEST
    and features supplied by FEATURE_MANIFEST. It first creates a trivial CutSet, splits it into two equal, randomized
    parts and mixes their features.
    The parameters of the mix are controlled via SNR_RANGE and OFFSET_RANGE.
    """
    # Build the trivial (unmixed) cut set from the two input manifests.
    source_cut_set = CutSet.from_manifests(
        supervisions=SupervisionSet.from_json(supervision_manifest),
        features=FeatureSet.from_json(feature_manifest),
    )

    # Shuffle and halve; cuts from the two halves are paired up positionally.
    left_cuts, right_cuts = source_cut_set.split(num_splits=2, shuffle=True)

    # One SNR and one relative offset are drawn per left-hand cut.
    # SNRs are drawn before offsets so the random stream is consumed in a
    # fixed order.
    num_left = len(left_cuts)
    snr_low, snr_high = snr_range
    offset_low, offset_high = offset_range
    snrs = np.random.uniform(snr_low, snr_high, size=num_left).tolist()
    relative_offsets = np.random.uniform(offset_low, offset_high, size=num_left).tolist()

    # The offset is expressed as a fraction of the left cut's duration.
    pairings = zip(left_cuts, right_cuts, snrs, relative_offsets)
    mixed_cut_set = CutSet.from_cuts(
        left.mix(right, offset_other_by=left.duration * fraction, snr=snr)
        for left, right, snr, fraction in pairings
    )
    mixed_cut_set.to_json(output_cut_manifest)
def test_make_cuts_from_recordings_supervisions(
    self,
    dummy_recording_set,
    dummy_supervision_set,
):
    """Check the first cut built from recording and supervision manifests only."""
    cuts = CutSet.from_manifests(
        recordings=dummy_recording_set,
        supervisions=dummy_supervision_set,
    )
    cut = cuts[0]

    # Time span and channel of the cut itself.
    assert cut.start == 0
    assert cut.duration == 10.0
    assert cut.end == 10.0
    assert cut.channel == 0

    # Exactly one supervision is expected to be attached to the cut.
    assert len(cut.supervisions) == 1
    supervision = cut.supervisions[0]
    assert supervision.id == "sup1"
    assert supervision.recording_id == "rec1"
    assert supervision.start == 3.0
    assert supervision.end == 7.0
    assert supervision.channel == 0
    assert supervision.text == "dummy text"

    # Recording-derived attributes.
    assert cut.has_recording
    assert cut.recording == dummy_recording_set.recordings["rec1"]
    assert cut.sampling_rate == 16000
    assert cut.recording_id == "rec1"
    assert cut.num_samples == 160000

    # No feature manifest was supplied, so every feature attribute is empty.
    assert not cut.has_features
    assert cut.features is None
    assert cut.frame_shift is None
    assert cut.num_frames is None
    assert cut.num_features is None
    assert cut.features_type is None
def __init__(
    self,
    cuts: CutSet,
    uem: Optional[SupervisionSet] = None,
    min_speaker_dim: Optional[int] = None,
    global_speaker_ids: bool = False,
) -> None:
    """Store the cuts, optionally rebuilding them from supervisions that overlap UEM segments.

    :param cuts: the input cuts; they are passed to ``validate`` first.
    :param uem: optional supervision set describing the UEM segments. When it is
        falsy, ``cuts`` is stored unchanged.
    :param min_speaker_dim: stored on the instance as-is.
    :param global_speaker_ids: when true, ``self.speakers`` maps each entry of
        ``self.cuts.speakers`` to its enumeration index; otherwise it is ``None``.
    """
    super().__init__()
    validate(cuts)

    if uem:
        # The `overlap` method of the supervision interval trees is used to find
        # the supervision segments that overlap the UEM segments.
        recordings = RecordingSet(
            {cut.recording.id: cut.recording for cut in cuts if cut.has_recording}
        )
        uem_cuts = CutSet.from_manifests(recordings=recordings, supervisions=uem)
        uem_intervals = uem_cuts.index_supervisions()

        kept_supervisions = []
        for cut_id, supervision_tree in cuts.index_supervisions().items():
            if cut_id in uem_intervals:
                # Collected through a set, so equal trimmed segments found via
                # several UEM intervals are added only once.
                kept_supervisions.extend(
                    {
                        hit.data.trim(hit.end, start=hit.begin)
                        for uem_interval in uem_intervals[cut_id]
                        for hit in supervision_tree.overlap(
                            begin=uem_interval.begin, end=uem_interval.end
                        )
                    }
                )
            else:
                # No UEM entry for this cut: keep all of its supervisions.
                kept_supervisions.extend([hit.data for hit in supervision_tree])

        self.cuts = CutSet.from_manifests(
            recordings=recordings,
            supervisions=SupervisionSet.from_segments(kept_supervisions),
        )
    else:
        self.cuts = cuts

    if global_speaker_ids:
        self.speakers = {
            speaker: index for index, speaker in enumerate(self.cuts.speakers)
        }
    else:
        self.speakers = None
    self.min_speaker_dim = min_speaker_dim
def test_known_issue_with_overlap():
    """Trimming to supervisions without overlaps yields one cut per supervision."""
    recording = dummy_recording(0)
    recordings = RecordingSet.from_recordings([recording])

    # Make two segments on the same recording and channel. The first segment
    # starts at 0.0 and is 1 s long. The second segment starts at 0.2, is
    # 0.5 s long, and therefore lies entirely within the first.
    outer_segment = SupervisionSegment(
        id="utt1",
        recording_id=recording.id,
        start=0.0,
        duration=1.0,
        channel=0,
        text="Hello",
    )
    inner_segment = SupervisionSegment(
        id="utt2",
        recording_id=recording.id,
        start=0.2,
        duration=0.5,
        channel=0,
        text="World",
    )
    supervisions = SupervisionSet.from_segments([outer_segment, inner_segment])

    cuts = CutSet.from_manifests(recordings=recordings, supervisions=supervisions)
    assert len(cuts) == 1

    trimmed_cuts = cuts.trim_to_supervisions(keep_overlapping=False)
    assert len(trimmed_cuts) == 2

    # First trimmed cut: spans "Hello" and carries only that supervision.
    hello_cut = trimmed_cuts[0]
    assert hello_cut.start == 0
    assert hello_cut.duration == 1
    assert len(hello_cut.supervisions) == 1
    hello_sup = hello_cut.supervisions[0]
    assert hello_sup.start == 0
    assert hello_sup.duration == 1
    assert hello_sup.text == "Hello"

    # Second trimmed cut: spans "World"; its supervision start is 0 within
    # the cut.
    world_cut = trimmed_cuts[1]
    assert world_cut.start == 0.2
    assert world_cut.duration == 0.5
    assert len(world_cut.supervisions) == 1
    world_sup = world_cut.supervisions[0]
    assert world_sup.start == 0
    assert world_sup.duration == 0.5
    assert world_sup.text == "World"
def test_make_cuts_from_recordings_features_supervisions(
    self,
    dummy_recording_set_lazy,
    dummy_feature_set_lazy,
    dummy_supervision_set_lazy,
):
    """Check the first cut built lazily from recording, feature and supervision manifests."""
    with NamedTemporaryFile(suffix=".jsonl.gz") as tmp_file:
        cuts = CutSet.from_manifests(
            recordings=dummy_recording_set_lazy,
            supervisions=dummy_supervision_set_lazy,
            features=dummy_feature_set_lazy,
            lazy=True,
            output_path=tmp_file.name,
        )
        tmp_file.flush()

        # All checks run while the temporary file still exists.
        cut = cuts[0]

        # Time span and channel of the cut itself.
        assert cut.start == 0
        assert cut.duration == 10.0
        assert cut.end == 10.0
        assert cut.channel == 0

        # Two supervisions are expected on the cut.
        assert len(cut.supervisions) == 2

        first_sup = cut.supervisions[0]
        assert first_sup.id == "sup1"
        assert first_sup.recording_id == "rec1"
        assert first_sup.start == 3.0
        assert first_sup.end == 7.0
        assert first_sup.channel == 0
        assert first_sup.text == "dummy text"

        second_sup = cut.supervisions[1]
        assert second_sup.id == "sup2"
        assert second_sup.recording_id == "rec1"
        assert second_sup.start == 7.0
        assert second_sup.end == 9.0
        assert second_sup.channel == 0
        assert second_sup.text == "dummy text"

        # Recording-derived attributes.
        assert cut.has_recording
        assert cut.sampling_rate == 16000
        assert cut.recording_id == "rec1"
        assert cut.num_samples == 160000

        # Feature-derived attributes.
        assert cut.has_features
        assert cut.frame_shift == 0.01
        assert cut.num_frames == 1000
        assert cut.num_features == 23
        assert cut.features_type == "fbank"
def simple(
    output_cut_manifest: Pathlike,
    recording_manifest: Optional[Pathlike],
    feature_manifest: Optional[Pathlike],
    supervision_manifest: Optional[Pathlike],
):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST. Depending on the provided options, it may contain any combination
    of recording, feature and supervision manifests.
    Either RECORDING_MANIFEST or FEATURE_MANIFEST has to be provided.
    When SUPERVISION_MANIFEST is provided, the cuts time span will correspond to that of the supervision segments.
    Otherwise, that time span corresponds to the one found in features, if available, otherwise recordings.
    """
    # Each manifest is optional; a missing path is forwarded as None.
    # Load order: supervisions, then features, then recordings.
    supervision_set = (
        None if supervision_manifest is None else load_manifest(supervision_manifest)
    )
    feature_set = (
        None if feature_manifest is None else load_manifest(feature_manifest)
    )
    recording_set = (
        None if recording_manifest is None else load_manifest(recording_manifest)
    )

    CutSet.from_manifests(
        recordings=recording_set,
        supervisions=supervision_set,
        features=feature_set,
    ).to_file(output_cut_manifest)
def test_make_cuts_from_features(dummy_feature_set):
    """Check the first cut built from a feature manifest alone."""
    cuts = CutSet.from_manifests(features=dummy_feature_set)
    cut = cuts[0]

    # Time span and channel of the cut itself.
    assert cut.start == 0
    assert cut.duration == 10.0
    assert cut.end == 10.0
    assert cut.channel == 0

    # No supervisions or recordings were supplied.
    assert len(cut.supervisions) == 0
    assert not cut.has_recording
    assert cut.recording is None

    # These are still populated without a recording; the sample count is not.
    assert cut.sampling_rate == 16000
    assert cut.recording_id == 'rec1'
    assert cut.num_samples is None

    # Feature-derived attributes.
    assert cut.has_features
    assert cut.features == dummy_feature_set.features[0]
    assert cut.frame_shift == 0.01
    assert cut.num_frames == 1000
    assert cut.num_features == 23
    assert cut.features_type == 'fbank'
def test_make_cuts_from_features_with_random_ids(dummy_feature_set):
    """With ``random_ids=True`` no cut ID follows the recording-index-channel pattern."""
    cuts = CutSet.from_manifests(features=dummy_feature_set, random_ids=True)
    for position, cut in enumerate(cuts):
        actual_id = cut.id
        patterned_id = f'{cut.recording_id}-{position}-{cut.channel}'
        assert actual_id != patterned_id
def test_make_cuts_from_recordings_with_deterministic_ids(dummy_recording_set):
    """With ``random_ids=False`` every cut ID is ``<recording_id>-<index>-<channel>``."""
    cuts = CutSet.from_manifests(recordings=dummy_recording_set, random_ids=False)
    for position, cut in enumerate(cuts):
        actual_id = cut.id
        expected_id = f'{cut.recording_id}-{position}-{cut.channel}'
        assert actual_id == expected_id