Example #1
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={
                    "text_punct": meta["text_normalized"],
                    "split": clean_or_other
                },
            ))
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)
    return recordings, supervisions
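Note: the raw manifest rows consumed by the loop above only need the three fields it reads ("audio_filepath", resolved relative to corpus_dir, plus "text" and "text_normalized"). A minimal sketch of producing one such JSONL row (the path and texts are hypothetical):

import json
from pathlib import Path

# One raw-manifest record with exactly the fields prepare_single_partition reads;
# all values below are made up for illustration.
record = {
    "audio_filepath": "speaker_92/chapter_1/utt_0001.wav",
    "text": "hello world",
    "text_normalized": "Hello, world!",
}
Path("raw_manifest.jsonl").write_text(json.dumps(record) + "\n")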
Example #2
def with_cut(
        self,
        sampling_rate: int,
        num_samples: int,
        features: bool = True,
        supervision: bool = False
) -> Cut:
    duration = num_samples / sampling_rate
    cut = Cut(
        id=str(uuid4()),
        start=0,
        duration=duration,
        channel=0,
        recording=self.with_recording(sampling_rate=sampling_rate, num_samples=num_samples)
    )
    if features:
        cut = self._with_features(cut)
    if supervision:
        cut.supervisions.append(SupervisionSegment(
            id=f'sup-{cut.id}',
            recording_id=cut.recording_id,
            start=0,
            duration=cut.duration,
            text='irrelevant'
        ))
    return cut
Example #3
def test_extend_by_cut_with_supervision(
    cut_start,
    cut_duration,
    extend_duration,
    extend_direction,
    supervision_start,
    supervision_duration,
    expected_start,
    expected_end,
):
    recording = dummy_recording(int(uuid4()), duration=1.0)
    supervisions = SupervisionSet.from_segments([
        SupervisionSegment(
            id=int(uuid4()),
            recording_id=recording.id,
            start=supervision_start,
            duration=supervision_duration,
        )
    ])
    cut = dummy_cut(int(uuid4()),
                    start=cut_start,
                    duration=cut_duration,
                    supervisions=supervisions)
    extended_cut = cut.extend_by(duration=extend_duration,
                                 direction=extend_direction)
    assert isclose(extended_cut.supervisions[0].start, expected_start)
    assert isclose(extended_cut.supervisions[0].end, expected_end)
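The expected_start/expected_end values in the parametrization follow from simple bookkeeping: extending a cut to the left grows it at the beginning, so supervision offsets (which are relative to the cut start) shift right by the amount actually added, while extending to the right leaves them untouched. A sketch of that rule, assuming the left extension is clamped so the cut cannot start before the recording does:

from math import isclose

def shifted_supervision_start(sup_start, cut_start, extend_duration, direction):
    # Hypothetical helper, not part of Lhotse: how far does a supervision's
    # cut-relative start move when the cut is extended?
    if direction in ("left", "both"):
        return sup_start + min(extend_duration, cut_start)
    return sup_start  # "right" only grows the end of the cut

assert isclose(shifted_supervision_start(0.2, cut_start=0.5, extend_duration=0.3, direction="left"), 0.5)
assert isclose(shifted_supervision_start(0.2, cut_start=0.5, extend_duration=0.3, direction="right"), 0.2)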
Example #4
def _filename_to_supervisions(filename: Path, language: str):
    lines = filename.read_text()
    recoid = filename.stem.split(".")[0]
    supervisions = []
    filterfun = partial(_filter_word)
    for start, end, line in _parse_vtt(lines, "<noise>"):
        line_list = []
        for w in line.split():
            w_ = w.strip()
            if re.match(r"^(\([^)]*\) *)+$", w_):
                line_list.append(w_)
            elif filterfun(w):
                line_list.append(w_)
            else:
                line_list.append("<unk>")
        line_ = " ".join(line_list)
        if re.match(r"^\w+ *(<[^>]*> *)+$", line_, re.UNICODE):
            line_new = line_.strip()
        elif "<" in line_ or ">" in line_:
            continue
        else:
            line_new = line_.strip()

        supervisions.append(
            SupervisionSegment(
                id=_format_uttid(recoid, start),
                recording_id=recoid,
                start=start,
                duration=round(end - start, ndigits=8),
                channel=0,
                text=line_new,
                language=language,
                speaker=recoid,
            ))
    return supervisions
Example #5
def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import MonoCut, Features, Recording, SupervisionSegment
    from lhotse.array import deserialize_array
    from lhotse.cut import MixedCut

    if "shape" in data or "array" in data:
        return deserialize_array(data)
    if "sources" in data:
        return Recording.from_dict(data)
    if "num_features" in data:
        return Features.from_dict(data)
    if "type" not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop("type")
    if cut_type == "MonoCut":
        return MonoCut.from_dict(data)
    if cut_type == "Cut":
        warnings.warn(
            "Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. "
            "Please re-generate it with Lhotse v0.8 as it might stop working in a future version "
            "(using manifest.from_file() and then manifest.to_file() should be sufficient)."
        )
        return MonoCut.from_dict(data)
    if cut_type == "MixedCut":
        return MixedCut.from_dict(data)
    raise ValueError(
        f"Unexpected cut type during deserialization: '{cut_type}'")
Example #6
def _process_file(
    file_path: Pathlike,
    speaker_metadata: Dict[str, SpeakerMetadata],
) -> Tuple[Recording, SupervisionSegment]:
    """
    Process a single wav file and return a Recording and a SupervisionSegment.
    """
    speaker_id = file_path.parent.parent.stem
    session_id = file_path.parent.stem
    uttid = file_path.stem
    recording_id = f"{speaker_id}-{session_id}-{uttid}"
    recording = Recording.from_file(file_path, recording_id=recording_id)
    supervision = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        speaker=speaker_id,
        gender=speaker_metadata[speaker_id].gender,
        start=0.0,
        duration=recording.duration,
        custom={
            "speaker_name": speaker_metadata[speaker_id].name,
            "nationality": speaker_metadata[speaker_id].nationality,
            "split": speaker_metadata[speaker_id].split,
        },
    )
    return recording, supervision
Example #7
def supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(
            id="segment-1",
            recording_id="recording-1",
            channel=0,
            start=0.1,
            duration=0.3,
            text="transcript of the first segment",
            language="english",
            speaker="Norman Dyhrentfurth",
            gender="male",
            alignment={
                "word": [
                    AlignmentItem(symbol="transcript",
                                  start=0.1,
                                  duration=0.08),
                    AlignmentItem(symbol="of", start=0.18, duration=0.02),
                    AlignmentItem(symbol="the", start=0.2, duration=0.03),
                    AlignmentItem(symbol="first", start=0.23, duration=0.07),
                    AlignmentItem(symbol="segment", start=0.3, duration=0.1),
                ]
            },
        )
    ])
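The word-level items above tile the segment contiguously: each AlignmentItem ends where the next begins (0.1 + 0.08 = 0.18, and so on), and the last one ends at the supervision's end, 0.1 + 0.3 = 0.4. A quick consistency check, assuming SupervisionSet iterates over its segments and AlignmentItem carries start/duration as shown:

from math import isclose

seg = next(iter(supervision_set()))
words = seg.alignment["word"]
for prev, nxt in zip(words, words[1:]):
    assert isclose(prev.start + prev.duration, nxt.start)
assert isclose(words[-1].start + words[-1].duration, seg.start + seg.duration)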
Example #8
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav)
        for wav in (corpus_dir / "wav48").rglob("*.wav"))
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #9
def prepare_tedlium(
    tedlium_root: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph"))
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM...")
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        ))
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_file(output_dir /
                               f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir /
                                 f"tedlium_supervisions_{split}.jsonl.gz")

    return corpus
Example #10
def cut_set():
    # The contents of 'test/fixtures/libri/cuts.json'
    cs = CutSet.from_dicts([{
        "channel": 0,
        "duration": 10.0,
        "features": {
            "channels": 0,
            "duration": 16.04,
            "num_features": 23,
            "num_frames": 1604,
            "recording_id": "recording-1",
            "sampling_rate": 16000,
            "start": 0.0,
            "storage_path": "test/fixtures/libri/storage",
            "storage_key": "30c2440c-93cb-4e83-b382-f2a59b3859b4.llc",
            "storage_type": "lilcom_files",
            "type": "fbank"
        },
        "id": "e3e70682-c209-4cac-629f-6fbed82c07cd",
        "recording": {
            "duration":
            16.04,
            "id":
            "recording-1",
            "num_samples":
            256640,
            "sampling_rate":
            16000,
            "sources": [{
                "channels": [0],
                "source": "test/fixtures/libri/libri-1088-134315-0000.wav",
                "type": "file"
            }]
        },
        "start": 0.0,
        "supervisions": [],
        "type": "Cut"
    }])
    # These supervisions are artificially overwritten in a 10 seconds long LibriSpeech cut
    # to test the speaker activity matrix in the DiarizationDataset.
    cs[0].supervisions = [
        SupervisionSegment('s1', 'recording-1', 0, 3, speaker='spk1'),
        SupervisionSegment('s2', 'recording-1', 2, 4, speaker='spk2'),
        SupervisionSegment('s3', 'recording-1', 5, 2, speaker='spk3'),
        SupervisionSegment('s4', 'recording-1', 7.5, 2.5, speaker='spk4'),
    ]
    return cs
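As the trailing comment says, the four overlapping speakers are there to exercise a speaker activity matrix. A rough sketch of how such a matrix can be derived from these supervisions with NumPy (the helper and the 10 ms frame resolution are assumptions for illustration, not the DiarizationDataset implementation):

import numpy as np

def speaker_activity(supervisions, duration, frame_shift=0.01):
    # One row per speaker, one column per frame; 1 marks speech activity.
    speakers = sorted({s.speaker for s in supervisions})
    activity = np.zeros((len(speakers), int(duration / frame_shift)), dtype=np.int8)
    for s in supervisions:
        lo, hi = int(s.start / frame_shift), int(s.end / frame_shift)
        activity[speakers.index(s.speaker), lo:hi] = 1
    return activity

matrix = speaker_activity(cut_set()[0].supervisions, duration=10.0)
assert matrix[:, 250].sum() == 2  # spk1 and spk2 overlap around 2.5 s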
Example #11
def prepare_tedlium(
        tedlium_root: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), f'Mismatch: found {len(recordings)} ' \
                                             f'sphere files and {len(stms)} STM files. ' \
                                             f'You might be missing some parts of TEDLIUM...'
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')

    return corpus
Example #12
def cut_set():
    cut = Cut(id='cut-1',
              start=0.0,
              duration=10.0,
              channel=0,
              features=Features(
                  type='fbank',
                  num_frames=100,
                  num_features=40,
                  frame_shift=0.01,
                  sampling_rate=16000,
                  start=0.0,
                  duration=10.0,
                  storage_type='lilcom',
                  storage_path='irrelevant',
                  storage_key='irrelevant',
              ),
              recording=Recording(id='rec-1',
                                  sampling_rate=16000,
                                  num_samples=160000,
                                  duration=10.0,
                                  sources=[
                                      AudioSource(type='file',
                                                  channels=[0],
                                                  source='irrelevant')
                                  ]),
              supervisions=[
                  SupervisionSegment(id='sup-1',
                                     recording_id='irrelevant',
                                     start=0.5,
                                     duration=6.0),
                  SupervisionSegment(id='sup-2',
                                     recording_id='irrelevant',
                                     start=7.0,
                                     duration=2.0)
              ])
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id='cut-nosup', supervisions=[]),
        fastcopy(cut, id='cut-norec', recording=None),
        fastcopy(cut, id='cut-nofeat', features=None),
        cut.pad(duration=30.0, direction='left'),
        cut.pad(duration=30.0, direction='right'),
        cut.pad(duration=30.0, direction='both'),
        cut.mix(cut, offset_other_by=5.0, snr=8)
    ])
Example #13
def test_trim_to_supervisions_mixed_cuts():
    cut_set = CutSet.from_cuts([
        Cut('cut1',
            start=0,
            duration=30,
            channel=0,
            supervisions=[
                SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
                SupervisionSegment('sup2', 'rec1', start=10, duration=5),
                SupervisionSegment('sup3', 'rec1', start=20, duration=8),
            ]).append(
                Cut('cut2',
                    start=0,
                    duration=30,
                    channel=0,
                    supervisions=[
                        SupervisionSegment('sup4',
                                           'rec1',
                                           start=0,
                                           duration=30),
                    ]))
    ])
    assert isinstance(cut_set[0], MixedCut)
    cuts = cut_set.trim_to_supervisions()
    assert len(cuts) == 4
    # After "trimming", the MixedCut "decayed" into simple, unmixed cuts, as they did not overlap
    assert all(isinstance(cut, Cut) for cut in cuts)
    assert all(len(cut.supervisions) == 1 for cut in cuts)
    assert all(cut.supervisions[0].start == 0 for cut in cuts)
    cut = cuts[0]
    # Check that the cuts preserved their start/duration/supervisions after trimming
    assert cut.start == 1.5
    assert cut.duration == 8.5
    assert cut.supervisions[0].id == 'sup1'
    cut = cuts[1]
    assert cut.start == 10
    assert cut.duration == 5
    assert cut.supervisions[0].id == 'sup2'
    cut = cuts[2]
    assert cut.start == 20
    assert cut.duration == 8
    assert cut.supervisions[0].id == 'sup3'
    cut = cuts[3]
    assert cut.start == 0
    assert cut.duration == 30
    assert cut.supervisions[0].id == 'sup4'
Example #14
def cut_set():
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type="fbank",
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type="lilcom",
            storage_path="irrelevant",
            storage_key="irrelevant",
        ),
        recording=Recording(
            id="rec-1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
        ),
        supervisions=[
            SupervisionSegment(
                id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0
            ),
            SupervisionSegment(
                id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0
            ),
        ],
    )
    return CutSet.from_cuts(
        [
            cut,
            fastcopy(cut, id="cut-nosup", supervisions=[]),
            fastcopy(cut, id="cut-norec", recording=None),
            fastcopy(cut, id="cut-nofeat", features=None),
            cut.pad(duration=30.0, direction="left"),
            cut.pad(duration=30.0, direction="right"),
            cut.pad(duration=30.0, direction="both"),
            cut.mix(cut, offset_other_by=5.0, snr=8),
        ]
    )
Example #15
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    corpus_dir = Path(corpus_dir)
    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(Recording.from_sphere(p) for p in audio_dir.glob('*.sph'))
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            # Use 'train' in the output filenames for the 'training' split,
            # but keep the original split name as the manifests dict key.
            out_split = 'train' if split == 'training' else split
            manifests[split]['recordings'].to_json(output_dir / f'recordings_{language}_{out_split}.json')
            manifests[split]['supervisions'].to_json(output_dir / f'supervisions_{language}_{out_split}.json')

    return manifests
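The transcript parsing above relies on the file alternating timestamp and text lines: zip(lines[::2], lines[1::2]) pairs each timestamp with its text, and sliding_window(2, ...) then pairs each such group with its successor so the next timestamp can serve as the segment end (the appended None keeps the final segment from being dropped). A standalone sketch of the same idiom with toolz (the transcript content is made up):

from toolz import sliding_window

lines = ['[0.0]', 'hello', '[1.2]', 'world', '[2.5]', None]
for (ts, text), (next_ts, _) in sliding_window(2, list(zip(lines[::2], lines[1::2]))):
    print(float(ts[1:-1]), float(next_ts[1:-1]), text)
# prints: 0.0 1.2 hello, then 1.2 2.5 world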
Example #16
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #17
def test_trim_to_unsupervised_segments():
    cut_set = CutSet.from_cuts([
        # Yields 3 unsupervised cuts - before first supervision,
        # between sup2 and sup3, and after sup3.
        MonoCut(
            "cut1",
            start=0,
            duration=30,
            channel=0,
            supervisions=[
                SupervisionSegment("sup1", "rec1", start=1.5, duration=8.5),
                SupervisionSegment("sup2", "rec1", start=10, duration=5),
                SupervisionSegment("sup3", "rec1", start=20, duration=8),
            ],
            recording=dummy_recording(1, duration=30),
        ),
        # Does not yield any "unsupervised" cut.
        MonoCut(
            "cut2",
            start=0,
            duration=30,
            channel=0,
            supervisions=[
                SupervisionSegment("sup4", "rec1", start=0, duration=30),
            ],
            recording=dummy_recording(2, duration=30),
        ),
    ])
    unsupervised_cuts = cut_set.trim_to_unsupervised_segments()

    assert len(unsupervised_cuts) == 3

    assert unsupervised_cuts[0].start == 0
    assert unsupervised_cuts[0].duration == 1.5
    assert unsupervised_cuts[0].supervisions == []

    assert unsupervised_cuts[1].start == 15
    assert unsupervised_cuts[1].duration == 5
    assert unsupervised_cuts[1].supervisions == []

    assert unsupervised_cuts[2].start == 28
    assert unsupervised_cuts[2].duration == 2
    assert unsupervised_cuts[2].supervisions == []
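The expected spans follow from complementing the supervised intervals within the 30 s cut: sup1-sup3 cover 1.5-10, 10-15 and 20-28, leaving gaps at 0-1.5, 15-20 and 28-30. A minimal gap computation, independent of Lhotse, assuming the supervisions are sorted by start:

def unsupervised_gaps(sups, cut_start, cut_duration):
    # sups: (start, duration) pairs relative to the cut.
    gaps, cursor = [], cut_start
    for start, duration in sups:
        if start > cursor:
            gaps.append((cursor, start - cursor))
        cursor = max(cursor, start + duration)
    if cursor < cut_start + cut_duration:
        gaps.append((cursor, cut_start + cut_duration - cursor))
    return gaps

assert unsupervised_gaps([(1.5, 8.5), (10, 5), (20, 8)], 0, 30) == [(0, 1.5), (15, 5), (28, 2)]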
Example #18
def cut_with_supervision_start01(recording):
    return MonoCut(
        id="cut_start01",
        start=0.1,
        duration=0.4,
        channel=0,
        supervisions=[
            SupervisionSegment(id="sup", recording_id="rec", start=0.1, duration=0.3)
        ],
        recording=recording,
    )
Example #19
def cut_with_supervision(recording):
    return MonoCut(
        id="cut",
        start=0.0,
        duration=0.5,
        channel=0,
        supervisions=[
            SupervisionSegment(id="sup", recording_id="rec", start=0.0, duration=0.5)
        ],
        recording=recording,
    )
Example #20
def cut_with_supervision(recording):
    return MonoCut(id='cut',
                   start=0.0,
                   duration=0.5,
                   channel=0,
                   supervisions=[
                       SupervisionSegment(id='sup',
                                          recording_id='rec',
                                          start=0.0,
                                          duration=0.5)
                   ],
                   recording=recording)
Example #21
def supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(id='segment-1',
                           recording_id='recording-1',
                           channel=0,
                           start=0.1,
                           duration=0.3,
                           text='transcript of the first segment',
                           language='english',
                           speaker='Norman Dyhrentfurth',
                           gender='male')
    ])
Example #22
def cut_with_supervision_start01(recording):
    return MonoCut(id='cut_start01',
                   start=0.1,
                   duration=0.4,
                   channel=0,
                   supervisions=[
                       SupervisionSegment(id='sup',
                                          recording_id='rec',
                                          start=0.1,
                                          duration=0.3)
                   ],
                   recording=recording)
Example #23
def mono_cut():
    """
    Scenario::

        |-----------------Recording-----------------|
           "Hey, Matt!"  "Yes?"
        |--------------| |-----|  "Oh, nothing"
                             |------------------|
        |-------------------Cut1--------------------|
    """
    rec = Recording(id="rec1",
                    duration=10.0,
                    sampling_rate=8000,
                    num_samples=80000,
                    sources=[...])
    sups = [
        SupervisionSegment(id="sup1",
                           recording_id="rec1",
                           start=0.0,
                           duration=3.37,
                           text="Hey, Matt!"),
        SupervisionSegment(id="sup2",
                           recording_id="rec1",
                           start=4.5,
                           duration=0.9,
                           text="Yes?"),
        SupervisionSegment(id="sup3",
                           recording_id="rec1",
                           start=4.9,
                           duration=4.3,
                           text="Oh, nothing"),
    ]
    return MonoCut(
        id="rec1-cut1",
        start=0.0,
        duration=10.0,
        channel=0,
        recording=rec,
        supervisions=sups,
    )
Example #24
def test_trim_to_supervisions_mixed_cuts():
    cut_set = CutSet.from_cuts([
        Cut('cut1',
            start=0,
            duration=30,
            channel=0,
            supervisions=[
                SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
                SupervisionSegment('sup2', 'rec1', start=10, duration=5),
                SupervisionSegment('sup3', 'rec1', start=20, duration=8),
            ]).append(
                Cut('cut2',
                    start=0,
                    duration=30,
                    channel=0,
                    supervisions=[
                        SupervisionSegment('sup4',
                                           'rec1',
                                           start=0,
                                           duration=30),
                    ]))
    ])
    cuts = cut_set.trim_to_supervisions()
    assert len(cuts) == 4
    assert all(isinstance(cut, MixedCut) for cut in cuts)
    assert all(cut.start == 0 for cut in cuts)
    assert all(len(cut.supervisions) == 1 for cut in cuts)
    assert all(cut.supervisions[0].start == 0 for cut in cuts)
    cut = cuts[0]
    assert cut.duration == 8.5
    assert cut.supervisions[0].id == 'sup1'
    cut = cuts[1]
    assert cut.duration == 5
    assert cut.supervisions[0].id == 'sup2'
    cut = cuts[2]
    assert cut.duration == 8
    assert cut.supervisions[0].id == 'sup3'
    cut = cuts[3]
    assert cut.duration == 30
    assert cut.supervisions[0].id == 'sup4'
Example #25
def cut(recording):
    return MonoCut(
        id="cut",
        start=0,
        duration=1.0,
        channel=0,
        recording=recording,
        supervisions=[
            SupervisionSegment(id="sup",
                               recording_id=recording.id,
                               start=0,
                               duration=0.5)
        ],
    )
Example #26
def test_trim_to_unsupervised_segments():
    cut_set = CutSet.from_cuts([
        # Yields 3 unsupervised cuts - before first supervision,
        # between sup2 and sup3, and after sup3.
        Cut('cut1',
            start=0,
            duration=30,
            channel=0,
            supervisions=[
                SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
                SupervisionSegment('sup2', 'rec1', start=10, duration=5),
                SupervisionSegment('sup3', 'rec1', start=20, duration=8),
            ]),
        # Does not yield any "unsupervised" cut.
        Cut('cut2',
            start=0,
            duration=30,
            channel=0,
            supervisions=[
                SupervisionSegment('sup4', 'rec1', start=0, duration=30),
            ]),
    ])
    unsupervised_cuts = cut_set.trim_to_unsupervised_segments()

    assert len(unsupervised_cuts) == 3

    assert unsupervised_cuts[0].start == 0
    assert unsupervised_cuts[0].duration == 1.5
    assert unsupervised_cuts[0].supervisions == []

    assert unsupervised_cuts[1].start == 15
    assert unsupervised_cuts[1].duration == 5
    assert unsupervised_cuts[1].supervisions == []

    assert unsupervised_cuts[2].start == 28
    assert unsupervised_cuts[2].duration == 2
    assert unsupervised_cuts[2].supervisions == []
Example #27
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)

            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            print(f"Error when processing {audio_path} - skipping...")
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
Example #28
def libri_cut_with_supervision(libri_recording_orig):
    return MonoCut(
        id="libri_cut_1",
        start=0,
        duration=libri_recording_orig.duration,
        channel=0,
        supervisions=[
            SupervisionSegment(
                id="sup",
                recording_id="rec",
                start=0,
                duration=libri_recording_orig.duration,
            )
        ],
        recording=libri_recording_orig,
    )
Example #29
def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import Cut, Features, Recording, SupervisionSegment
    from lhotse.cut import MixedCut
    data = arr2list_recursive(data)
    if 'sources' in data:
        return Recording.from_dict(data)
    if 'num_features' in data:
        return Features.from_dict(data)
    if 'type' not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop('type')
    if cut_type == 'Cut':
        return Cut.from_dict(data)
    if cut_type == 'MixedCut':
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")
Example #30
def with_cut(
    self,
    sampling_rate: int,
    num_samples: int,
    features: bool = True,
    supervision: bool = False,
    alignment: bool = False,
    custom_field: bool = False,
    frame_shift: Seconds = 0.01,
    use_zeroes: bool = False,
) -> MonoCut:
    duration = num_samples / sampling_rate
    cut = MonoCut(
        id=str(uuid4()),
        start=0,
        duration=duration,
        channel=0,
        recording=self.with_recording(
            sampling_rate=sampling_rate,
            num_samples=num_samples,
            use_zeros=use_zeroes,
        ),
    )
    if features:
        cut = self._with_features(
            cut, frame_shift=frame_shift, sampling_rate=sampling_rate
        )
    if supervision:
        cut.supervisions.append(
            SupervisionSegment(
                id=f"sup-{cut.id}",
                recording_id=cut.recording_id,
                start=0,
                duration=cut.duration,
                text="irrelevant",
                alignment=self._with_alignment(cut, "irrelevant")
                if alignment
                else None,
            )
        )
    if custom_field:
        self._with_custom_temporal_array(cut=cut, frame_shift=frame_shift)
    return cut