Example #1
def dummy_supervision_set_lazy():
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        sups = SupervisionSet.from_segments(
            [
                SupervisionSegment(
                    id="sup1",
                    recording_id="rec1",
                    start=3,
                    duration=4,
                    channel=0,
                    text="dummy text",
                ),
                SupervisionSegment(
                    id="sup2",
                    recording_id="rec1",
                    start=7,
                    duration=2,
                    channel=0,
                    text="dummy text",
                ),
            ]
        )
        sups.to_file(f.name)
        f.flush()
        yield SupervisionSet.from_jsonl_lazy(f.name)
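A hedged usage sketch for the fixture above (illustrative only; `is_lazy` and the iteration behavior are assumptions based on how lazy manifests are typically used):

for sups in dummy_supervision_set_lazy():
    # The set is lazy: segments are re-read from the temporary JSONL file on iteration.
    assert sups.is_lazy
    for segment in sups:
        print(segment.id, segment.start, segment.duration)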
Example #2
def trim_supervisions_to_recordings(
        recordings: RecordingSet,
        supervisions: SupervisionSet) -> SupervisionSet:
    """
    Return a new :class:`~lhotse.supervision.SupervisionSet` with supervisions that are
    not exceeding the duration of their corresponding :class:`~lhotse.audio.Recording`.
    """
    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))

    sups = []
    removed = 0
    trimmed = 0
    for s in supervisions:
        end = recordings[s.recording_id].duration
        if s.start > end:
            removed += 1
            continue
        if s.end > end:
            trimmed += 1
            s = s.trim(end)
        sups.append(s)
    if removed:
        logging.warning(
            f"Removed {removed} supervisions starting after the end of the recording."
        )
    if trimmed:
        logging.warning(
            f"Trimmed {trimmed} supervisions exceeding the end of the recording."
        )
    return SupervisionSet.from_segments(sups)
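A minimal usage sketch, assuming `recs` and `sups` are manifests prepared elsewhere (the names are placeholders):

fixed_sups = trim_supervisions_to_recordings(recordings=recs, supervisions=sups)
# After the call, no supervision extends past the end of its recording.
assert all(s.end <= recs[s.recording_id].duration for s in fixed_sups)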
Example #3
def search_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(id='s1', recording_id='r1', start=0, duration=5.0, channel=0),
        SupervisionSegment(id='s2', recording_id='r1', start=4.5, duration=2.0, channel=1),
        SupervisionSegment(id='s3', recording_id='r1', start=8.0, duration=3.0, channel=0),
        SupervisionSegment(id='s4', recording_id='r2', start=1, duration=5.0, channel=0),
    ])
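An illustrative query against this fixture via SupervisionSet.find (a sketch; the keyword arguments reflect the usual Lhotse signature):

sups = search_supervision_set()
# Segments of recording 'r1' on channel 0 that start after t=1.0s -- expected: 's3'.
for seg in sups.find(recording_id='r1', channel=0, start_after=1.0):
    print(seg.id)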
Example #4
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for 1997 English Broadcast News corpus.
    We create three manifests: one with recordings, one with segment supervisions,
    and one with section supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    sgml_paths = check_and_rglob(transcripts_dir, '*.sgml')

    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(
            p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths)

    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive implementation;
    # on some systems the recursion limit needs to be raised for this to work.
    with recursion_limit(5000):
        supervisions_list = [
            make_supervisions(p, r) for p, r in zip(sgml_paths, recordings)
        ]
    section_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups['sections'] for sups in supervisions_list))
    segment_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups['segments'] for sups in supervisions_list))

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        section_supervisions.to_json(output_dir / 'sections.json')
        segment_supervisions.to_json(output_dir / 'segments.json')

    return {
        'recordings': recordings,
        'sections': section_supervisions,
        'segments': segment_supervisions
    }
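A hedged usage sketch (the LDC package paths below are placeholders):

manifests = prepare_broadcast_news(
    audio_dir='/data/LDC98S71',
    transcripts_dir='/data/LDC98T28',
    output_dir='manifests/broadcast_news',
)
# Three manifests come back: recordings, section supervisions, and segment supervisions.
sections = manifests['sections']
segments = manifests['segments']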
Example #5
def prepare_switchboard(
    audio_dir: Pathlike,
    transcripts_dir: Optional[Pathlike] = None,
    sentiment_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    omit_silence: bool = True,
    absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    groups = []
    name_to_text = {p.stem.split('-')[0]: p for p in text_paths}
    for ap in audio_paths:
        name = ap.stem.replace('sw0', 'sw')
        groups.append({
            'audio': ap,
            'text-0': name_to_text[f'{name}A'],
            'text-1': name_to_text[f'{name}B']
        })

    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(
            group['audio'], relative_path_depth=None if absolute_paths else 3)
        for group in groups)
    supervisions = SupervisionSet.from_segments(
        chain.from_iterable(
            make_segments(transcript_path=group[f'text-{channel}'],
                          recording=recording,
                          channel=channel,
                          omit_silence=omit_silence)
            for group, recording in zip(groups, recordings)
            for channel in [0, 1]))

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {'recordings': recordings, 'supervisions': supervisions}
Example #6
def dummy_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(id='sup1',
                           recording_id='rec1',
                           start=3,
                           duration=4,
                           channel=0,
                           text='dummy text')
    ])
Example #7
def DummyManifest(type_: Type, *, begin_id: int, end_id: int) -> Manifest:
    if type_ == RecordingSet:
        return RecordingSet.from_recordings(
            dummy_recording(idx) for idx in range(begin_id, end_id))
    if type_ == SupervisionSet:
        return SupervisionSet.from_segments(
            dummy_supervision(idx) for idx in range(begin_id, end_id))
    if type_ == FeatureSet:
        # noinspection PyTypeChecker
        return FeatureSet.from_features(
            dummy_features(idx) for idx in range(begin_id, end_id))
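For instance, a test might build a ten-segment dummy manifest like this (a sketch; the dummy_* helpers are assumed to come from the same test utilities):

sups = DummyManifest(SupervisionSet, begin_id=0, end_id=10)
assert len(sups) == 10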
Example #8
def dummy_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(
            id="sup1",
            recording_id="rec1",
            start=3,
            duration=4,
            channel=0,
            text="dummy text",
        )
    ])
Example #9
def prepare_ljspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: A dict with the RecordingSet and SupervisionSet under the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    with open(metadata_csv_path) as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
Example #10
def prepare_yesno(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    wave_files = list(corpus_dir.glob("*.wav"))
    assert len(wave_files) == 60

    wave_files.sort()
    train_set = wave_files[::2]
    test_set = wave_files[1::2]

    assert len(train_set) == 30
    assert len(test_set) == 30

    manifests = defaultdict(dict)
    for name, dataset in zip(["train", "test"], [train_set, test_set]):
        recordings, supervisions = _prepare_dataset(dataset)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{name}.json")
            recording_set.to_json(output_dir / f"recordings_{name}.json")

        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
Example #11
def test_known_issue_with_overlap():
    r = dummy_recording(0)
    rec = RecordingSet.from_recordings([r])

    # Make two segments. The first segment is 1 s long. The second segment
    # is 0.5 s long and lies entirely within the first. Both have the
    # same recording_id as the single entry in rec.
    sup = SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id="utt1",
                recording_id=r.id,
                start=0.0,
                duration=1.0,
                channel=0,
                text="Hello",
            ),
            SupervisionSegment(
                id="utt2",
                recording_id=r.id,
                start=0.2,
                duration=0.5,
                channel=0,
                text="World",
            ),
        ]
    )

    cuts = CutSet.from_manifests(recordings=rec, supervisions=sup)
    assert len(cuts) == 1

    cuts_trim = cuts.trim_to_supervisions(keep_overlapping=False)
    assert len(cuts_trim) == 2

    cut = cuts_trim[0]
    assert cut.start == 0
    assert cut.duration == 1
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 1
    assert sup.text == "Hello"

    cut = cuts_trim[1]
    assert cut.start == 0.2
    assert cut.duration == 0.5
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 0.5
    assert sup.text == "World"
Example #12
def make_corresponding_supervisions(audio: RecordingSet) -> SupervisionSet:
    """
    Prepare a supervision set - in this case it just describes
    which segments are available in the corpus, as the actual supervisions for
    speech separation come from the source recordings.
    """
    return SupervisionSet.from_segments(
        SupervisionSegment(
            id=f"{recording.id}-c{source.channels[0]}",
            recording_id=recording.id,
            start=0.0,
            duration=recording.duration,
            channel=source.channels[0],
        ) for recording in audio for source in recording.sources)
Example #13
def prepare_gigaspeech(
        gigaspeech: Any,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    if is_module_available('speechcolab'):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')

    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts, output_dir=output_dir, suffix='jsonl')
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in subsets:
            futures = []
            for audio in tqdm(gigaspeech.audios(part), desc='Distributing tasks', leave=False):
                futures.append(ex.submit(parse_utterance, audio, gigaspeech.root_path))

            recordings = []
            supervisions = []
            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segments = result
                recordings.append(recording)
                supervisions += segments

            manifests[part] = {
                'recordings': RecordingSet.from_recordings(recordings),
                'supervisions': SupervisionSet.from_segments(supervisions)
            }

            if output_dir is not None:
                manifests[part]['recordings'].to_file(output_dir / f'recordings_{part}.jsonl')
                manifests[part]['supervisions'].to_file(output_dir / f'supervisions_{part}.jsonl')

    return dict(manifests)
Example #14
def prepare_supervision_ihm(
    audio: RecordingSet,
    annotations: Dict[str, List[IcsiSegmentAnnotation]],
    channel_to_idx_map: Dict[str, Dict[str, int]],
) -> SupervisionSet:
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {
        (key[0], channel_to_idx_map[key[0]][key[2]]): annotations[key]
        for key in annotations
    }

    segments = []
    for recording in tqdm(audio, desc="Preparing supervision"):
        # IHM can have multiple audio sources for each recording
        for source in recording.sources:
            # For each source, "channels" will always be a one-element list
            (channel, ) = source.channels
            annotation = annotation_by_id_and_channel.get(
                (recording.id, channel))

            if annotation is None:
                continue

            for seg_idx, seg_info in enumerate(annotation):
                duration = seg_info.end_time - seg_info.start_time
                # Some annotations in IHM setting exceed audio duration, so we
                # ignore such segments
                if seg_info.end_time > recording.duration:
                    logging.warning(
                        f"Segment {recording.id}-{channel}-{seg_idx} exceeds "
                        f"recording duration. Not adding to supervisions.")
                    continue
                if duration > 0:
                    segments.append(
                        SupervisionSegment(
                            id=f"{recording.id}-{channel}-{seg_idx}",
                            recording_id=recording.id,
                            start=seg_info.start_time,
                            duration=duration,
                            channel=channel,
                            language="English",
                            speaker=seg_info.speaker,
                            gender=seg_info.gender,
                            text=seg_info.text,
                        ))

    return SupervisionSet.from_segments(segments)
Example #15
def prepare_supervision_ihm(
        audio: RecordingSet,
        annotations: Dict[str, List[AmiSegmentAnnotation]]) -> SupervisionSet:
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {(key[0], key[2]): annotations[key]
                                    for key in annotations}

    segments = []
    for recording in audio:
        # AMI IHM can have multiple audio sources for each recording
        for source in recording.sources:
            # For each source, "channels" will always be a one-element list
            channel, = source.channels
            annotation = annotation_by_id_and_channel.get(
                (recording.id, channel))
            if annotation is None:
                logging.warning(
                    f'No annotation found for recording {recording.id} '
                    f'(file {source.source})')
                continue

            for seg_idx, seg_info in enumerate(annotation):
                duration = seg_info.end_time - seg_info.begin_time
                # Some annotations in IHM setting exceed audio duration, so we
                # ignore such segments
                if seg_info.end_time > recording.duration:
                    logging.warning(
                        f'Segment {recording.id}-{channel}-{seg_idx} exceeds '
                        f'recording duration. Not adding to supervisions.')
                    continue
                if duration > 0:
                    segments.append(
                        SupervisionSegment(
                            id=f'{recording.id}-{channel}-{seg_idx}',
                            recording_id=recording.id,
                            start=seg_info.begin_time,
                            duration=duration,
                            channel=channel,
                            language='English',
                            speaker=seg_info.speaker,
                            gender=seg_info.gender,
                            text=seg_info.text))

    return SupervisionSet.from_segments(segments)
Example #16
def test_supervision_set_serialization():
    supervision_set = SupervisionSet.from_segments([
        SupervisionSegment(
            id='segment-1',
            recording_id='recording-1',
            channel=0,
            start=0.1,
            duration=0.3,
            text='transcript of the first segment',
            language='english',
            speaker='Norman Dyhrentfurth',
            gender='male'
        )
    ])
    with NamedTemporaryFile() as f:
        supervision_set.to_yaml(f.name)
        restored = SupervisionSet.from_yaml(f.name)
    assert supervision_set == restored
Example #17
def validate_recordings_and_supervisions(
    recordings: Union[RecordingSet, Recording],
    supervisions: Union[SupervisionSet, SupervisionSegment],
    read_data: bool = False,
) -> None:
    """
    Validate the recording and supervision manifests separately,
    and then check if they are consistent with each other.

    This method will emit warnings, instead of errors, when some recordings or supervisions
    are missing their counterparts.
    These items will be discarded by default when creating a CutSet.
    """
    if isinstance(recordings, Recording):
        recordings = RecordingSet.from_recordings([recordings])
    if isinstance(supervisions, SupervisionSegment):
        supervisions = SupervisionSet.from_segments([supervisions])
    validate(recordings, read_data=read_data)
    validate(supervisions)
    # Errors
    for s in supervisions:
        r = recordings[s.recording_id]
        assert -1e-3 <= s.start <= s.end <= r.duration + 1e-3, (
            f"Supervision {s.id}: exceeded the bounds of its corresponding recording "
            f"(supervision spans [{s.start}, {s.end}]; recording spans [0, {r.duration}])"
        )
        assert s.channel in r.channel_ids, (
            f"Supervision {s.id}: channel {s.channel} does not exist in its corresponding Recording "
            f"(recording channels: {r.channel_ids})")
    # Warnings
    recording_ids = frozenset(r.id for r in recordings)
    recording_ids_in_sups = frozenset(s.recording_id for s in supervisions)
    only_in_recordings = recording_ids - recording_ids_in_sups
    if only_in_recordings:
        logging.warning(
            f"There are {len(only_in_recordings)} recordings that "
            f"do not have any corresponding supervisions in the SupervisionSet."
        )
    only_in_supervisions = recording_ids_in_sups - recording_ids
    if only_in_supervisions:
        logging.warning(
            f"There are {len(only_in_supervisions)} supervisions that "
            f"are missing their corresponding recordings in the RecordingSet.")
Example #18
def test_supervision_set_serialization(format, compressed):
    supervision_set = SupervisionSet.from_segments([
        SupervisionSegment(id='segment-1',
                           recording_id='recording-1',
                           channel=0,
                           start=0.1,
                           duration=0.3,
                           text='transcript of the first segment',
                           language='english',
                           speaker='Norman Dyhrentfurth',
                           gender='male')
    ])
    with NamedTemporaryFile(suffix='.gz' if compressed else '') as f:
        if format == 'yaml':
            supervision_set.to_yaml(f.name)
            restored = SupervisionSet.from_yaml(f.name)
        elif format == 'json':
            supervision_set.to_json(f.name)
            restored = SupervisionSet.from_json(f.name)
    assert supervision_set == restored
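The same round-trip can be sketched for the JSONL format (an assumption: to_jsonl/from_jsonl mirror the YAML/JSON methods used above):

with NamedTemporaryFile(suffix='.jsonl') as f:
    supervision_set.to_jsonl(f.name)
    restored = SupervisionSet.from_jsonl(f.name)
assert supervision_set == restored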
Example #19
def __init__(
    self,
    cuts: CutSet,
    uem: Optional[SupervisionSet] = None,
    min_speaker_dim: Optional[int] = None,
    global_speaker_ids: bool = False,
) -> None:
    super().__init__()
    validate(cuts)
    if not uem:
        self.cuts = cuts
    else:
        # We use the `overlap` method in intervaltree to get overlapping regions
        # between the supervision segments and the UEM segments
        recordings = RecordingSet(
            {c.recording.id: c.recording
             for c in cuts if c.has_recording})
        uem_intervals = CutSet.from_manifests(
            recordings=recordings,
            supervisions=uem,
        ).index_supervisions()
        supervisions = []
        for cut_id, tree in cuts.index_supervisions().items():
            if cut_id not in uem_intervals:
                supervisions += [it.data for it in tree]
                continue
            supervisions += {
                it.data.trim(it.end, start=it.begin)
                for uem_it in uem_intervals[cut_id]
                for it in tree.overlap(begin=uem_it.begin, end=uem_it.end)
            }
        self.cuts = CutSet.from_manifests(
            recordings=recordings,
            supervisions=SupervisionSet.from_segments(supervisions),
        )
    self.speakers = ({
        spk: idx
        for idx, spk in enumerate(self.cuts.speakers)
    } if global_speaker_ids else None)
    self.min_speaker_dim = min_speaker_dim
Example #20
def prepare_supervision_other(
        audio: RecordingSet,
        annotations: Dict[str, List[AmiSegmentAnnotation]]) -> SupervisionSet:
    annotation_by_id = defaultdict(list)
    for key, value in annotations.items():
        annotation_by_id[key[0]].extend(value)

    segments = []
    for recording in tqdm(audio, desc="Preparing supervisions"):
        annotation = annotation_by_id.get(recording.id)
        # In these mic settings, all sources (1 for ihm-mix and sdm and 16 for mdm)
        # will share supervision.
        if annotation is None:
            logging.warning(
                f"No annotation found for recording {recording.id}")
            continue

        if any(len(source.channels) > 1 for source in recording.sources):
            logging.warning(
                f"More than 1 channels in recording {recording.id}. "
                f"Skipping this recording.")
            continue

        for seg_idx, seg_info in enumerate(annotation):
            duration = seg_info.end_time - seg_info.start_time
            if duration > 0:
                segments.append(
                    SupervisionSegment(
                        id=f"{recording.id}-{seg_idx}",
                        recording_id=recording.id,
                        start=seg_info.start_time,
                        duration=duration,
                        channel=0,
                        language="English",
                        speaker=seg_info.speaker,
                        gender=seg_info.gender,
                        text=seg_info.text,
                    ))
    return SupervisionSet.from_segments(segments)
Example #21
def prepare_supervision_other(
    audio: RecordingSet,
    annotations: Dict[str, List[AmiSegmentAnnotation]]
) -> SupervisionSet:
    annotation_by_id = {
        key[0]: annot
        for key, annot in annotations.items()
    }

    segments = []
    for recording in audio:
        annotation = annotation_by_id.get(recording.id)
        # In these mic settings, all sources (1 for ihm-mix and sdm and 16 for mdm)
        # will share supervision.
        source = recording.sources[0]
        if annotation is None:
            logging.warning(f'No annotation found for recording {recording.id}')
            continue
        
        if len(source.channels) > 1:
            logging.warning(f'More than one channel in recording {recording.id}. '
                            f'Creating supervision for channel 0 only.')

        for seg_idx, seg_info in enumerate(annotation):
            duration = seg_info.end_time - seg_info.begin_time
            if duration > 0:
                segments.append(SupervisionSegment(
                    id=f'{recording.id}-{seg_idx}',
                    recording_id=recording.id,
                    start=seg_info.begin_time,
                    duration=duration,
                    channel=0,
                    language='English',
                    speaker=seg_info.speaker,
                    gender=seg_info.gender,
                    text=seg_info.text
                ))
    return SupervisionSet.from_segments(segments)
Example #22
def to_manifest(items: Iterable[ManifestItem]) -> Optional[Manifest]:
    """
    Take an iterable of data types in Lhotse such as Recording, SupervisonSegment or Cut, and create the manifest of the
    corresponding type. When the iterable is empty, returns None.
    """
    items = iter(items)
    try:
        first_item = next(items)
    except StopIteration:
        return None
    items = chain([first_item], items)

    if isinstance(first_item, Recording):
        return RecordingSet.from_recordings(items)
    if isinstance(first_item, SupervisionSegment):
        return SupervisionSet.from_segments(items)
    if isinstance(first_item, (Cut, MixedCut)):
        return CutSet.from_cuts(items)
    if isinstance(first_item, Features):
        raise ValueError("FeatureSet generic construction from iterable is not possible, as the config information "
                         "would have been lost. Call FeatureSet.from_features() directly instead.")

    raise ValueError(f"Unknown type of manifest item: {first_item}")
Example #23
def search_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(id="s1",
                           recording_id="r1",
                           start=0,
                           duration=5.0,
                           channel=0),
        SupervisionSegment(id="s2",
                           recording_id="r1",
                           start=4.5,
                           duration=2.0,
                           channel=1),
        SupervisionSegment(id="s3",
                           recording_id="r1",
                           start=8.0,
                           duration=3.0,
                           channel=0),
        SupervisionSegment(id="s4",
                           recording_id="r2",
                           start=1,
                           duration=5.0,
                           channel=0),
    ])
Example #24
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might not be handled yet.
    In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=int(durations[recording_id] * sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(id=segment_id,
                               recording_id=recording_id,
                               start=float(start),
                               duration=float(end) - float(start),
                               channel=0,
                               text=texts[segment_id],
                               language=languages[segment_id],
                               speaker=speakers[segment_id],
                               gender=genders[speakers[segment_id]])
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(type='kaldiio',
                         num_frames=mat.shape[0],
                         num_features=mat.shape[1],
                         frame_shift=frame_shift,
                         sampling_rate=sampling_rate,
                         start=0,
                         duration=mat.shape[0] * frame_shift,
                         storage_type=KaldiReader.name,
                         storage_path=str(feats_scp),
                         storage_key=utt_id,
                         recording_id=supervision_set[utt_id].recording_id
                         if supervision_set is not None else utt_id,
                         channels=0)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp)))
        else:
            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
                          f"frame_shift must be not None. "
                          f"Feature import omitted.")

    return recording_set, supervision_set, feature_set
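A hedged usage sketch (the Kaldi data directory path is a placeholder):

recordings, supervisions, features = load_kaldi_data_dir(
    'data/train', sampling_rate=16000, frame_shift=0.01)
if supervisions is not None:  # only built when a 'segments' file exists
    print(f'Loaded {len(supervisions)} supervision segments')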
Example #25
def prepare_librispeech(
        corpus_dir: Pathlike,
        dataset_parts: Optional[Tuple[str, ...]] = dataset_parts_mini,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace('-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=metadata[idx].audio_info.length / metadata[idx].audio_info.rate
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='English',
                speaker=re.sub(r'-.*', r'', idx),
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
Example #26
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of the dataset to prepare; "all" prepares
                          every part.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: Number of workers used to extract manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
             the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts

    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech : {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}

    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file : {raw_manifests_path}"
    logging.info(f"Loading raw manifests from : {raw_manifests_path}")
    with open(raw_manifests_path, "r", encoding="utf8") as f:
        raw_manifests = json.load(f)

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
                ex.map(
                    parse_utterance,
                    raw_manifests["audios"],
                    repeat(corpus_dir),
                    repeat(subsets),
                ),
                desc="Processing WenetSpeech JSON entries",
        ):
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(
                manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(
                manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(recordings=recordings,
                                             supervisions=supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir / f"supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir / f"recordings_{sub}.jsonl.gz")

        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
Example #27
def prepare_cmu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CMU Kids corpus. The prepared supervisions contain the
    prompt text as the `text`. Additionally, in the `custom` tag, we provide the
    following data: speaker grade/age, population where the speaker came from
    (SIM95/FP), spoken transcript, and transcription bin (1/2).

    Here, bin `1` means utterances where the speaker followed the prompt and no
    noise/mispronunciation is present, and `2` refers to noisy utterances.

    The tag `spoken_transcript` is the transcription that was actually spoken. It
    contains noise tags and phone transcription in case the pronunciation differed
    from that in CMU Dict.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True).
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir)
    corpus_dir = corpus_dir.parent if corpus_dir.stem == "cmu_kids" else corpus_dir

    recordings = []
    supervisions = []

    # Get transcripts for all utterances
    utterances = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'sentence.tbl', 'r') as f:
        for line in f:
            utt, count, text = line.strip().split('\t')
            utterances[utt] = text

    # Get speaker metadata
    speaker_info = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'speaker.tbl', 'r') as f:
        for _ in range(2):
            next(f)
        for line in f:
            # ID    LOC     GR/AGE  TOT     BIN2
            # fabm    SUM95   3/9     100     62
            # facs    SUM95   2/8     90      55
            spk, pop, gr_age, _, _ = line.strip().split('\t')
            grade, age = gr_age.split('/')
            speaker_info[spk] = (pop, grade, age)

    # Iterate through all transcriptions and add to supervisions
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'transcrp.tbl', 'r') as f:
        for line in f:
            trn_id, transcript = line.strip().split(maxsplit=1)
            spk = trn_id[0:4]
            utt = trn_id[4:7]
            bin = int(trn_id[7])
            pop, grade, age = speaker_info[spk]

            audio_path = (corpus_dir / 'cmu_kids' / 'kids' / spk / 'signal' /
                          f'{trn_id}.sph')
            recording = Recording.from_file(
                audio_path, relative_path_depth=None if absolute_paths else 3)
            recordings.append(recording)

            supervisions.append(
                SupervisionSegment(
                    id=trn_id,
                    recording_id=trn_id,
                    start=0,
                    duration=recording.duration,
                    speaker=spk,
                    gender="Male" if spk[0] == 'm' else "Female",
                    language='English',
                    text=utterances[utt],
                    custom={
                        'speaker_grade': grade if grade != "NA" else None,
                        'speaker_age': int(age) if age != "NA" else None,
                        'speaker_population': pop,
                        'bin': bin,
                        'spoken_transcript': transcript,
                    },
                ))

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / 'recordings.json')
        manifests["supervisions"].to_json(output_dir / 'supervisions.json')

    return manifests
Example #28
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*"))
    elif dataset_parts == "auto":
        dataset_parts = (set(LIBRISPEECH).union(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")))
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir)

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(
                    f"LibriSpeech subset: {part} already prepared - skipping.")
                continue
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(part_path.rglob("*.trans.txt"),
                                   desc="Distributing tasks",
                                   leave=False):
                alignments = {}
                ali_path = trans_path.parent / (trans_path.stem.split(".")[0] +
                                                ".alignment.txt")
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line,
                                      alignments))

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set,
                                                 supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir /
                                        f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
Example #29
def prepare_dihard3(
    dev_audio_dir: Pathlike,
    eval_audio_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    uem_manifest: Optional[bool] = True,
    num_jobs: Optional[int] = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the DIHARD III corpus.
    We create two manifests: one with recordings, and the other one with supervisions containing speaker id
    and timestamps.

    :param dev_audio_dir: Path to downloaded DIHARD III dev corpus (LDC2020E12), e.g.
        /data/corpora/LDC/LDC2020E12
    :param eval_audio_dir: Path to downloaded DIHARD III eval corpus (LDC2021E02), e.g.
        /data/corpora/LDC/LDC2021E02`
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param uem_manifest: If True, also return a SupervisionSet describing the UEM segments (see use in
        dataset.DiarizationDataset)
    :param num_jobs: int (default = 1), number of jobs to scan corpus directory for recordings
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    manifests = defaultdict(dict)
    for part in tqdm(["dev", "eval"], desc="Preparing DIHARD parts"):
        audio_dir = dev_audio_dir if part == "dev" else eval_audio_dir
        if audio_dir is None or not Path(audio_dir).exists():
            logging.warning(f"Nothing to be done for {part}")
            continue
        rttm_paths = list(check_and_rglob(audio_dir, "*.rttm"))
        uem_paths = list(check_and_rglob(audio_dir, "*.uem"))

        recordings = RecordingSet.from_dir(audio_dir,
                                           "*.flac",
                                           num_jobs=num_jobs)

        # Read metadata for recordings
        metadata = parse_metadata(
            list(check_and_rglob(audio_dir, "recordings.tbl"))[0])

        supervisions = SupervisionSet.from_segments(
            chain.from_iterable(
                make_rttm_segments(
                    rttm_path=[
                        x for x in rttm_paths if x.stem == recording.id
                    ][0],
                    recording=recording,
                    metadata=metadata[recording.id],
                ) for recording in recordings))
        if uem_manifest:
            uem = SupervisionSet.from_segments(
                chain.from_iterable(
                    make_uem_segments(
                        uem_path=[
                            x for x in uem_paths if x.stem == recording.id
                        ][0],
                        recording=recording,
                    ) for recording in recordings))

        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{part}.json")
            supervisions.to_json(output_dir / f"supervisions_{part}.json")
            if uem_manifest:
                uem.to_json(output_dir / f"uem_{part}.json")
        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions
        }
        if uem_manifest:
            manifests[part].update({"uem": uem})
    return manifests
Example #30
def prepare_aishell(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt'
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        for line in f:
            idx_transcript = line.split()
            transcript_dict[idx_transcript[0]] = ' '.join(idx_transcript[1:])
    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        wav_path = corpus_dir / 'data_aishell' / 'wav' / f'{part}'
        for audio_path in wav_path.rglob('**/*.wav'):
            idx = audio_path.stem
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f'No transcript: {idx}')
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f'No such file: {audio_path}')
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(id=idx,
                                         recording_id=idx,
                                         start=0.0,
                                         duration=recording.duration,
                                         channel=0,
                                         language='Chinese',
                                         speaker=speaker,
                                         text=text.strip())
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    return manifests