Exemple #1
0
def dummy_supervision_set_lazy():
    """
    Generator yielding a lazily-opened ``SupervisionSet`` with two dummy segments.

    Both segments belong to recording ``rec1`` (channel 0). They are written to a
    temporary ``.jsonl.gz`` file which is then re-opened with
    ``SupervisionSet.from_jsonl_lazy``. The ``yield`` happens inside the
    temporary-file context, so the file is only released once the generator
    is resumed or closed.
    """
    first = SupervisionSegment(
        id="sup1",
        recording_id="rec1",
        start=3,
        duration=4,
        channel=0,
        text="dummy text",
    )
    second = SupervisionSegment(
        id="sup2",
        recording_id="rec1",
        start=7,
        duration=2,
        channel=0,
        text="dummy text",
    )
    with NamedTemporaryFile(suffix=".jsonl.gz") as tmp:
        eager_set = SupervisionSet.from_segments([first, second])
        eager_set.to_file(tmp.name)
        tmp.flush()
        yield SupervisionSet.from_jsonl_lazy(tmp.name)
Exemple #2
0
def prepare_single_commonvoice_tsv(
    lang: str,
    part: str,
    output_dir: Pathlike,
    lang_path: Pathlike,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Build recording and supervision manifests for one CommonVoice split.

    The split is described by ``<lang_path>/<part>.tsv``. Every row is converted with
    ``parse_utterance`` and streamed into two ``.jsonl.gz`` manifests in ``output_dir``.
    Rows for which parsing, validation or writing raises are logged and skipped.

    :param lang: string language code (e.g., "en").
    :param part: which split to prepare (e.g., "train", "validated", etc.).
    :param output_dir: path to directory where we will store the manifests.
    :param lang_path: path to a CommonVoice directory for a specific language
        (e.g., "/path/to/cv-corpus-7.0-2021-07-21/pl").
    :return: a tuple of (RecordingSet, SupervisionSet) objects opened in lazy mode,
        as CommonVoice manifests may be fairly large in memory.
    :raises ValueError: when ``pandas`` is not available.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")
    import pandas as pd

    lang_path = Path(lang_path)
    output_dir = Path(output_dir)

    # Load the split's metadata table.
    metadata = pd.read_csv(lang_path / f"{part}.tsv", sep="\t")

    recordings_path = output_dir / f"cv_recordings_{lang}_{part}.jsonl.gz"
    supervisions_path = output_dir / f"cv_supervisions_{lang}_{part}.jsonl.gz"

    with RecordingSet.open_writer(
            recordings_path, overwrite=False,
    ) as recs_writer, SupervisionSet.open_writer(
            supervisions_path, overwrite=False,
    ) as sups_writer:
        progress = tqdm(
            metadata.iterrows(),
            desc="Processing audio files",
            total=len(metadata),
        )
        for idx, row in progress:
            try:
                parsed = parse_utterance(row, lang_path, lang)
                if parsed is not None:
                    recording, segment = parsed
                    validate_recordings_and_supervisions(recording, segment)
                    recs_writer.write(recording)
                    sups_writer.write(segment)
            except Exception as e:
                # Best-effort processing: a single bad row must not abort the whole split.
                logging.error(
                    f"Error when processing TSV file: line no. {idx}: '{row}'.\n"
                    f"Original error type: '{type(e)}' and message: {e}")

    # Re-open what was just written, without loading it into memory.
    return (
        RecordingSet.from_jsonl_lazy(recs_writer.path),
        SupervisionSet.from_jsonl_lazy(sups_writer.path),
    )
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for 1997 English Broadcast News corpus.
    Three manifests are created: recordings, segment-level supervisions,
    and section-level supervisions (the latter can be used e.g. for topic segmentation).

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: When ``False`` (default), recordings are created with
        ``relative_path_depth=3``; when ``True``, with ``relative_path_depth=None``.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    path_depth = None if absolute_paths else 3

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    sgml_paths = check_and_rglob(transcripts_dir, "*.sgml")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(audio_path, relative_path_depth=path_depth)
        for audio_path in audio_paths
    )

    # BeautifulSoup renders tags to strings recursively, which is quite inefficient;
    # on some systems the recursion limit needs to be raised for this to work.
    per_file_supervisions = []
    with recursion_limit(5000):
        for sgml_path, recording in zip(sgml_paths, recordings):
            per_file_supervisions.append(make_supervisions(sgml_path, recording))

    def gather(kind: str) -> SupervisionSet:
        # Flatten the per-file lists stored under ``kind`` into a single SupervisionSet.
        return SupervisionSet.from_segments(
            chain.from_iterable(item[kind] for item in per_file_supervisions)
        )

    section_supervisions = gather("sections")
    segment_supervisions = gather("segments")

    validate_recordings_and_supervisions(recordings, segment_supervisions)

    manifests = {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # The file names follow the manifest keys: recordings, sections, segments.
        for key, manifest in manifests.items():
            manifest.to_file(output_dir / f"broadcast-news_{key}_all.jsonl.gz")

    return manifests
Exemple #4
0
def validate_recordings_and_supervisions(
    recordings: Union[RecordingSet, Recording],
    supervisions: Union[SupervisionSet, SupervisionSegment],
    read_data: bool = False,
) -> None:
    """
    Validate the recording and supervision manifests separately,
    and then check if they are consistent with each other.

    This method will emit warnings, instead of errors, when some recordings or supervisions
    are missing their counterparts.
    These items will be discarded by default when creating a CutSet.

    :param recordings: a :class:`RecordingSet`, or a single :class:`Recording`
        (which is wrapped into a one-element set).
    :param supervisions: a :class:`SupervisionSet`, or a single :class:`SupervisionSegment`
        (which is wrapped into a one-element set).
    :param read_data: forwarded to ``validate()`` when validating the recordings.
    :raises AssertionError: when a supervision exceeds the time bounds of its recording,
        or refers to a channel that its recording does not have.
    """
    if isinstance(recordings, Recording):
        recordings = RecordingSet.from_recordings([recordings])
    if isinstance(supervisions, SupervisionSegment):
        supervisions = SupervisionSet.from_segments([supervisions])

    # Lazy manifests are re-built from their items, since they are
    # iterated several times and indexed by ID below.
    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))
    if supervisions.is_lazy:
        supervisions = SupervisionSet.from_segments(iter(supervisions))

    validate(recordings, read_data=read_data)
    validate(supervisions)

    recording_ids = frozenset(r.id for r in recordings)

    # Errors
    for s in supervisions:
        if s.recording_id not in recording_ids:
            # A supervision without a recording is reported as a warning below;
            # looking the recording up here would fail before that warning is emitted.
            continue
        r = recordings[s.recording_id]
        assert -1e-3 <= s.start <= s.end <= r.duration + 1e-3, (
            f"Supervision {s.id}: exceeded the bounds of its corresponding recording "
            f"(supervision spans [{s.start}, {s.end}]; recording spans [0, {r.duration}])"
        )
        assert s.channel in r.channel_ids, (
            f"Supervision {s.id}: channel {s.channel} does not exist in its corresponding Recording "
            f"(recording channels: {r.channel_ids})"
        )
    # Warnings
    recording_ids_in_sups = frozenset(s.recording_id for s in supervisions)
    only_in_recordings = recording_ids - recording_ids_in_sups
    if only_in_recordings:
        logging.warning(
            f"There are {len(only_in_recordings)} recordings that "
            f"do not have any corresponding supervisions in the SupervisionSet."
        )
    only_in_supervisions = recording_ids_in_sups - recording_ids
    if only_in_supervisions:
        logging.warning(
            f"There are {len(only_in_supervisions)} supervisions that "
            f"are missing their corresponding recordings in the RecordingSet."
        )
Exemple #5
0
def validate_supervision_set(supervisions: SupervisionSet, **kwargs) -> None:
    """
    Validate every segment of ``supervisions`` and warn about suspicious start times.

    Each segment is passed to ``validate_supervision``. Afterwards a warning is logged
    for every (recording, channel) pair that has more than one supervision starting at 0,
    which typically indicates that the start times were not set during data preparation.
    Counting is done per channel, so that a multi-channel recording with one
    supervision starting at 0 in each channel does not trigger the warning.

    :param supervisions: the :class:`SupervisionSet` to validate.
    :param kwargs: accepted but not used by this function.
    """
    for s in supervisions:
        validate_supervision(s)

    # Catch errors in data preparation:
    # - more than one supervision for a given recording starts at 0 (in a given channel)
    supervisions._index_by_recording_id_and_cache()
    for rid, sups in supervisions._segments_by_recording_id.items():
        zero_starts_per_channel = {}
        for s in sups:
            if s.start == 0:
                zero_starts_per_channel[s.channel] = (
                    zero_starts_per_channel.get(s.channel, 0) + 1
                )
        for channel, count in zero_starts_per_channel.items():
            if count > 1:
                logging.warning(
                    f"SupervisionSet contains {count} supervisions that start at 0 for recording {rid} "
                    f"(channel {channel}). Did you forget to set supervision start times?"
                )
Exemple #6
0
def prepare_supervision_other(
        audio: RecordingSet,
        annotations: Dict[str, List[AmiSegmentAnnotation]]) -> SupervisionSet:
    """
    Create supervisions for recordings whose sources all share the same supervision.

    Annotation lists are grouped by the first element of their key, and that element
    is matched against ``recording.id``. Lists whose keys share the same first element
    are concatenated, so that none of them is dropped.

    :param audio: the recordings to create supervisions for.
    :param annotations: annotation lists; only ``key[0]`` of each key is used here.
        NOTE(review): the keys are indexed, so they are presumably tuples rather than
        plain strings as the annotation suggests -- confirm against the caller.
    :return: a :class:`SupervisionSet` with one segment (on channel 0) for each
        annotation of positive duration.
    """
    # Merge rather than overwrite: several keys may map to the same recording ID.
    annotation_by_id = {}
    for key, annots in annotations.items():
        annotation_by_id.setdefault(key[0], []).extend(annots)

    segments = []
    for recording in audio:
        annotation = annotation_by_id.get(recording.id)
        if annotation is None:
            logging.warning(
                f'No annotation found for recording {recording.id}')
            continue

        # In these mic settings, all sources (1 for ihm-mix and sdm and 16 for mdm)
        # will share supervision.
        source = recording.sources[0]
        if len(source.channels) > 1:
            logging.warning(
                f'More than 1 channels in recording {recording.id}. '
                f'Creating supervision for channel 0 only.')

        for seg_idx, seg_info in enumerate(annotation):
            duration = seg_info.end_time - seg_info.begin_time
            # Annotations with a non-positive duration are skipped.
            if duration > 0:
                segments.append(
                    SupervisionSegment(id=f'{recording.id}-{seg_idx}',
                                       recording_id=recording.id,
                                       start=seg_info.begin_time,
                                       duration=duration,
                                       channel=0,
                                       language='English',
                                       speaker=seg_info.speaker,
                                       gender=seg_info.gender,
                                       text=seg_info.text))
    return SupervisionSet.from_segments(segments)
Exemple #7
0
def parse_and_add_sentiment_labels(sentiment_dir: Pathlike, supervisions: SupervisionSet):
    """
    Read 'LDC2020T14' sentiment annotations and add them to the supervision segments.

    The annotations are read from ``<sentiment_dir>/data/sentiment_labels.tsv``.
    Matching segments are modified in place: their ``custom`` field is replaced with
    a dict of the form ``{'sentiment0': ..., 'sentiment1': ..., ...}``.

    :param sentiment_dir: path to the sentiment annotation package.
    :param supervisions: the supervisions to annotate; its ``find()`` method is used
        to select the segments that fall into each annotated time span.
    """
    import pandas as pd

    # Sanity checks
    sentiment_dir = Path(sentiment_dir)
    tsv_path = sentiment_dir / 'data' / 'sentiment_labels.tsv'
    assert sentiment_dir.is_dir() and tsv_path.is_file()

    # Read the TSV as a dataframe
    annotations = pd.read_csv(tsv_path, delimiter='\t', names=['id', 'start', 'end', 'sentiment'])

    # The segments in LDC2020T14 are matched with the ones already parsed from
    # ISIP transcripts, by looking which of the existing segments fall into
    # a sentiment-annotated time span. When doing it this way, we use
    # 51773 out of 52293 available sentiment annotations, which should be okay.
    for _, annotation in annotations.iterrows():
        # The recording ID is the part of the annotation ID before the first underscore.
        call_id = annotation['id'].partition('_')[0]
        matching_segments = list(supervisions.find(
            recording_id=call_id,
            start_after=annotation['start'] - 1e-2,
            end_before=annotation['end'] + 1e-2,
        ))
        if not matching_segments:
            continue
        # There were multiple annotators, whose labels are separated with '#'.
        # All available labels are added and it is up to the user to disambiguate them.
        sentiments = annotation['sentiment'].split('#')
        # The segments are expected to be references to the ones stored in the
        # SupervisionSet, so they are modified directly, through the "custom" field.
        for segment in matching_segments:
            segment.custom = {f'sentiment{i}': label for i, label in enumerate(sentiments)}
Exemple #8
0
def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    Two manifests are created: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, sentiment labels are added to the supervisions.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: When ``False`` (default), recordings are created with
        ``relative_path_depth=3``; when ``True``, with ``relative_path_depth=None``.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    # Transcripts are looked up by the part of their file stem before the first dash.
    transcript_by_name = {}
    for text_path in text_paths:
        transcript_by_name[text_path.stem.split('-')[0]] = text_path

    # Each audio file is grouped with the transcripts of its two sides ("A" and "B").
    groups = []
    for audio_path in audio_paths:
        name = audio_path.stem.replace('sw0', 'sw')
        groups.append({
            'audio': audio_path,
            'text-0': transcript_by_name[f'{name}A'],
            'text-1': transcript_by_name[f'{name}B'],
        })

    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(group['audio'], relative_path_depth=path_depth)
        for group in groups
    )

    def iter_segments():
        # Side "A" is transcribed as channel 0 and side "B" as channel 1.
        for group, recording in zip(groups, recordings):
            for channel in (0, 1):
                yield from make_segments(
                    transcript_path=group[f'text-{channel}'],
                    recording=recording,
                    channel=channel,
                    omit_silence=omit_silence
                )

    supervisions = SupervisionSet.from_segments(iter_segments())

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions
    }
    return manifests
Exemple #9
0
def trim_supervisions_to_recordings(
        recordings: RecordingSet,
        supervisions: SupervisionSet) -> SupervisionSet:
    """
    Return a new :class:`~lhotse.supervision.SupervisionSet` in which no supervision
    exceeds the duration of its corresponding :class:`~lhotse.audio.Recording`.

    Supervisions that start after the end of their recording are dropped, and those
    that merely end after it are trimmed to the recording's duration. A warning with
    the number of affected supervisions is logged in each of the two cases.
    """
    kept = []
    num_removed = 0
    num_trimmed = 0
    for segment in supervisions:
        recording_end = recordings[segment.recording_id].duration
        if segment.start > recording_end:
            num_removed += 1
        else:
            if segment.end > recording_end:
                num_trimmed += 1
                segment = segment.trim(recording_end)
            kept.append(segment)

    if num_removed:
        logging.warning(
            f"Removed {num_removed} supervisions starting after the end of the recording."
        )
    if num_trimmed:
        logging.warning(
            f"Trimmed {num_trimmed} supervisions exceeding the end of the recording."
        )
    return SupervisionSet.from_segments(kept)
Exemple #10
0
def remove_missing_recordings_and_supervisions(
    recordings: RecordingSet,
    supervisions: SupervisionSet,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Fix the recording and supervision manifests by removing all entries that
    miss their counterparts.

    A warning is logged whenever something is removed. A manifest that needs
    no fixing is returned as-is.

    :param recordings: a :class:`RecordingSet` object.
    :param supervisions: a :class:`SupervisionSet` object.
    :return: A pair of :class:`RecordingSet` and :class:`SupervisionSet` with removed entries.
    """
    known_ids = frozenset(rec.id for rec in recordings)
    referenced_ids = frozenset(sup.recording_id for sup in supervisions)

    # Recordings that no supervision refers to...
    unsupervised = known_ids - referenced_ids
    # ... and recording IDs that supervisions refer to, but which do not exist.
    unknown = referenced_ids - known_ids

    if unsupervised:
        recordings = recordings.filter(
            lambda rec: rec.id not in unsupervised)
        logging.warning(
            f"Removed {len(unsupervised)} recordings with no corresponding supervisions."
        )
    if unknown:
        supervisions = supervisions.filter(
            lambda sup: sup.recording_id not in unknown)
        logging.warning(
            f"Removed {len(unknown)} supervisions with no corresponding recordings."
        )
    return recordings, supervisions
Exemple #11
0
def to_manifest(items: Iterable[ManifestItem]) -> Optional[Manifest]:
    """
    Create the manifest that corresponds to the type of the items in ``items``.

    The type of the first item decides: Recording -> RecordingSet,
    SupervisionSegment -> SupervisionSet, Cut or MixedCut -> CutSet.
    When the iterable is empty, returns None.

    :raises ValueError: when the first item is a Features object (a FeatureSet cannot
        be built this way), or an object of an unknown type.
    """
    iterator = iter(items)
    nothing = object()
    head = next(iterator, nothing)
    if head is nothing:
        return None
    # Put the peeked item back in front of the remaining ones.
    all_items = chain([head], iterator)

    if isinstance(head, Recording):
        manifest = RecordingSet.from_recordings(all_items)
    elif isinstance(head, SupervisionSegment):
        manifest = SupervisionSet.from_segments(all_items)
    elif isinstance(head, (Cut, MixedCut)):
        manifest = CutSet.from_cuts(all_items)
    elif isinstance(head, Features):
        raise ValueError(
            "FeatureSet generic construction from iterable is not possible, as the config information "
            "would have been lost. Call FeatureSet.from_features() directly instead."
        )
    else:
        raise ValueError(f"Unknown type of manifest item: {head}")
    return manifest
def search_supervision_set():
    """
    Build a SupervisionSet with four segments: three in recording ``r1``
    (two on channel 0 and one on channel 1) and one in recording ``r2``.
    """
    segments_of_r1 = [
        SupervisionSegment(id='s1', recording_id='r1', start=0, duration=5.0, channel=0),
        SupervisionSegment(id='s2', recording_id='r1', start=4.5, duration=2.0, channel=1),
        SupervisionSegment(id='s3', recording_id='r1', start=8.0, duration=3.0, channel=0),
    ]
    segments_of_r2 = [
        SupervisionSegment(id='s4', recording_id='r2', start=1, duration=5.0, channel=0),
    ]
    return SupervisionSet.from_segments(segments_of_r1 + segments_of_r2)
Exemple #13
0
def random_mixed(supervision_manifest: Pathlike, feature_manifest: Pathlike,
                 output_cut_manifest: Pathlike, snr_range: Tuple[float, float],
                 offset_range: Tuple[float, float]):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST that contains supervision regions from SUPERVISION_MANIFEST
    and features supplied by FEATURE_MANIFEST. A trivial CutSet is created first, then split into
    two shuffled parts, and the cuts of both parts are mixed pairwise.
    The parameters of the mix are controlled via SNR_RANGE and OFFSET_RANGE: for each pair,
    an SNR and a relative offset are drawn uniformly from these ranges, and the offset
    is scaled by the duration of the first cut of the pair.
    """
    source_cut_set = CutSet.from_manifests(
        supervisions=SupervisionSet.from_json(supervision_manifest),
        features=FeatureSet.from_json(feature_manifest),
    )
    left_cuts, right_cuts = source_cut_set.split(num_splits=2, shuffle=True)

    # One random value per cut in the left part; SNRs are drawn before the offsets.
    num_values = len(left_cuts)
    snrs = np.random.uniform(*snr_range, size=num_values).tolist()
    relative_offsets = np.random.uniform(*offset_range, size=num_values).tolist()

    def mixed_cuts():
        for left, right, snr, relative_offset in zip(
                left_cuts, right_cuts, snrs, relative_offsets):
            yield left.mix(right,
                           offset_other_by=left.duration * relative_offset,
                           snr=snr)

    CutSet.from_cuts(mixed_cuts()).to_json(output_cut_manifest)
Exemple #14
0
def validate_supervision_set(supervisions: SupervisionSet, **kwargs) -> None:
    """
    Validate every segment of ``supervisions`` and warn about suspicious start times.

    Each segment is passed to ``validate_supervision``. Afterwards a warning is logged
    for every (recording, channel) pair that has more than one supervision starting at 0,
    which typically indicates that the start times were not set during data preparation.

    :param supervisions: the :class:`SupervisionSet` to validate.
    :param kwargs: accepted but not used by this function.
    """
    for segment in supervisions:
        validate_supervision(segment)

    # Make sure the per-recording index exists before reading it.
    supervisions._index_by_recording_id_and_cache()
    segments_by_recording = supervisions._segments_by_recording_id
    for recording_id, segments in segments_by_recording.items():
        # channel -> number of segments in that channel which start at 0
        zero_starts = defaultdict(int)
        for segment in segments:
            zero_starts[segment.channel] += 1 if segment.start == 0 else 0
        suspicious = [(ch, num) for ch, num in zero_starts.items() if num > 1]
        for ch, num in suspicious:
            logging.warning(
                f"SupervisionSet contains {num} supervisions that start at 0 for recording {recording_id} "
                f"(channel {ch}). Did you forget to set supervision start times?"
            )
def test_supervision_set_iteration():
    """A SupervisionSet holding two segments has length 2 and yields 2 items when iterated."""
    segment_x = SupervisionSegment(id='X', recording_id='X', channel=0, start=2.0, duration=2.5)
    segment_y = SupervisionSegment(id='Y', recording_id='X', channel=0, start=5.0, duration=5.0)
    supervision_set = SupervisionSet(segments={'X': segment_x, 'Y': segment_y})

    num_items = len(supervision_set)
    num_iterated = len(list(supervision_set))
    assert num_items == 2
    assert num_iterated == 2
Exemple #16
0
def dummy_supervision_set():
    """Build a SupervisionSet with a single dummy segment of recording ``rec1``."""
    segment = SupervisionSegment(
        id='sup1',
        recording_id='rec1',
        start=3,
        duration=4,
        channel=0,
        text='dummy text',
    )
    return SupervisionSet.from_segments([segment])
Exemple #17
0
def DummyManifest(type_: Type, *, begin_id: int, end_id: int) -> Manifest:
    """
    Create a manifest of the requested type, filled with dummy items.

    One item is created for each index in ``range(begin_id, end_id)``.

    :param type_: the manifest class: RecordingSet, SupervisionSet or FeatureSet.
    :param begin_id: index of the first dummy item (inclusive).
    :param end_id: index past the last dummy item (exclusive).
    :return: the dummy manifest; for any other ``type_``, ``None`` is returned.
    """
    indexes = range(begin_id, end_id)
    if type_ == RecordingSet:
        return RecordingSet.from_recordings(dummy_recording(i) for i in indexes)
    elif type_ == SupervisionSet:
        return SupervisionSet.from_segments(dummy_supervision(i) for i in indexes)
    elif type_ == FeatureSet:
        # noinspection PyTypeChecker
        return FeatureSet.from_features(dummy_features(i) for i in indexes)
    return None
Exemple #18
0
def dummy_supervision_set():
    """Build a SupervisionSet with a single dummy segment of recording ``rec1``."""
    only_segment = SupervisionSegment(
        id="sup1",
        recording_id="rec1",
        start=3,
        duration=4,
        channel=0,
        text="dummy text",
    )
    segments = [only_segment]
    return SupervisionSet.from_segments(segments)
Exemple #19
0
def test_supervision_set_from_rttm(tmpdir):
    """An RTTM file with three SPEAKER lines is read as a SupervisionSet of three segments."""
    rttm_str = """SPEAKER reco1 1 130.430000 2.350 <NA> <NA> juliet <NA> <NA>
                  SPEAKER reco1 1 157.610000 3.060 <NA> <NA> tbc <NA> <NA>
                  SPEAKER reco2 1 130.490000 0.450 <NA> <NA> chek <NA> <NA>"""
    # The file is stored in a fresh "rttm" sub-directory of the temporary directory.
    rttm_file = Path(tmpdir) / "rttm" / "example.rttm"
    rttm_file.parent.mkdir()
    rttm_file.write_text(rttm_str)

    num_segments = len(SupervisionSet.from_rttm(rttm_file))
    assert num_segments == 3
Exemple #20
0
def prepare_ljspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    Utterances are listed in ``<corpus_dir>/metadata.csv`` and their audio is expected
    in ``<corpus_dir>/wavs/<utterance id>.wav``. Utterances whose audio file is
    missing are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'recordings' and 'supervisions'.
    :raises AssertionError: when ``corpus_dir`` or its ``metadata.csv`` does not exist.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Create one recording and one supervision for each line of the metadata file.
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    # The encoding is set explicitly, so that reading the transcripts
    # does not depend on the default encoding of the platform.
    with open(metadata_csv_path, encoding="utf-8") as f:
        for line in f:
            # Only the first two of the three "|"-separated fields are used.
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            # A single supervision spans the whole recording.
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
Exemple #21
0
def prepare_yesno(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    The 60 wave files of the corpus are sorted by path and split alternately:
    files at even positions form the "train" set and files at odd positions
    form the "test" set (30 files each).

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    :raises AssertionError: when ``corpus_dir`` is not a directory, or when it does
        not contain exactly 60 ``*.wav`` files.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    wave_files = sorted(corpus_dir.glob("*.wav"))
    assert len(wave_files) == 60

    # Alternate between the two sets.
    train_set, test_set = wave_files[::2], wave_files[1::2]
    assert len(train_set) == 30
    assert len(test_set) == 30

    manifests = defaultdict(dict)
    for name, dataset in (("train", train_set), ("test", test_set)):
        recordings, supervisions = _prepare_dataset(dataset)

        part = {
            "recordings": RecordingSet.from_recordings(recordings),
            "supervisions": SupervisionSet.from_segments(supervisions),
        }
        validate_recordings_and_supervisions(part["recordings"], part["supervisions"])

        if output_dir is not None:
            part["supervisions"].to_json(output_dir / f"supervisions_{name}.json")
            part["recordings"].to_json(output_dir / f"recordings_{name}.json")

        manifests[name] = part

    return manifests
Exemple #22
0
def test_known_issue_with_overlap():
    """
    Trimming a cut to its supervisions with ``keep_overlapping=False`` yields one cut
    per supervision, each holding only its own supervision, even when the
    supervisions overlap.
    """
    recording = dummy_recording(0)
    recording_set = RecordingSet.from_recordings([recording])

    # Make two segments. The first segment is 1s long. The second segment
    # is 0.5 seconds long and lies entirely within the first. Both have the
    # same recording_id as the single entry in recording_set.
    outer = SupervisionSegment(
        id="utt1",
        recording_id=recording.id,
        start=0.0,
        duration=1.0,
        channel=0,
        text="Hello",
    )
    inner = SupervisionSegment(
        id="utt2",
        recording_id=recording.id,
        start=0.2,
        duration=0.5,
        channel=0,
        text="World",
    )
    supervision_set = SupervisionSet.from_segments([outer, inner])

    cuts = CutSet.from_manifests(recordings=recording_set, supervisions=supervision_set)
    assert len(cuts) == 1

    cuts_trim = cuts.trim_to_supervisions(keep_overlapping=False)
    assert len(cuts_trim) == 2

    # (start of the cut, duration of the cut and of its supervision, text of the supervision)
    expected = [
        (0, 1, "Hello"),
        (0.2, 0.5, "World"),
    ]
    for index, (start, duration, text) in enumerate(expected):
        cut = cuts_trim[index]
        assert cut.start == start
        assert cut.duration == duration
        assert len(cut.supervisions) == 1
        # The supervision is relative to its cut, hence it always starts at 0.
        supervision = cut.supervisions[0]
        assert supervision.start == 0
        assert supervision.duration == duration
        assert supervision.text == text
Exemple #23
0
def make_corresponding_supervisions(audio: RecordingSet) -> SupervisionSet:
    """
    Prepare a supervision set that just describes which segments are available
    in the corpus, as the actual supervisions for speech separation come from
    the source recordings.

    One segment is created for each source of each recording. It spans the whole
    recording and is assigned to the first channel of that source.
    """
    def whole_recording_segments():
        for recording in audio:
            for source in recording.sources:
                first_channel = source.channels[0]
                yield SupervisionSegment(
                    id=f"{recording.id}-c{first_channel}",
                    recording_id=recording.id,
                    start=0.0,
                    duration=recording.duration,
                    channel=first_channel,
                )

    return SupervisionSet.from_segments(whole_recording_segments())
Exemple #24
0
def prepare_gigaspeech(
        gigaspeech: Any,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare the recording and supervision manifests for the GigaSpeech corpus.

    :param gigaspeech: the corpus object; its ``audios(part)`` method and ``root_path``
        attribute are used.
    :param dataset_parts: ``'auto'`` to prepare the ``{XL}``, ``{DEV}`` and ``{TEST}`` subsets,
        or the name of a single subset, or a sequence of subset names.
    :param output_dir: directory where the manifests are written, and from which
        previously prepared manifests are read when available. Can be omitted to avoid writing.
    :param num_jobs: number of threads used to parse the utterances.
    :return: a dict whose keys are the subset names, and whose values are dicts
        with the keys 'recordings' and 'supervisions'.
    :raises ImportError: when the optional ``speechcolab`` dependency is not installed.
    """
    if is_module_available('speechcolab'):
        # Not used directly; the import is kept from the original code.
        from speechcolab.datasets.gigaspeech import GigaSpeech  # noqa: F401
    else:
        raise ImportError(
            'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')

    if dataset_parts == 'auto':
        subsets = ('{XL}', '{DEV}', '{TEST}')
    elif isinstance(dataset_parts, str):
        # A single subset name: wrap it, so that it is not iterated character by character.
        subsets = (dataset_parts,)
    else:
        subsets = dataset_parts

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # The resolved subset names are passed here, as they are the names used for the files
        # written below. NOTE(review): assumes ``read_manifests_if_cached`` looks the files up
        # by these names -- confirm against its implementation.
        maybe_manifests = read_manifests_if_cached(dataset_parts=subsets, output_dir=output_dir, suffix='jsonl')
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in subsets:
            futures = []
            for audio in tqdm(gigaspeech.audios(part), desc='Distributing tasks', leave=False):
                futures.append(ex.submit(parse_utterance, audio, gigaspeech.root_path))

            recordings = []
            supervisions = []
            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                # Each result is a recording together with its supervision segments.
                recording, segments = result
                recordings.append(recording)
                supervisions += segments

            manifests[part] = {
                'recordings': RecordingSet.from_recordings(recordings),
                'supervisions': SupervisionSet.from_segments(supervisions)
            }

            if output_dir is not None:
                manifests[part]['recordings'].to_file(output_dir / f'recordings_{part}.jsonl')
                manifests[part]['supervisions'].to_file(output_dir / f'supervisions_{part}.jsonl')

    return dict(manifests)
Exemple #25
0
def split(manifest: Manifest,
          num_splits: int,
          randomize: bool = False) -> List[Manifest]:
    """
    Split a manifest into `num_splits` parts of (almost) equal size. The element order can be randomized.

    The sizes of the parts differ by at most one item (the leading parts are the larger ones),
    so no part is ever empty.

    :param manifest: a RecordingSet, SupervisionSet, FeatureSet or CutSet.
    :param num_splits: the number of parts; it has to be positive and cannot exceed
        the number of items in the manifest.
    :param randomize: when ``True``, the items are shuffled before splitting.
    :return: a list of `num_splits` manifests of the same type as ``manifest``.
    :raises ValueError: for an invalid ``num_splits`` or an unknown type of manifest.
    """
    num_items = len(manifest)
    if num_splits < 1:
        raise ValueError(
            f"The number of splits has to be a positive integer (got {num_splits})."
        )
    if num_splits > num_items:
        raise ValueError(
            f"Cannot split manifest into more chunks ({num_splits}) than its number of items {num_items}"
        )

    # Give every part ``base_size`` items and hand the ``remainder`` out one by one to
    # the leading parts. Chunks of size ceil(num_items / num_splits) could run out of
    # items early and leave the trailing parts empty (e.g. 5 items in 4 parts).
    base_size, remainder = divmod(num_items, num_splits)
    split_indices = []
    begin = 0
    for split_idx in range(num_splits):
        end = begin + base_size + (1 if split_idx < remainder else 0)
        split_indices.append((begin, end))
        begin = end

    def maybe_randomize(items: Iterable[Any]) -> List[Any]:
        items = list(items)
        if randomize:
            random.shuffle(items)
        return items

    if isinstance(manifest, RecordingSet):
        contents = maybe_randomize(manifest.recordings.items())
        return [
            RecordingSet(recordings=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    if isinstance(manifest, SupervisionSet):
        contents = maybe_randomize(manifest.segments.items())
        return [
            SupervisionSet(segments=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    if isinstance(manifest, FeatureSet):
        contents = maybe_randomize(manifest.features)
        return [
            FeatureSet(features=contents[begin:end],
                       feature_extractor=manifest.feature_extractor)
            for begin, end in split_indices
        ]

    if isinstance(manifest, CutSet):
        contents = maybe_randomize(manifest.cuts.items())
        return [
            CutSet(cuts=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    raise ValueError(f"Unknown type of manifest: {type(manifest)}")
Exemple #26
0
def prepare_supervision_ihm(
    audio: RecordingSet,
    annotations: Dict[str, List[IcsiSegmentAnnotation]],
    channel_to_idx_map: Dict[str, Dict[str, int]],
) -> SupervisionSet:
    """
    Build the supervisions for recordings in the IHM setting.

    Every annotation key is indexed positionally: ``key[0]`` is matched against the
    recording id and ``key[2]`` is translated to a channel index through
    ``channel_to_idx_map[key[0]]``. For each single-channel source of each recording,
    the matching annotations become ``SupervisionSegment`` objects.

    Segments ending after the recording's duration are skipped with a warning, and
    segments whose duration is not positive are dropped silently. Sources without
    any matching annotation are skipped silently.

    :param audio: the recordings to create supervisions for.
    :param annotations: segment annotations grouped under keys as described above.
    :param channel_to_idx_map: per-session mapping from the channel identifier found
        in the annotation key to the channel index used by the recording sources.
    :return: a ``SupervisionSet`` with all retained segments.
    """
    # Re-key the annotations by (session_id, channel index), so that they can be looked
    # up directly for each source of a multi-channel recording.
    by_session_and_channel = {
        (key[0], channel_to_idx_map[key[0]][key[2]]): channel_annotations
        for key, channel_annotations in annotations.items()
    }

    supervisions = []
    for recording in tqdm(audio, desc="Preparing supervision"):
        # A single IHM recording may consist of several audio sources.
        for source in recording.sources:
            # Each source is expected to carry exactly one channel.
            [channel] = source.channels
            source_annotations = by_session_and_channel.get((recording.id, channel))
            if source_annotations is None:
                continue

            for seg_idx, seg_info in enumerate(source_annotations):
                duration = seg_info.end_time - seg_info.start_time
                if seg_info.end_time > recording.duration:
                    # Some IHM annotations run past the end of the audio; leave them out.
                    logging.warning(
                        f"Segment {recording.id}-{channel}-{seg_idx} exceeds "
                        f"recording duration. Not adding to supervisions.")
                elif duration > 0:
                    supervision = SupervisionSegment(
                        id=f"{recording.id}-{channel}-{seg_idx}",
                        recording_id=recording.id,
                        start=seg_info.start_time,
                        duration=duration,
                        channel=channel,
                        language="English",
                        speaker=seg_info.speaker,
                        gender=seg_info.gender,
                        text=seg_info.text,
                    )
                    supervisions.append(supervision)

    return SupervisionSet.from_segments(supervisions)
Exemple #27
0
def simple(
        feature_manifest: Pathlike,
        output_cut_manifest: Pathlike,
        supervision_manifest: Optional[Pathlike],
):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST that contains the regions and features supplied by FEATURE_MANIFEST.
    Optionally it can use a SUPERVISION_MANIFEST to select the regions and attach the corresponding supervisions to
    the cuts. This is the simplest way to create Cuts.
    """
    # NOTE(review): the docstring is kept verbatim - its upper-case argument names suggest
    # it may double as command-line help text; confirm before rewording it.
    features = FeatureSet.from_yaml(feature_manifest)
    if supervision_manifest is not None:
        # Supervisions were given: they determine the cut regions and get attached to the cuts.
        supervisions = SupervisionSet.from_yaml(supervision_manifest)
        cuts = make_cuts_from_supervisions(feature_set=features, supervision_set=supervisions)
    else:
        # No supervisions: the cuts are derived from the features alone.
        cuts = make_cuts_from_features(features)
    cuts.to_yaml(output_cut_manifest)
Exemple #28
0
def prepare_supervision_ihm(
        audio: RecordingSet,
        annotations: Dict[str, List[AmiSegmentAnnotation]]) -> SupervisionSet:
    """
    Build the supervisions for AMI recordings in the IHM setting.

    Every annotation key is indexed positionally: ``key[0]`` is matched against the
    recording id and ``key[2]`` against the channel of each (single-channel) source.
    The matching annotations become ``SupervisionSegment`` objects.

    A warning is logged for every source without annotations and for every segment
    ending after the recording's duration; both are skipped. Segments whose duration
    is not positive are dropped silently.

    :param audio: the recordings to create supervisions for.
    :param annotations: segment annotations grouped under keys as described above.
    :return: a ``SupervisionSet`` with all retained segments.
    """
    # Re-key the annotations by (session_id, channel), so that they can be looked up
    # directly for each source of a multi-channel recording.
    by_session_and_channel = {
        (key[0], key[2]): channel_annotations
        for key, channel_annotations in annotations.items()
    }

    supervisions = []
    for recording in audio:
        # A single AMI IHM recording may consist of several audio sources.
        for source in recording.sources:
            # Each source is expected to carry exactly one channel.
            [channel] = source.channels
            source_annotations = by_session_and_channel.get((recording.id, channel))
            if source_annotations is None:
                logging.warning(
                    f'No annotation found for recording {recording.id} '
                    f'(file {source.source})')
                continue

            for seg_idx, seg_info in enumerate(source_annotations):
                duration = seg_info.end_time - seg_info.begin_time
                if seg_info.end_time > recording.duration:
                    # Some IHM annotations run past the end of the audio; leave them out.
                    logging.warning(
                        f'Segment {recording.id}-{channel}-{seg_idx} exceeds '
                        f'recording duration. Not adding to supervisions.')
                elif duration > 0:
                    supervision = SupervisionSegment(
                        id=f'{recording.id}-{channel}-{seg_idx}',
                        recording_id=recording.id,
                        start=seg_info.begin_time,
                        duration=duration,
                        channel=channel,
                        language='English',
                        speaker=seg_info.speaker,
                        gender=seg_info.gender,
                        text=seg_info.text,
                    )
                    supervisions.append(supervision)

    return SupervisionSet.from_segments(supervisions)
Exemple #29
0
def test_supervision_set_serialization():
    """A SupervisionSet written to YAML and read back compares equal to the original."""
    segment = SupervisionSegment(
        id='segment-1',
        recording_id='recording-1',
        channel_id=0,
        start=0.1,
        duration=0.3,
        text='transcript of the first segment',
        language='english',
        speaker='Norman Dyhrentfurth',
        gender='male',
    )
    original = SupervisionSet.from_segments([segment])

    # Round-trip through a temporary file that is removed when the block exits.
    with NamedTemporaryFile() as tmp:
        path = tmp.name
        original.to_yaml(path)
        restored = original.from_yaml(path)

    assert original == restored
Exemple #30
0
def test_supervision_set_serialization(format, compressed):
    """
    A SupervisionSet written to disk and read back compares equal to the original.

    :param format: serialization format to exercise, either 'yaml' or 'json'.
    :param compressed: when truthy, the temporary file gets a '.gz' suffix.
    """
    segment = SupervisionSegment(
        id='segment-1',
        recording_id='recording-1',
        channel=0,
        start=0.1,
        duration=0.3,
        text='transcript of the first segment',
        language='english',
        speaker='Norman Dyhrentfurth',
        gender='male',
    )
    original = SupervisionSet.from_segments([segment])

    suffix = '.gz' if compressed else ''
    # Round-trip through a temporary file that is removed when the block exits.
    with NamedTemporaryFile(suffix=suffix) as tmp:
        path = tmp.name
        if format == 'yaml':
            original.to_yaml(path)
            restored = original.from_yaml(path)
        elif format == 'json':
            original.to_json(path)
            restored = original.from_json(path)

    assert original == restored