Example #1
def test_serialization():
    audio_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channel_ids=[0],
                    source='test/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channel_ids=[1],
                    source='cat test/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration_seconds=0.5
        )
    ])
    with NamedTemporaryFile() as f:
        audio_set.to_yaml(f.name)
        deserialized = RecordingSet.from_yaml(f.name)
    assert deserialized == audio_set
Example #2
def test_serialization(format, compressed):
    recording_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channels=[1],
                    source='cat test/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5
        )
    ])
    with NamedTemporaryFile(suffix='.gz' if compressed else '') as f:
        if format == 'yaml':
            recording_set.to_yaml(f.name)
            deserialized = RecordingSet.from_yaml(f.name)
        if format == 'json':
            recording_set.to_json(f.name)
            deserialized = RecordingSet.from_json(f.name)
    assert deserialized == recording_set
Example #3
def prepare_single_commonvoice_tsv(
    lang: str,
    part: str,
    output_dir: Pathlike,
    lang_path: Pathlike,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Prepares part of CommonVoice data from a single TSV file.

    :param lang: string language code (e.g., "en").
    :param part: which split to prepare (e.g., "train", "validated", etc.).
    :param output_dir: path to directory where we will store the manifests.
    :param lang_path: path to a CommonVoice directory for a specific language
        (e.g., "/path/to/cv-corpus-7.0-2021-07-21/pl").
    :return: a tuple of (RecordingSet, SupervisionSet) objects opened in lazy mode,
        as CommonVoice manifests may be fairly large in memory.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")
    import pandas as pd

    lang_path = Path(lang_path)
    output_dir = Path(output_dir)
    tsv_path = lang_path / f"{part}.tsv"

    # Read the metadata
    df = pd.read_csv(tsv_path, sep="\t")
    # Scan all the audio files
    with RecordingSet.open_writer(
            output_dir / f"cv_recordings_{lang}_{part}.jsonl.gz",
            overwrite=False,
    ) as recs_writer, SupervisionSet.open_writer(
            output_dir / f"cv_supervisions_{lang}_{part}.jsonl.gz",
            overwrite=False,
    ) as sups_writer:
        for idx, row in tqdm(
                df.iterrows(),
                desc="Processing audio files",
                total=len(df),
        ):
            try:
                result = parse_utterance(row, lang_path, lang)
                if result is None:
                    continue
                recording, segment = result
                validate_recordings_and_supervisions(recording, segment)
                recs_writer.write(recording)
                sups_writer.write(segment)
            except Exception as e:
                logging.error(
                    f"Error when processing TSV file: line no. {idx}: '{row}'.\n"
                    f"Original error type: '{type(e)}' and message: {e}")
                continue
    recordings = RecordingSet.from_jsonl_lazy(recs_writer.path)
    supervisions = SupervisionSet.from_jsonl_lazy(sups_writer.path)
    return recordings, supervisions
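A minimal invocation sketch for the function above; the language, split, and directory paths are hypothetical placeholders:
# Hypothetical call; point lang_path at your own CommonVoice download.
recordings, supervisions = prepare_single_commonvoice_tsv(
    lang="pl",
    part="train",
    output_dir="data/manifests",
    lang_path="/path/to/cv-corpus-7.0-2021-07-21/pl",
)
# Both manifests are opened lazily from the JSONL files written above,
# so iterating over them does not load everything into memory at once.
for recording in recordings:
    print(recording.id)
    break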
Example #4
def validate_recordings_and_supervisions(
    recordings: Union[RecordingSet, Recording],
    supervisions: Union[SupervisionSet, SupervisionSegment],
    read_data: bool = False,
) -> None:
    """
    Validate the recording and supervision manifests separately,
    and then check if they are consistent with each other.

    This method will emit warnings, instead of errors, when some recordings or supervisions
    are missing their counterparts.
    These items will be discarded by default when creating a CutSet.
    """
    if isinstance(recordings, Recording):
        recordings = RecordingSet.from_recordings([recordings])
    if isinstance(supervisions, SupervisionSegment):
        supervisions = SupervisionSet.from_segments([supervisions])

    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))
    if supervisions.is_lazy:
        supervisions = SupervisionSet.from_segments(iter(supervisions))

    validate(recordings, read_data=read_data)
    validate(supervisions)
    # Errors
    for s in supervisions:
        r = recordings[s.recording_id]
        assert -1e-3 <= s.start <= s.end <= r.duration + 1e-3, (
            f"Supervision {s.id}: exceeded the bounds of its corresponding recording "
            f"(supervision spans [{s.start}, {s.end}]; recording spans [0, {r.duration}])"
        )
        assert s.channel in r.channel_ids, (
            f"Supervision {s.id}: channel {s.channel} does not exist in its corresponding Recording "
            f"(recording channels: {r.channel_ids})"
        )
    # Warnings
    recording_ids = frozenset(r.id for r in recordings)
    recording_ids_in_sups = frozenset(s.recording_id for s in supervisions)
    only_in_recordings = recording_ids - recording_ids_in_sups
    if only_in_recordings:
        logging.warning(
            f"There are {len(only_in_recordings)} recordings that "
            f"do not have any corresponding supervisions in the SupervisionSet."
        )
    only_in_supervisions = recording_ids_in_sups - recording_ids
    if only_in_supervisions:
        logging.warning(
            f"There are {len(only_in_supervisions)} supervisions that "
            f"are missing their corresponding recordings in the RecordingSet."
        )
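Since the function accepts single items as well as whole sets, here is a minimal sketch (made-up ids and durations) of validating one Recording/SupervisionSegment pair, using the same lhotse classes as the examples above:
rec = Recording(
    id="rec1",
    sources=[AudioSource(type="file", channels=[0], source="rec1.wav")],
    sampling_rate=16000,
    num_samples=160000,
    duration=10.0,
)
sup = SupervisionSegment(
    id="rec1-sup1", recording_id="rec1", start=0.0, duration=10.0, channel=0
)
# Single items are wrapped into RecordingSet/SupervisionSet internally; an
# AssertionError is raised if a supervision exceeds its recording's bounds
# or references a channel the recording does not have.
validate_recordings_and_supervisions(rec, sup)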
Example #5
def prepare_audio_grouped(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    recordings = []
    for session_name, channel_paths in channel_wavs.items():
        audio_sf = sf.SoundFile(str(channel_paths[0]))

        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(type="file",
                                channels=[idx],
                                source=str(audio_path))
                    for idx, audio_path in enumerate(sorted(channel_paths))
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #6
def trim_supervisions_to_recordings(
    recordings: RecordingSet, supervisions: SupervisionSet
) -> SupervisionSet:
    """
    Return a new :class:`~lhotse.supervision.SupervisionSet` with supervisions that are
    not exceeding the duration of their corresponding :class:`~lhotse.audio.Recording`.
    """
    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))

    sups = []
    removed = 0
    trimmed = 0
    for s in supervisions:
        end = recordings[s.recording_id].duration
        if s.start > end:
            removed += 1
            continue
        if s.end > end:
            trimmed += 1
            s = s.trim(recordings[s.recording_id].duration)
        sups.append(s)
    if removed:
        logging.warning(
            f"Removed {removed} supervisions starting after the end of the recording."
        )
    if trimmed:
        logging.warning(
            f"Trimmed {trimmed} supervisions exceeding the end of the recording."
        )
    return SupervisionSet.from_segments(sups)
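A small sketch of the trimming behaviour with made-up manifests: one supervision runs past the recording end and is trimmed, another starts after the end and is dropped.
recs = RecordingSet.from_recordings([
    Recording(
        id="rec1",
        sources=[AudioSource(type="file", channels=[0], source="rec1.wav")],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
    )
])
sups = SupervisionSet.from_segments([
    # Ends at 12.0s while the recording lasts 10.0s -> trimmed to end at 10.0s.
    SupervisionSegment(id="s1", recording_id="rec1", start=5.0, duration=7.0),
    # Starts after the recording ends -> removed.
    SupervisionSegment(id="s2", recording_id="rec1", start=11.0, duration=1.0),
])
trimmed = trim_supervisions_to_recordings(recs, sups)
segments = list(trimmed)
assert len(segments) == 1 and segments[0].end == 10.0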
Example #7
def dummy_recording_set_lazy():
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        recs = RecordingSet.from_recordings([
            Recording(
                id="rec1",
                sampling_rate=16000,
                num_samples=160000,
                duration=10,
                sources=[
                    AudioSource(type="file", channels=[0], source="dummy.wav")
                ],
            )
        ])
        recs.to_file(f.name)
        f.flush()
        yield RecordingSet.from_jsonl_lazy(f.name)
Example #8
def prepare_audio_single(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in tqdm(audio_paths, desc="Preparing audio"):
        session_name = audio_path.parts[-2]
        if audio_path.suffix == ".wav":
            audio_sf = sf.SoundFile(str(audio_path))
            num_frames = audio_sf.frames
            num_channels = audio_sf.channels
            samplerate = audio_sf.samplerate
        else:
            audio_sf, samplerate = read_sph(audio_path)
            num_channels, num_frames = audio_sf.shape
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=list(range(num_channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=samplerate,
                num_samples=num_frames,
                duration=num_frames / samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #9
def remove_missing_recordings_and_supervisions(
    recordings: RecordingSet,
    supervisions: SupervisionSet,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Fix the recording and supervision manifests by removing all entries that
    miss their counterparts.

    :param recordings: a :class:`RecordingSet` object.
    :param supervisions: a :class:`SupervisionSet` object.
    :return: A pair of :class:`RecordingSet` and :class:`SupervisionSet` with removed entries.
    """
    recording_ids = frozenset(r.id for r in recordings)
    recording_ids_in_sups = frozenset(s.recording_id for s in supervisions)
    only_in_recordings = recording_ids - recording_ids_in_sups
    if only_in_recordings:
        recordings = recordings.filter(
            lambda r: r.id not in only_in_recordings)
        logging.warning(
            f"Removed {len(only_in_recordings)} recordings with no corresponding supervisions."
        )
    only_in_supervisions = recording_ids_in_sups - recording_ids
    if only_in_supervisions:
        supervisions = supervisions.filter(
            lambda s: s.recording_id not in only_in_supervisions)
        logging.warning(
            f"Removed {len(only_in_supervisions)} supervisions with no corresponding recordings."
        )
    return recordings, supervisions
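A hedged sketch of the clean-up (made-up ids); the supervision pointing at a missing recording is filtered out:
recs = RecordingSet.from_recordings([
    Recording(id="rec1", duration=10.0, sampling_rate=16000, num_samples=160000,
              sources=[AudioSource(type="file", channels=[0], source="rec1.wav")])
])
sups = SupervisionSet.from_segments([
    SupervisionSegment(id="s1", recording_id="rec1", start=0.0, duration=5.0),
    SupervisionSegment(id="s2", recording_id="rec-missing", start=0.0, duration=5.0),
])
recs_fixed, sups_fixed = remove_missing_recordings_and_supervisions(recs, sups)
# "s2" referenced a recording absent from the RecordingSet, so it is filtered out.
assert [s.id for s in sups_fixed] == ["s1"]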
Example #10
def to_manifest(items: Iterable[ManifestItem]) -> Optional[Manifest]:
    """
    Take an iterable of Lhotse data types, such as Recording, SupervisionSegment or Cut, and create the manifest of the
    corresponding type. When the iterable is empty, returns None.
    """
    items = iter(items)
    try:
        first_item = next(items)
    except StopIteration:
        return None
    items = chain([first_item], items)

    if isinstance(first_item, Recording):
        return RecordingSet.from_recordings(items)
    if isinstance(first_item, SupervisionSegment):
        return SupervisionSet.from_segments(items)
    if isinstance(first_item, (Cut, MixedCut)):
        return CutSet.from_cuts(items)
    if isinstance(first_item, Features):
        raise ValueError(
            "FeatureSet generic construction from iterable is not possible, as the config information "
            "would have been lost. Call FeatureSet.from_features() directly instead."
        )

    raise ValueError(f"Unknown type of manifest item: {first_item}")
Example #11
def extract(recording_manifest: Pathlike, output_dir: Pathlike,
            feature_manifest: Optional[Pathlike], storage_type: str,
            lilcom_tick_power: int, root_dir: Optional[Pathlike],
            num_jobs: int):
    """
    Extract features for recordings in a given AUDIO_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    storage_path = output_dir / 'feats.h5' if 'hdf5' in storage_type else output_dir / 'storage'

    with get_writer(storage_type)(storage_path,
                                  tick_power=lilcom_tick_power) as storage:
        feature_set_builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
        )
        feature_set_builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs)
Example #12
def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    groups = []
    name_to_text = {p.stem.split('-')[0]: p for p in text_paths}
    for ap in audio_paths:
        name = ap.stem.replace('sw0', 'sw')
        groups.append({'audio': ap, 'text-0': name_to_text[f'{name}A'], 'text-1': name_to_text[f'{name}B']})

    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(group['audio'], relative_path_depth=None if absolute_paths else 3)
        for group in groups
    )
    supervisions = SupervisionSet.from_segments(chain.from_iterable(
        make_segments(
            transcript_path=group[f'text-{channel}'],
            recording=recording,
            channel=channel,
            omit_silence=omit_silence
        )
        for group, recording in zip(groups, recordings)
        for channel in [0, 1]
    ))

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
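A hedged invocation sketch; the LDC package path and output directory are placeholders, and the transcripts are downloaded automatically when transcripts_dir is omitted:
swbd = prepare_switchboard(
    audio_dir="/data/LDC97S62",
    output_dir="data/manifests/swbd",
    omit_silence=True,
)
recordings, supervisions = swbd["recordings"], swbd["supervisions"]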
Example #13
def test_feature_set_builder():
    audio_set = RecordingSet.from_yaml('test/fixtures/audio.yml')
    with TemporaryDirectory() as output_dir:
        builder = FeatureSetBuilder(feature_extractor=FeatureExtractor(),
                                    output_dir=output_dir)
        feature_set = builder.process_and_store_recordings(
            recordings=audio_set)

    assert len(feature_set) == 4

    feature_infos = list(feature_set)

    # Assert the properties shared by all features
    for features in feature_infos:
        # assert that fbank is the default feature type
        assert features.type == 'fbank'
        # assert that duration is always a multiple of frame_shift
        assert features.num_frames == round(features.duration /
                                            features.frame_shift)
        # assert that num_features is preserved
        assert features.num_features == builder.feature_extractor.mfcc_fbank_common_config.num_mel_bins
        # assert that lilcom is the default storage type
        assert features.storage_type == 'lilcom'

    # Assert the properties for recordings of duration 0.5 seconds
    for features in feature_infos[:2]:
        assert features.num_frames == 50
        assert features.duration == 0.5

    # Assert the properties for recordings of duration 1.0 seconds
    for features in feature_infos[2:]:
        assert features.num_frames == 100
        assert features.duration == 1.0
Example #14
def test_cut_set_reverb_rir_doesnt_duplicate_transforms(cut_with_supervision, rir):
    rirs = RecordingSet.from_recordings([rir])
    cuts = CutSet.from_cuts(
        [cut_with_supervision, cut_with_supervision.with_id("other-id")]
    )
    cuts_vp = cuts.reverb_rir(rir_recordings=rirs)
    for cut in cuts_vp:
        # This prevents a bug regression where multiple cuts referencing the same recording would
        # attach transforms to the same manifest
        assert len(cut.recording.transforms) == 1
Example #15
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for 1997 English Broadcast News corpus.
    We create three manifests: one with recordings, one with segments supervisions,
    and one with section supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    audio_paths = check_and_rglob(audio_dir, "*.sph")
    sgml_paths = check_and_rglob(transcripts_dir, "*.sgml")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths
    )

    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive implementation;
    # on some systems the recursion limit needs to be raised for this to work.
    with recursion_limit(5000):
        supervisions_list = [
            make_supervisions(p, r) for p, r in zip(sgml_paths, recordings)
        ]
    section_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups["sections"] for sups in supervisions_list)
    )
    segment_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups["segments"] for sups in supervisions_list)
    )

    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "broadcast-news_recordings_all.jsonl.gz")
        section_supervisions.to_file(
            output_dir / "broadcast-news_sections_all.jsonl.gz"
        )
        segment_supervisions.to_file(
            output_dir / "broadcast-news_segments_all.jsonl.gz"
        )

    return {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }
Example #16
def DummyManifest(type_: Type, *, begin_id: int, end_id: int) -> Manifest:
    if type_ == RecordingSet:
        return RecordingSet.from_recordings(
            dummy_recording(idx) for idx in range(begin_id, end_id))
    if type_ == SupervisionSet:
        return SupervisionSet.from_segments(
            dummy_supervision(idx) for idx in range(begin_id, end_id))
    if type_ == FeatureSet:
        # noinspection PyTypeChecker
        return FeatureSet.from_features(
            dummy_features(idx) for idx in range(begin_id, end_id))
Example #17
def dummy_recording_set():
    return RecordingSet.from_recordings([
        Recording(id='rec1',
                  sampling_rate=16000,
                  num_samples=160000,
                  duration=10,
                  sources=[
                      AudioSource(type='file',
                                  channels=[0],
                                  source='dummy.wav')
                  ])
    ])
Example #18
def prepare_ljspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    with open(metadata_csv_path) as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
Example #19
def dummy_recording_set():
    return RecordingSet.from_recordings([
        Recording(
            id="rec1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10,
            sources=[
                AudioSource(type="file", channels=[0], source="dummy.wav")
            ],
        )
    ])
Example #20
def test_cut_set_reverb_rir(libri_cut_set, rir, affix_id):
    rirs = RecordingSet.from_recordings([rir])
    perturbed_rvb_cs = libri_cut_set.reverb_rir(rirs, affix_id=affix_id)
    for original, perturbed_rvb in zip(libri_cut_set, perturbed_rvb_cs):
        if affix_id:
            assert original.id != perturbed_rvb.id
            assert perturbed_rvb.id.endswith(f"_rvb")
        else:
            assert original.id == perturbed_rvb.id
        assert original.sampling_rate == perturbed_rvb.sampling_rate
        assert original.num_samples == perturbed_rvb.num_samples
        assert original.load_audio().shape == perturbed_rvb.load_audio().shape
Example #21
def prepare_yesno(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    wave_files = list(corpus_dir.glob("*.wav"))
    assert len(wave_files) == 60

    wave_files.sort()
    train_set = wave_files[::2]
    test_set = wave_files[1::2]

    assert len(train_set) == 30
    assert len(test_set) == 30

    manifests = defaultdict(dict)
    for name, dataset in zip(["train", "test"], [train_set, test_set]):
        recordings, supervisions = _prepare_dataset(dataset)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{name}.json")
            recording_set.to_json(output_dir / f"recordings_{name}.json")

        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
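The yesno recipe instead returns a nested dict keyed by split; a sketch with a hypothetical corpus path:
yesno = prepare_yesno(
    corpus_dir="/data/waves_yesno",
    output_dir="data/manifests/yesno",
)
train_recordings = yesno["train"]["recordings"]
test_supervisions = yesno["test"]["supervisions"]
assert len(train_recordings) == 30  # 60 wave files split evenly into train/test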
Example #22
def split(manifest: Manifest,
          num_splits: int,
          randomize: bool = False) -> List[Manifest]:
    """Split a manifest into `num_splits` equal parts. The element order can be randomized."""
    num_items = len(manifest)
    if num_splits > num_items:
        raise ValueError(
            f"Cannot split manifest into more chunks ({num_splits}) than its number of items {num_items}"
        )
    chunk_size = int(ceil(num_items / num_splits))
    split_indices = [(i * chunk_size, min(num_items, (i + 1) * chunk_size))
                     for i in range(num_splits)]

    def maybe_randomize(items: Iterable[Any]) -> List[Any]:
        items = list(items)
        if randomize:
            random.shuffle(items)
        return items

    if isinstance(manifest, RecordingSet):
        contents = maybe_randomize(manifest.recordings.items())
        return [
            RecordingSet(recordings=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    if isinstance(manifest, SupervisionSet):
        contents = maybe_randomize(manifest.segments.items())
        return [
            SupervisionSet(segments=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    if isinstance(manifest, FeatureSet):
        contents = maybe_randomize(manifest.features)
        return [
            FeatureSet(features=contents[begin:end],
                       feature_extractor=manifest.feature_extractor)
            for begin, end in split_indices
        ]

    if isinstance(manifest, CutSet):
        contents = maybe_randomize(manifest.cuts.items())
        return [
            CutSet(cuts=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    raise ValueError(f"Unknown type of manifest: {type(manifest)}")
Example #23
def prepare_gigaspeech(
        gigaspeech: Any,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    if is_module_available('speechcolab'):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')

    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts, output_dir=output_dir, suffix='jsonl')
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in subsets:
            futures = []
            for audio in tqdm(gigaspeech.audios(part), desc='Distributing tasks', leave=False):
                futures.append(ex.submit(parse_utterance, audio, gigaspeech.root_path))

            recordings = []
            supervisions = []
            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segments = result
                recordings.append(recording)
                supervisions += segments

            manifests[part] = {
                'recordings': RecordingSet.from_recordings(recordings),
                'supervisions': SupervisionSet.from_segments(supervisions)
            }

            if output_dir is not None:
                manifests[part]['recordings'].to_file(output_dir / f'recordings_{part}.jsonl')
                manifests[part]['supervisions'].to_file(output_dir / f'supervisions_{part}.jsonl')

    return dict(manifests)
Example #24
def test_feature_set_builder(storage_fn):
    recordings: RecordingSet = RecordingSet.from_json(
        "test/fixtures/audio.json")
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with storage_fn() as storage:
        builder = FeatureSetBuilder(
            feature_extractor=extractor,
            storage=storage,
        )
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

    assert len(feature_set) == 6

    feature_infos = list(feature_set)

    # Assert the properties shared by all features
    for features in feature_infos:
        # assert that fbank is the default feature type
        assert features.type == "kaldi-fbank"
        # assert that duration is always a multiple of frame_shift
        assert features.num_frames == round(features.duration /
                                            features.frame_shift)
        # assert that num_features is preserved
        assert features.num_features == builder.feature_extractor.config.num_filters
        # assert that the storage type metadata matches
        assert features.storage_type == storage.name
        # assert that the metadata is consistent with the data shapes
        arr = features.load()
        assert arr.shape[0] == features.num_frames
        assert arr.shape[1] == features.num_features
        # assert that the stored features are the same as the "freshly extracted" features
        recording = recordings[features.recording_id]
        expected = extractor.extract(
            samples=recording.load_audio(channels=features.channels),
            sampling_rate=recording.sampling_rate,
        )
        np.testing.assert_almost_equal(arr, expected, decimal=2)

    # Assert the properties for recordings of duration 0.5 seconds
    for features in feature_infos[:2]:
        assert features.num_frames == 50
        assert features.duration == 0.5

    # Assert the properties for recordings of duration 1.0 seconds
    for features in feature_infos[2:]:
        assert features.num_frames == 100
        assert features.duration == 1.0
Example #25
    def load_audio(self,
                   recording_set: RecordingSet,
                   root_dir: Optional[Pathlike] = None) -> np.ndarray:
        """
        Load the audio by locating the appropriate recording in the supplied RecordingSet.
        The audio is trimmed to the [begin, end] range specified by the Cut.
        Optionally specify a `root_dir` prefix that will be prepended to the recording's path.

        :param recording_set: RecordingSet object containing the Recording pointed to by recording_id
            member of this Cut.
        :param root_dir: optional Path prefix to find the recording in the filesystem.
        :return: a numpy ndarray with audio samples, with shape (1 <channel>, N <samples>)
        """
        return recording_set.load_audio(self.recording_id,
                                        channels=self.channel,
                                        offset_seconds=self.start,
                                        duration_seconds=self.duration,
                                        root_dir=root_dir)
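A hedged usage sketch, assuming cut is a Cut whose recording_id points into recording_set:
samples = cut.load_audio(recording_set=recording_set)
# The returned array covers only the cut's [start, start + duration] span,
# with shape (1, N) as described in the docstring above.
assert samples.ndim == 2 and samples.shape[0] == 1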
Example #26
def test_feature_set_builder_with_augmentation():
    recordings: RecordingSet = RecordingSet.from_json(
        'test/fixtures/audio.json')
    augment_fn = WavAugmenter.create_predefined('pitch_reverb_tdrop',
                                                sampling_rate=8000)
    extractor = Fbank()
    with TemporaryDirectory() as d, LilcomFilesWriter(d) as storage:
        builder = FeatureSetBuilder(feature_extractor=extractor,
                                    storage=storage,
                                    augment_fn=augment_fn)
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

        assert len(feature_set) == 6

        feature_infos = list(feature_set)

        # Assert the properties shared by all features
        for features in feature_infos:
            # assert that fbank is the default feature type
            assert features.type == 'fbank'
            # assert that duration is always a multiple of frame_shift
            assert features.num_frames == round(features.duration /
                                                features.frame_shift)
            # assert that num_features is preserved
            assert features.num_features == builder.feature_extractor.config.num_mel_bins
            # assert that the storage type metadata matches
            assert features.storage_type == storage.name
            # assert that the metadata is consistent with the data shapes
            arr = features.load()
            assert arr.shape[0] == features.num_frames
            assert arr.shape[1] == features.num_features

        # Assert the properties for recordings of duration 0.5 seconds
        for features in feature_infos[:2]:
            assert features.num_frames == 50
            assert features.duration == 0.5

        # Assert the properties for recordings of duration 1.0 seconds
        for features in feature_infos[2:]:
            assert features.num_frames == 100
            assert features.duration == 1.0
Example #27
def make_feats(
        audio_manifest: Pathlike,
        output_dir: Pathlike,
        segmentation_manifest: Optional[Pathlike],
        # TODO: augmentation manifest should specify a number of transforms and probability of their application
        # e.g.:
        # "add_noise", "prob": 0.5, "noise_recordings": ["path1.wav", "path2.wav"]
        # "reverberate", "prob": 0.2, "rirs": ["rir1.wav", "rir2.wav"] (or however the RIRs are stored like... can be params for simulation)
        augmentation_manifest: Optional[Pathlike],
        feature_manifest: Optional[Pathlike],
        compressed: bool,
        lilcom_tick_power: int,
        root_dir: Optional[Pathlike],
        num_jobs: int):
    """
    Extract features for recordings in a given AUDIO_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    audio_set = RecordingSet.from_yaml(audio_manifest)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest) if
                         feature_manifest is not None else FeatureExtractor())

    # TODO: to be used (actually, only the segmentation info will be used, and all supervision info will be ignored)
    supervision_set = (SupervisionSet.from_yaml(segmentation_manifest)
                       if segmentation_manifest is not None else None)

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    feature_set_builder = FeatureSetBuilder(
        feature_extractor=feature_extractor,
        output_dir=output_dir,
        root_dir=root_dir,
        augmentation_manifest=augmentation_manifest)
    feature_set_builder.process_and_store_recordings(
        recordings=audio_set,
        segmentation=None,  # TODO: implement and use
        compressed=compressed,
        lilcom_tick_power=lilcom_tick_power,
        num_jobs=num_jobs)
Example #28
def prepare_audio_grouped(
    audio_paths: List[Pathlike],
    channel_to_idx_map: Dict[str, Dict[str, int]] = None,
) -> RecordingSet:

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-2], audio_paths)

    if channel_to_idx_map is None:
        channel_to_idx_map = defaultdict(dict)
    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(),
                                            desc="Preparing audio"):
        if session_name not in channel_to_idx_map:
            channel_to_idx_map[session_name] = {
                c: idx
                for idx, c in enumerate(["chanE", "chanF", "chan6", "chan7"])
            }
        audio_sf, samplerate = read_sph(channel_paths[0])

        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=[
                            channel_to_idx_map[session_name][audio_path.stem]
                        ],
                        source=str(audio_path),
                    ) for audio_path in sorted(channel_paths)
                    if audio_path.stem in channel_to_idx_map[session_name]
                ],
                sampling_rate=samplerate,
                num_samples=audio_sf.shape[1],
                duration=audio_sf.shape[1] / samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #29
def prepare_audio_grouped(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(),
                                            desc="Processing audio files"):
        audio_sf = sf.SoundFile(str(channel_paths[0]))

        sources = []
        all_mono = True
        for idx, audio_path in enumerate(sorted(channel_paths)):
            audio = sf.SoundFile(str(audio_path))
            if audio.channels > 1:
                logging.warning(
                    f"Skipping recording {session_name} since it has a stereo channel"
                )
                all_mono = False
                break
            sources.append(
                AudioSource(type="file",
                            channels=[idx],
                            source=str(audio_path)))

        if not all_mono:
            continue

        recordings.append(
            Recording(
                id=session_name,
                sources=sources,
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #30
    def __init__(
        self,
        cuts: CutSet,
        uem: Optional[SupervisionSet] = None,
        min_speaker_dim: Optional[int] = None,
        global_speaker_ids: bool = False,
    ) -> None:
        super().__init__()
        validate(cuts)
        if not uem:
            self.cuts = cuts
        else:
            # We use the `overlap` method in intervaltree to get overlapping regions
            # between the supervision segments and the UEM segments
            recordings = RecordingSet(
                {c.recording.id: c.recording for c in cuts if c.has_recording}
            )
            uem_intervals = CutSet.from_manifests(
                recordings=recordings,
                supervisions=uem,
            ).index_supervisions()
            supervisions = []
            for cut_id, tree in cuts.index_supervisions().items():
                if cut_id not in uem_intervals:
                    supervisions += [it.data for it in tree]
                    continue
                supervisions += {
                    it.data.trim(it.end, start=it.begin)
                    for uem_it in uem_intervals[cut_id]
                    for it in tree.overlap(begin=uem_it.begin, end=uem_it.end)
                }
            self.cuts = CutSet.from_manifests(
                recordings=recordings,
                supervisions=SupervisionSet.from_segments(supervisions),
            )
        self.speakers = (
            {spk: idx for idx, spk in enumerate(self.cuts.speakers)}
            if global_speaker_ids
            else None
        )
        self.min_speaker_dim = min_speaker_dim