Example 1
import warnings
from typing import Any


def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import Features, MonoCut, Recording, SupervisionSegment
    from lhotse.array import deserialize_array
    from lhotse.cut import MixedCut

    if "shape" in data or "array" in data:
        return deserialize_array(data)
    if "sources" in data:
        return Recording.from_dict(data)
    if "num_features" in data:
        return Features.from_dict(data)
    if "type" not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop("type")
    if cut_type == "MonoCut":
        return MonoCut.from_dict(data)
    if cut_type == "Cut":
        warnings.warn(
            "Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. "
            "Please re-generate it with Lhotse v0.8 as it might stop working in a future version "
            "(using manifest.from_file() and then manifest.to_file() should be sufficient)."
        )
        return MonoCut.from_dict(data)
    if cut_type == "MixedCut":
        return MixedCut.from_dict(data)
    raise ValueError(
        f"Unexpected cut type during deserialization: '{cut_type}'")
Example 2
def cut_with_relative_paths():
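    # NOTE: this variant targets the pre-v0.8 API, where MonoCut was still
    # named Cut (see the deprecation warning in Example 1); Example 11 shows
    # the same construction for newer versions, with an explicit frame_shift.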
    return Cut('cut', 0, 10, 0,
               features=Features(type='fbank', num_frames=1000, num_features=40, sampling_rate=8000,
                                 storage_type='lilcom_files', storage_path='storage_dir', storage_key='feats.llc',
                                 start=0,
                                 duration=10),
               recording=Recording('rec', [AudioSource('file', [0], 'audio.wav')], 8000, 80000, 10.0)
               )
Example 3
def _upload_one(item: Features, url: str) -> Features:
    feats_mtx = item.load()
    feats_writer = LilcomURLWriter(url)
    new_key = feats_writer.write(key=item.storage_key, value=feats_mtx)
    return fastcopy(item,
                    storage_path=url,
                    storage_key=new_key,
                    storage_type=feats_writer.name)
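A hedged usage sketch (the URL and the feature_set variable are placeholders, not part of the original snippet): re-point every Features manifest in an existing FeatureSet at the remote lilcom store by mapping _upload_one over it.

url = "https://example.com/feats"
# FeatureSet.from_features accepts any iterable of Features manifests.
uploaded = FeatureSet.from_features(_upload_one(f, url) for f in feature_set)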
Example 4
def test_validate_features_consistent_num_frames_does_not_raise():
    manifest = Features(
        type='irrelevant',
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=1.0,
        storage_type='irrelevant',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    validate_features(manifest)
Example 5
def test_validate_features_inconsistent_num_frames_raises():
    manifest = Features(
        type='irrelevant',
        num_frames=101,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=1.0,
        storage_type='irrelevant',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    with pytest.raises(AssertionError):
        validate_features(manifest)
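The pair of tests above pins down the consistency check: num_frames must agree with duration / frame_shift (100 frames for 1.0 s at a 0.01 s shift, while 101 trips the assertion). A hedged sketch of that arithmetic, not Lhotse's exact implementation:

def expected_num_frames(duration: float, frame_shift: float) -> int:
    # Frame count implied by the manifest's duration and frame shift.
    return round(duration / frame_shift)

assert expected_num_frames(1.0, 0.01) == 100   # matches num_frames=100, passes
assert expected_num_frames(1.0, 0.01) != 101   # num_frames=101 raises AssertionError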
Example 6
def feature_set():
    return FeatureSet(features=[
        Features(recording_id='irrelevant',
                 channels=0,
                 start=0.0,
                 duration=20.0,
                 type='fbank',
                 num_frames=2000,
                 num_features=20,
                 frame_shift=0.01,
                 sampling_rate=16000,
                 storage_type='lilcom',
                 storage_path='/irrelevant/',
                 storage_key='path.llc')
    ])
Example 7
def cut_set():
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type="fbank",
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type="lilcom",
            storage_path="irrelevant",
            storage_key="irrelevant",
        ),
        recording=Recording(
            id="rec-1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[
                AudioSource(type="file", channels=[0], source="irrelevant")
            ],
        ),
        supervisions=[
            SupervisionSegment(id="sup-1",
                               recording_id="irrelevant",
                               start=0.5,
                               duration=6.0),
            SupervisionSegment(id="sup-2",
                               recording_id="irrelevant",
                               start=7.0,
                               duration=2.0),
        ],
    )
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id="cut-nosup", supervisions=[]),
        fastcopy(cut, id="cut-norec", recording=None),
        fastcopy(cut, id="cut-nofeat", features=None),
        cut.pad(duration=30.0, direction="left"),
        cut.pad(duration=30.0, direction="right"),
        cut.pad(duration=30.0, direction="both"),
        cut.mix(cut, offset_other_by=5.0, snr=8),
    ])
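An illustrative downstream use of the fixture above (CutSet.filter and the has_features property are Lhotse API; calling the fixture as a plain function is an assumption for this sketch):

cuts = cut_set()
# Keep only cuts that still have a Features manifest attached
# (drops the "cut-nofeat" variant created above).
with_feats = cuts.filter(lambda c: c.has_features)
for c in with_feats:
    print(c.id, c.duration)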
Example 8
def cut_set():
    cut = Cut(id='cut-1',
              start=0.0,
              duration=10.0,
              channel=0,
              features=Features(
                  type='fbank',
                  num_frames=100,
                  num_features=40,
                  frame_shift=0.01,
                  sampling_rate=16000,
                  start=0.0,
                  duration=10.0,
                  storage_type='lilcom',
                  storage_path='irrelevant',
                  storage_key='irrelevant',
              ),
              recording=Recording(id='rec-1',
                                  sampling_rate=16000,
                                  num_samples=160000,
                                  duration=10.0,
                                  sources=[
                                      AudioSource(type='file',
                                                  channels=[0],
                                                  source='irrelevant')
                                  ]),
              supervisions=[
                  SupervisionSegment(id='sup-1',
                                     recording_id='irrelevant',
                                     start=0.5,
                                     duration=6.0),
                  SupervisionSegment(id='sup-2',
                                     recording_id='irrelevant',
                                     start=7.0,
                                     duration=2.0)
              ])
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id='cut-nosup', supervisions=[]),
        fastcopy(cut, id='cut-norec', recording=None),
        fastcopy(cut, id='cut-nofeat', features=None),
        cut.pad(duration=30.0, direction='left'),
        cut.pad(duration=30.0, direction='right'),
        cut.pad(duration=30.0, direction='both'),
        cut.mix(cut, offset_other_by=5.0, snr=8)
    ])
Example 9
def feature_set():
    return FeatureSet(features=[
        Features(
            recording_id="irrelevant",
            channels=0,
            start=0.0,
            duration=20.0,
            type="fbank",
            num_frames=2000,
            num_features=20,
            frame_shift=0.01,
            sampling_rate=16000,
            storage_type="lilcom",
            storage_path="/irrelevant/",
            storage_key="path.llc",
        )
    ])
Example 10
def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import Cut, Features, Recording, SupervisionSegment
    from lhotse.cut import MixedCut
    data = arr2list_recursive(data)
    if 'sources' in data:
        return Recording.from_dict(data)
    if 'num_features' in data:
        return Features.from_dict(data)
    if 'type' not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop('type')
    if cut_type == 'Cut':
        return Cut.from_dict(data)
    if cut_type == 'MixedCut':
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")
Example 11
def cut_with_relative_paths():
    return MonoCut(
        "cut",
        0,
        10,
        0,
        features=Features(
            type="fbank",
            num_frames=1000,
            num_features=40,
            sampling_rate=8000,
            storage_type="lilcom_files",
            storage_path="storage_dir",
            storage_key="feats.llc",
            start=0,
            duration=10,
            frame_shift=0.01,
        ),
        recording=Recording("rec", [AudioSource("file", [0], "audio.wav")],
                            8000, 80000, 10.0),
    )
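A hypothetical helper (not a Lhotse API) sketching one way to resolve the relative storage and audio paths above against a chosen root directory, reusing the fastcopy pattern from the other examples:

from lhotse.utils import fastcopy


def with_root(cut, root: str):
    # Return a copy of the cut whose Features and AudioSource paths are
    # prefixed with the given root directory; the originals are untouched.
    return fastcopy(
        cut,
        features=fastcopy(
            cut.features, storage_path=f"{root}/{cut.features.storage_path}"
        ),
        recording=fastcopy(
            cut.recording,
            sources=[
                fastcopy(s, source=f"{root}/{s.source}")
                for s in cut.recording.sources
            ],
        ),
    )


abs_cut = with_root(cut_with_relative_paths(), "/data/corpus")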
Example 12
def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import MonoCut, Features, Recording, SupervisionSegment
    from lhotse.cut import MixedCut
    data = arr2list_recursive(data)
    if 'sources' in data:
        return Recording.from_dict(data)
    if 'num_features' in data:
        return Features.from_dict(data)
    if 'type' not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop('type')
    if cut_type == 'MonoCut':
        return MonoCut.from_dict(data)
    if cut_type == 'Cut':
        warnings.warn('Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. '
                      'Please re-generate it with Lhotse v0.8 as it might stop working in a future version '
                      '(using manifest.from_file() and then manifest.to_file() should be sufficient).')
        return MonoCut.from_dict(data)
    if cut_type == 'MixedCut':
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")
Example 13
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might not be handled yet.
    In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=int(durations[recording_id] * sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(id=segment_id,
                               recording_id=recording_id,
                               start=float(start),
                               duration=float(end) - float(start),
                               channel=0,
                               text=texts[segment_id],
                               language=languages[segment_id],
                               speaker=speakers[segment_id],
                               gender=genders[speakers[segment_id]])
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(type='kaldiio',
                         num_frames=mat.shape[0],
                         num_features=mat.shape[1],
                         frame_shift=frame_shift,
                         sampling_rate=sampling_rate,
                         start=0,
                         duration=mat.shape[0] * frame_shift,
                         storage_type=KaldiReader.name,
                         storage_path=str(feats_scp),
                         storage_key=utt_id,
                         recording_id=supervision_set[utt_id].recording_id
                         if supervision_set is not None else utt_id,
                         channels=0)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp)))
        else:
            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
                          f"frame_shift must be not None. "
                          f"Feature import omitted.")

    return recording_set, supervision_set, feature_set
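An illustrative call (paths are placeholders): convert a Kaldi data directory and persist the resulting manifests; the FeatureSet is only produced when kaldiio is installed and frame_shift is given.

recordings, supervisions, features = load_kaldi_data_dir(
    'data/train', sampling_rate=16000, frame_shift=0.01
)
recordings.to_file('recordings.jsonl.gz')
if supervisions is not None:
    supervisions.to_file('supervisions.jsonl.gz')
if features is not None:
    features.to_file('features.jsonl.gz')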
Example 14
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.

    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisonSegment IDs to underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1]
                    if path_or_cmd.endswith("|")
                    else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [sup_string.strip().split() for sup_string in f]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate
                ),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            import kaldi_native_io
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None
                    else utt_id,
                    channels=0,
                )
                for utt_id, mat_shape in kaldi_native_io.SequentialMatrixShapeReader(
                    f"scp:{feats_scp}"
                )
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must be not None. "
                "Feature import omitted."
            )

    return recording_set, supervision_set, feature_set
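An illustrative call of the newer signature (values are placeholders): probe recording durations with several workers and map a custom separator in supervision and speaker IDs to underscores.

recordings, supervisions, _ = load_kaldi_data_dir(
    "data/train",
    sampling_rate=16000,
    frame_shift=0.01,
    map_string_to_underscores="-",
    num_jobs=4,
)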