Esempio n. 1
0
def test_write_read_multiark_sequential(tmpdir, endian):
    """Write two ark files indexed by one scp, then read them back sequentially.

    The first ``save_ark`` call creates ``b.scp``; the second call receives the
    same scp opened in append mode, so the index ends up referencing both
    ``a.ark`` and ``b.ark``.
    """
    workdir = tmpdir.mkdir("test")
    scp_path = workdir.join("b.scp").strpath

    # Initial content: two float32 matrices stored under non-ASCII keys.
    expected = {
        "Ï,é,à": np.random.rand(1000, 120).astype(np.float32),
        "あいうえお": np.random.rand(10, 120).astype(np.float32),
    }
    kaldiio.save_ark(
        workdir.join("a.ark").strpath, expected, scp=scp_path, endian=endian
    )

    # Grow the reference dict; the second archive receives every entry of it.
    expected["c"] = np.random.rand(1000, 120).astype(np.float32)
    expected["d"] = np.random.rand(10, 120).astype(np.float32)
    with io.open(scp_path, "a", encoding="utf-8") as scp_file:
        kaldiio.save_ark(
            workdir.join("b.ark").strpath, expected, scp=scp_file, endian=endian
        )

    loaded = dict(kaldiio.load_scp_sequential(scp_path, endian=endian))
    _compare(loaded, expected)
Esempio n. 2
0
def test_write_read_multiark_sequential(tmpdir, endian):
    """Check a sequential scp read across two ark files written one after another.

    ``b.scp`` is created by the first ``save_ark`` call and then extended by the
    second one through a file object opened in append mode.
    """
    root = tmpdir.mkdir('test')
    index_path = root.join('b.scp').strpath

    # Two float32 matrices under non-ASCII (unicode) keys.
    first = np.random.rand(1000, 120).astype(np.float32)
    second = np.random.rand(10, 120).astype(np.float32)
    reference = {u'Ï,é,à': first, u'あいうえお': second}
    kaldiio.save_ark(
        root.join('a.ark').strpath, reference, scp=index_path, endian=endian)

    # Add two more entries; the whole dict is written to the second archive.
    third = np.random.rand(1000, 120).astype(np.float32)
    fourth = np.random.rand(10, 120).astype(np.float32)
    reference[u'c'] = third
    reference[u'd'] = fourth
    with io.open(index_path, 'a', encoding='utf-8') as index_file:
        kaldiio.save_ark(
            root.join('b.ark').strpath, reference, scp=index_file, endian=endian)

    restored = dict(kaldiio.load_scp_sequential(index_path, endian=endian))
    _compare(restored, reference)
Esempio n. 3
0
def get_cmvn_dict(cmvnscp):
    """Read every entry of a CMVN scp file into a dictionary.

    Args:
        cmvnscp: scp file handed to ``kaldiio.load_scp_sequential``.

    Returns:
        dict: maps each key yielded by the reader (named ``spkid`` by the
        original author) to the value stored for it. When a key occurs more
        than once, the last occurrence wins.
    """
    return dict(kaldiio.load_scp_sequential(cmvnscp))
Esempio n. 4
0
def test_write_read_sequential(tmpdir, endian):
    """Write one ark/scp pair and verify a sequential scp read returns the same data."""
    root = tmpdir.mkdir('test')
    index_path = root.join('b.scp').strpath

    # Two float32 matrices under non-ASCII (unicode) keys.
    first = np.random.rand(1000, 120).astype(np.float32)
    second = np.random.rand(10, 120).astype(np.float32)
    reference = {u'Ï,é,à': first, u'あいうえお': second}
    kaldiio.save_ark(
        root.join('a.ark').strpath, reference, scp=index_path, endian=endian)

    restored = dict(kaldiio.load_scp_sequential(index_path, endian=endian))
    _compare(restored, reference)
Esempio n. 5
0
def test_write_read_sequential(tmpdir, endian):
    """Round-trip two matrices through ``save_ark`` and ``load_scp_sequential``."""
    workdir = tmpdir.mkdir("test")
    scp_path = workdir.join("b.scp").strpath

    # Reference data: float32 matrices stored under non-ASCII keys.
    expected = {
        "Ï,é,à": np.random.rand(1000, 120).astype(np.float32),
        "あいうえお": np.random.rand(10, 120).astype(np.float32),
    }
    kaldiio.save_ark(
        workdir.join("a.ark").strpath, expected, scp=scp_path, endian=endian
    )

    loaded = dict(kaldiio.load_scp_sequential(scp_path, endian=endian))
    _compare(loaded, expected)
Esempio n. 6
0
def main():
    """Compute global mean / variance of features over wav scp files and save them.

    For every utterance in every scp listed in ``args.scp`` the waveform is
    converted with ``wav2float``, passed through ``ASRFeature`` and folded into
    frame-weighted running estimates of E[x] and E[x^2]. The resulting
    ``mean``, ``stddev`` and ``var`` tensors are printed and stored with
    ``torch.save`` at ``args.out``.

    Fixes over the previous version:
      * the frame counter ``tsum`` is now actually accumulated, so statistics
        cover all utterances instead of only the last one;
      * the per-utterance *sum* is no longer multiplied by the frame count;
      * variance is ``E[x^2] - E[x]^2`` (the sign was inverted, which made
        ``sqrt`` return NaN).
    """
    args = get_parser().parse_args()
    feat = ASRFeature(odim=args.dim, n_subsample=0)
    mean = torch.zeros(args.dim)   # running E[x]
    mean2 = torch.zeros(args.dim)  # running E[x^2]
    tsum = 0                       # number of frames accumulated so far
    for scp in args.scp:
        # Random-access loader is used only to learn the number of entries.
        n = len(kaldiio.load_scp(scp))
        reader = kaldiio.load_scp_sequential(scp)
        for i, (_utt_id, (_rate, wav)) in enumerate(reader):
            print(f"progress: {i} / {n}")
            # TODO(karita) batch processing
            wav = torch.from_numpy(wav2float(wav))
            # presumably (frames, dim) after dropping the batch axis -- TODO confirm
            fbank = feat(wav.unsqueeze(0))[0]
            t = fbank.shape[0]
            if t == 0:
                # Nothing to accumulate; also avoids 0/0 on the first utterance.
                continue
            total = t + tsum
            mean = (fbank.sum(0) + tsum * mean) / total
            mean2 = ((fbank * fbank).sum(0) + tsum * mean2) / total
            tsum = total

    # Clamp guards against tiny negative values caused by floating-point rounding.
    var = (mean2 - mean * mean).clamp(min=0)
    stats = dict(mean=mean, stddev=var.sqrt(), var=var)
    print(stats)
    torch.save(stats, args.out)
Esempio n. 7
0
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Convert a Kaldi data directory into Lhotse manifests.

    ``wav.scp`` and ``reco2dur`` must be present. A SupervisionSet is built only
    when a ``segments`` file exists; ``text``, ``utt2spk``, ``spk2gender`` and
    ``utt2lang`` are looked up for each segment. A FeatureSet is built from
    ``feats.scp`` only when that file exists, the ``kaldiio`` module is
    available and ``frame_shift`` is given; if ``frame_shift`` is None a warning
    is emitted and features are skipped.

    :param path: the Kaldi data directory.
    :param sampling_rate: sampling rate assigned to every recording and feature entry.
    :param frame_shift: frame shift used to derive feature durations.
    :return: a ``(recording_set, supervision_set, feature_set)`` tuple; the last
        two items may be None.
    :raises ValueError: when ``reco2dur`` is missing.
    """
    data_dir = Path(path)
    assert data_dir.is_dir()

    # must exist for RecordingSet
    wav_entries = load_kaldi_text_mapping(data_dir / 'wav.scp', must_exist=True)

    reco2dur_path = data_dir / 'reco2dur'
    if not reco2dur_path.is_file():
        raise ValueError(
            f"No such file: '{reco2dur_path}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    # Recordings absent from reco2dur fall back to a duration of 0.0.
    durations = defaultdict(float)
    with reco2dur_path.open() as dur_file:
        for entry in dur_file:
            reco_id, seconds = entry.strip().split()
            durations[reco_id] = float(seconds)

    def make_recording(reco_id, path_or_cmd):
        # A trailing '|' marks a shell pipeline rather than a plain file path.
        is_pipe = path_or_cmd.endswith('|')
        audio = AudioSource(
            type='command' if is_pipe else 'file',
            channels=[0],
            source=path_or_cmd[:-1] if is_pipe else path_or_cmd,
        )
        return Recording(
            id=reco_id,
            sources=[audio],
            sampling_rate=sampling_rate,
            num_samples=int(durations[reco_id] * sampling_rate),
            duration=durations[reco_id],
        )

    recording_set = RecordingSet.from_recordings(
        make_recording(reco_id, path_or_cmd)
        for reco_id, path_or_cmd in wav_entries.items()
    )

    supervision_set = None
    segments_path = data_dir / 'segments'
    if segments_path.is_file():
        with segments_path.open() as seg_file:
            segment_rows = [row.strip().split() for row in seg_file]

        texts = load_kaldi_text_mapping(data_dir / 'text')
        speakers = load_kaldi_text_mapping(data_dir / 'utt2spk')
        genders = load_kaldi_text_mapping(data_dir / 'spk2gender')
        languages = load_kaldi_text_mapping(data_dir / 'utt2lang')

        def make_supervision(segment_id, reco_id, start, end):
            return SupervisionSegment(
                id=segment_id,
                recording_id=reco_id,
                start=float(start),
                duration=float(end) - float(start),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=speakers[segment_id],
                # Gender is keyed by speaker, not by segment.
                gender=genders[speakers[segment_id]],
            )

        supervision_set = SupervisionSet.from_segments(
            make_supervision(segment_id, reco_id, start, end)
            for segment_id, reco_id, start, end in segment_rows
        )

    feature_set = None
    feats_path = data_dir / 'feats.scp'
    if feats_path.exists() and is_module_available('kaldiio'):
        if frame_shift is None:
            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
                          f"frame_shift must be not None. "
                          f"Feature import omitted.")
        else:
            import kaldiio
            from lhotse.features.io import KaldiReader

            def make_features(utt_id, mat):
                n_frames = mat.shape[0]
                return Features(
                    type='kaldiio',
                    num_frames=n_frames,
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=n_frames * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_path),
                    storage_key=utt_id,
                    # Without supervisions the utterance id doubles as recording id.
                    recording_id=(supervision_set[utt_id].recording_id
                                  if supervision_set is not None else utt_id),
                    channels=0,
                )

            feature_set = FeatureSet.from_features(
                make_features(utt_id, mat)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_path))
            )

    return recording_set, supervision_set, feature_set
Esempio n. 8
0
def main():
    """Command-line entry point: build a wav scp and ``utt2num_samples`` in ``outdir``.

    Reads the input scp given on the command line and, for each utterance,
    optionally selects reference channels (``--ref-channels`` or
    ``--utt2ref-channels``) and resamples to ``--fs``. Two modes exist:

    * ``--segments`` given: utterances are read through
      ``kaldiio.load_scp_sequential`` and written through ``SoundScpWriter``.
    * otherwise: the scp is parsed line by line; piped entries (ending in
      ``|``) and entries needing conversion are written with ``soundfile``
      under ``outdir/data_<name>``, while files that need no change are
      referenced by their original path.

    In both modes one ``"<uttid> <num_samples>"`` line per utterance is written
    to ``outdir/utt2num_samples``.
    """
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    parser = argparse.ArgumentParser(
        description='Create waves list from "wav.scp"',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("scp")
    parser.add_argument("outdir")
    parser.add_argument(
        "--name",
        default="wav",
        help="Specify the prefix word of output file name "
        'such as "wav.scp"',
    )
    parser.add_argument("--segments", default=None)
    parser.add_argument(
        "--fs",
        type=humanfriendly_or_none,
        default=None,
        help="If the sampling rate specified, "
        "Change the sampling rate.",
    )
    parser.add_argument("--audio-format", default="wav")
    # The two channel-selection options cannot be combined.
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--ref-channels", default=None, type=str2int_tuple)
    group.add_argument("--utt2ref-channels", default=None, type=str)
    args = parser.parse_args()

    out_num_samples = Path(args.outdir) / f"utt2num_samples"

    # Build `utt2ref_channels`: a callable mapping an utterance id to the
    # channel indices to keep, or None when no channel selection was requested.
    if args.ref_channels is not None:

        def utt2ref_channels(x) -> Tuple[int, ...]:
            # Same channels for every utterance; the id is ignored.
            return args.ref_channels

    elif args.utt2ref_channels is not None:
        utt2ref_channels_dict = read_2column_text(args.utt2ref_channels)

        def utt2ref_channels(x, d=utt2ref_channels_dict) -> Tuple[int, ...]:
            # Per-utterance channels stored as a whitespace-separated string.
            chs_str = d[x]
            return tuple(map(int, chs_str.split()))

    else:
        utt2ref_channels = None

    if args.segments is not None:
        # Note: kaldiio supports only wav-pcm-int16le file.
        loader = kaldiio.load_scp_sequential(args.scp, segments=args.segments)
        with SoundScpWriter(
                args.outdir,
                Path(args.outdir) / f"{args.name}.scp",
                format=args.audio_format,
        ) as writer, out_num_samples.open("w") as fnum_samples:
            for uttid, (rate, wave) in tqdm(loader):
                # wave: (Time,) or (Time, Nmic)
                if wave.ndim == 2 and utt2ref_channels is not None:
                    wave = wave[:, utt2ref_channels(uttid)]

                if args.fs is not None and args.fs != rate:
                    # FIXME(kamo): To use sox?
                    # NOTE(review): the cast back to int16 below is not preceded
                    # by clipping -- confirm resampled peaks cannot overflow.
                    wave = resampy.resample(wave.astype(np.float64),
                                            rate,
                                            args.fs,
                                            axis=0)
                    wave = wave.astype(np.int16)
                    rate = args.fs
                writer[uttid] = rate, wave
                fnum_samples.write(f"{uttid} {len(wave)}\n")
    else:
        # Converted audio goes to outdir/data_<name>/<uttid>.<audio_format>.
        wavdir = Path(args.outdir) / f"data_{args.name}"
        wavdir.mkdir(parents=True, exist_ok=True)
        out_wavscp = Path(args.outdir) / f"{args.name}.scp"

        with Path(args.scp).open("r") as fscp, out_wavscp.open(
                "w") as fout, out_num_samples.open("w") as fnum_samples:
            for line in tqdm(fscp):
                # Split on the first whitespace run only; the rest of the line
                # (a path or a piped command) is kept intact.
                uttid, wavpath = line.strip().split(None, 1)

                if wavpath.endswith("|"):
                    # Streaming input e.g. cat a.wav |
                    with kaldiio.open_like_kaldi(wavpath, "rb") as f:
                        # Buffer the whole stream in memory before handing it
                        # to soundfile.
                        with BytesIO(f.read()) as g:
                            wave, rate = soundfile.read(g, dtype=np.int16)
                            if wave.ndim == 2 and utt2ref_channels is not None:
                                wave = wave[:, utt2ref_channels(uttid)]

                        if args.fs is not None and args.fs != rate:
                            # FIXME(kamo): To use sox?
                            wave = resampy.resample(wave.astype(np.float64),
                                                    rate,
                                                    args.fs,
                                                    axis=0)
                            wave = wave.astype(np.int16)
                            rate = args.fs

                        # Piped input is always materialised as a new file.
                        owavpath = str(wavdir / f"{uttid}.{args.audio_format}")
                        soundfile.write(owavpath, wave, rate)
                        fout.write(f"{uttid} {owavpath}\n")
                else:
                    wave, rate = soundfile.read(wavpath, dtype=np.int16)
                    # Decide whether the original file can be referenced as is:
                    # only when no channel selection applies, the extension
                    # already matches --audio-format and no resampling is needed.
                    if wave.ndim == 2 and utt2ref_channels is not None:
                        wave = wave[:, utt2ref_channels(uttid)]
                        save_asis = False

                    elif Path(wavpath).suffix == "." + args.audio_format and (
                            args.fs is None or args.fs == rate):
                        save_asis = True

                    else:
                        save_asis = False

                    if save_asis:
                        # Neither --segments nor --fs are specified and
                        # the line doesn't end with "|",
                        # i.e. not using unix-pipe,
                        # only in this case,
                        # just using the original file as is.
                        fout.write(f"{uttid} {wavpath}\n")
                    else:
                        if args.fs is not None and args.fs != rate:
                            # FIXME(kamo): To use sox?
                            wave = resampy.resample(wave.astype(np.float64),
                                                    rate,
                                                    args.fs,
                                                    axis=0)
                            wave = wave.astype(np.int16)
                            rate = args.fs

                        owavpath = str(wavdir / f"{uttid}.{args.audio_format}")
                        soundfile.write(owavpath, wave, rate)
                        fout.write(f"{uttid} {owavpath}\n")
                # Written for every entry, piped or not, using the final
                # (channel-selected / resampled) waveform length.
                fnum_samples.write(f"{uttid} {len(wave)}\n")
Esempio n. 9
0
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Convert a Kaldi data directory into Lhotse manifests.

    ``wav.scp`` must exist; the duration of each of its entries is obtained by
    calling ``get_duration`` in a process pool. A SupervisionSet is built only
    when a ``segments`` file exists; ``text``, ``utt2spk``, ``spk2gender`` and
    ``utt2lang`` are looked up for each segment. A FeatureSet is built from
    ``feats.scp`` only when that file exists, the ``kaldiio`` module is
    available and ``frame_shift`` is given; if ``frame_shift`` is None a warning
    is emitted and features are skipped.

    :param path: the Kaldi data directory.
    :param sampling_rate: sampling rate assigned to every recording and feature entry.
    :param frame_shift: frame shift used to derive feature durations.
    :param map_string_to_underscores: optional string; when given, every
        occurrence of it in supervision segment IDs and speaker IDs is replaced
        with an underscore.
    :param num_jobs: size of the process pool used to compute durations.
    :return: a ``(recording_set, supervision_set, feature_set)`` tuple; the last
        two items may be None.
    """
    data_dir = Path(path)
    assert data_dir.is_dir()

    def fix_id(raw_id: str) -> str:
        # No-op unless a replacement string was requested.
        if map_string_to_underscores is not None:
            return raw_id.replace(map_string_to_underscores, "_")
        return raw_id

    # must exist for RecordingSet
    wav_entries = load_kaldi_text_mapping(data_dir / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as pool:
        computed = pool.map(get_duration, wav_entries.values())
    # Results are paired with recording ids by position.
    durations = dict(zip(wav_entries.keys(), computed))

    def make_recording(reco_id, path_or_cmd):
        # A trailing '|' marks a shell pipeline rather than a plain file path.
        is_pipe = path_or_cmd.endswith("|")
        audio = AudioSource(
            type="command" if is_pipe else "file",
            channels=[0],
            source=path_or_cmd[:-1] if is_pipe else path_or_cmd,
        )
        return Recording(
            id=reco_id,
            sources=[audio],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[reco_id], sampling_rate),
            duration=durations[reco_id],
        )

    recording_set = RecordingSet.from_recordings(
        make_recording(reco_id, path_or_cmd)
        for reco_id, path_or_cmd in wav_entries.items()
    )

    supervision_set = None
    segments_path = data_dir / "segments"
    if segments_path.is_file():
        with segments_path.open() as seg_file:
            segment_rows = [row.strip().split() for row in seg_file]

        texts = load_kaldi_text_mapping(data_dir / "text")
        speakers = load_kaldi_text_mapping(data_dir / "utt2spk")
        genders = load_kaldi_text_mapping(data_dir / "spk2gender")
        languages = load_kaldi_text_mapping(data_dir / "utt2lang")

        def make_supervision(segment_id, reco_id, start, end):
            # Lookups use the raw Kaldi ids; only the emitted ids are rewritten.
            return SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=reco_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate
                ),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            )

        supervision_set = SupervisionSet.from_segments(
            make_supervision(segment_id, reco_id, start, end)
            for segment_id, reco_id, start, end in segment_rows
        )

    feature_set = None
    feats_path = data_dir / "feats.scp"
    if feats_path.exists() and is_module_available("kaldiio"):
        if frame_shift is None:
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must be not None. "
                          "Feature import omitted.")
        else:
            import kaldiio
            from lhotse.features.io import KaldiReader

            def make_features(utt_id, mat):
                n_frames = mat.shape[0]
                return Features(
                    type="kaldiio",
                    num_frames=n_frames,
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=n_frames * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_path),
                    storage_key=utt_id,
                    # Without supervisions the utterance id doubles as recording id.
                    recording_id=(supervision_set[utt_id].recording_id
                                  if supervision_set is not None else utt_id),
                    channels=0,
                )

            feature_set = FeatureSet.from_features(
                make_features(utt_id, mat)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_path))
            )

    return recording_set, supervision_set, feature_set
Esempio n. 10
0
def dataloader(alignments, features, batch_size, shuffle=False):
    """Loading alignments and features from kaldi files.

    All alignments are loaded into memory first; features are then read one
    utterance at a time and grouped into padded batches.

    Args:
        alignments (str): Absolute path of alignments file (Obtained from T2-GMM-HMM task).
        features (str): Absolute path of feats.scp file (Obtained from T2-GMM-HMM task).
        batch_size (int): batch size.
        shuffle (bool): if True, utterances are visited in a random order
            (drawn from NumPy's global RNG) using random access into the scp;
            otherwise the scp is read sequentially in file order.

    Yields:
        tuple: ``(ids, data, target)`` where ``ids`` is the list of utterance
        ids in the batch, ``data`` is the features padded with 0 and ``target``
        is the alignments padded with -1. The final batch may hold fewer than
        ``batch_size`` utterances.

    Raises:
        KeyError: if an utterance in ``features`` has no alignment.
        AssertionError: if alignment and feature lengths differ for an utterance.
    """
    # first load alignments and save them in the dictionary
    align = dict(kaldiio.load_ark(alignments))

    if shuffle:
        # Random access: the lazy scp mapping iterates its keys in file order,
        # so the order has to be permuted explicitly to get any shuffling.
        feats_reader = kaldiio.load_scp(features)
        utt_ids = list(feats_reader)
        np.random.shuffle(utt_ids)
        feats_iter = ((utt_id, feats_reader[utt_id]) for utt_id in utt_ids)
    else:
        # Sequential access in scp order.
        feats_iter = kaldiio.load_scp_sequential(features)

    def collate(ids, feats, aligns):
        # batch_size x max_feature_len x feature_dim
        data = pad_list(feats, pad_value=0.)
        # batch_size x max_target_len (drop the axis added for padding)
        target = pad_list(aligns, pad_value=-1)[:, :, 0]
        return ids, data, target

    feats_buffer = []
    align_buffer = []
    id_buffer = []
    for utt_id, utt_feat in feats_iter:
        utt_align = align[utt_id]
        assert utt_align.shape[0] == utt_feat.shape[0], (
            "alignment/feature length mismatch for utterance %r" % (utt_id,))

        # Extra trailing axis so alignments can go through the same padding helper.
        align_buffer.append(utt_align[:, np.newaxis])
        feats_buffer.append(utt_feat)
        id_buffer.append(utt_id)

        if len(id_buffer) == batch_size:
            yield collate(id_buffer, feats_buffer, align_buffer)
            # Fresh lists: the yielded id list must not be mutated afterwards.
            feats_buffer = []
            align_buffer = []
            id_buffer = []

    # last (possibly smaller) batch
    if feats_buffer:
        yield collate(id_buffer, feats_buffer, align_buffer)