def test_write_read_multiark_sequential(tmpdir, endian): path = tmpdir.mkdir("test") a = np.random.rand(1000, 120).astype(np.float32) b = np.random.rand(10, 120).astype(np.float32) origin = {"Ï,é,à": a, "あいうえお": b} kaldiio.save_ark( path.join("a.ark").strpath, origin, scp=path.join("b.scp").strpath, endian=endian, ) c = np.random.rand(1000, 120).astype(np.float32) d = np.random.rand(10, 120).astype(np.float32) origin.update({"c": c, "d": d}) with io.open(path.join("b.scp").strpath, "a", encoding="utf-8") as f: kaldiio.save_ark(path.join("b.ark").strpath, origin, scp=f, endian=endian) d5 = { k: v for k, v in kaldiio.load_scp_sequential( path.join("b.scp").strpath, endian=endian ) } _compare(d5, origin)
def get_cmvn_dict(cmvnscp):
    cmvn_stats_dict = {}
    cmvn_reader = kaldiio.load_scp_sequential(cmvnscp)
    for spkid, stats in cmvn_reader:
        cmvn_stats_dict[spkid] = stats
    return cmvn_stats_dict

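# Usage sketch (not from the original source): turning the per-speaker CMVN
# statistics returned by get_cmvn_dict() into mean/std arrays for normalization.
# It assumes the standard Kaldi CMVN layout, i.e. each stats matrix has shape
# (2, dim + 1): row 0 holds the feature sums with the frame count in the last
# column, row 1 holds the sums of squares.
import numpy as np

def cmvn_stats_to_mean_std(stats):
    count = stats[0, -1]
    mean = stats[0, :-1] / count
    var = stats[1, :-1] / count - mean ** 2
    return mean, np.sqrt(np.maximum(var, 1e-20))

# cmvn_dict = get_cmvn_dict("data/train/cmvn.scp")  # hypothetical path
# mean, std = cmvn_stats_to_mean_std(cmvn_dict["spk1"])
# normalized = (feats - mean) / std
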
def test_write_read_sequential(tmpdir, endian): path = tmpdir.mkdir("test") a = np.random.rand(1000, 120).astype(np.float32) b = np.random.rand(10, 120).astype(np.float32) origin = {"Ï,é,à": a, "あいうえお": b} kaldiio.save_ark( path.join("a.ark").strpath, origin, scp=path.join("b.scp").strpath, endian=endian, ) d5 = { k: v for k, v in kaldiio.load_scp_sequential( path.join("b.scp").strpath, endian=endian ) } _compare(d5, origin)
def main():
    args = get_parser().parse_args()
    feat = ASRFeature(odim=args.dim, n_subsample=0)
    mean = torch.zeros(args.dim)
    mean2 = torch.zeros(args.dim)
    tsum = 0
    for scp in args.scp:
        n = len(kaldiio.load_scp(scp))
        d = kaldiio.load_scp_sequential(scp)
        for i, (k, v) in enumerate(d):
            print(f"progress: {i} / {n}")
            # TODO(karita) batch processing
            sr, wav = v
            wav = torch.from_numpy(wav2float(wav))
            fbank = feat(wav.unsqueeze(0))[0]
            t = fbank.shape[0]
            # incrementally update the running mean and mean of squares
            # over all frames seen so far
            mean = (fbank.sum(0) + tsum * mean) / (t + tsum)
            mean2 = ((fbank * fbank).sum(0) + tsum * mean2) / (t + tsum)
            tsum += t
    # var(x) = E[x^2] - (E[x])^2
    var = mean2 - mean * mean
    stats = dict(mean=mean, stddev=var.sqrt(), var=var)
    print(stats)
    torch.save(stats, args.out)

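# Sanity-check sketch (not part of the original script) for the streaming
# mean/variance update used in main() above: feeding two chunks through the
# incremental formula should match statistics computed over all frames at once.
import torch

chunks = [torch.randn(7, 4), torch.randn(13, 4)]
mean, mean2, tsum = torch.zeros(4), torch.zeros(4), 0
for x in chunks:
    t = x.shape[0]
    mean = (x.sum(0) + tsum * mean) / (t + tsum)
    mean2 = ((x * x).sum(0) + tsum * mean2) / (t + tsum)
    tsum += t
full = torch.cat(chunks)
assert torch.allclose(mean, full.mean(0), atol=1e-5)
assert torch.allclose(mean2 - mean * mean, full.var(0, unbiased=False), atol=1e-5)
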
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. feats.scp is imported only when kaldiio is available and
    ``frame_shift`` is provided.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=int(durations[recording_id] * sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=segment_id,
                recording_id=recording_id,
                start=float(start),
                duration=float(end) - float(start),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=speakers[segment_id],
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type='kaldiio',
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None else utt_id,
                    channels=0,
                )
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))
            )
        else:
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must not be None. "
                          "Feature import omitted.")

    return recording_set, supervision_set, feature_set

def main():
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    parser = argparse.ArgumentParser(
        description='Create waves list from "wav.scp"',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("scp")
    parser.add_argument("outdir")
    parser.add_argument(
        "--name",
        default="wav",
        help='Specify the prefix word of the output file name such as "wav.scp"',
    )
    parser.add_argument("--segments", default=None)
    parser.add_argument(
        "--fs",
        type=humanfriendly_or_none,
        default=None,
        help="If the sampling rate is specified, change the sampling rate.",
    )
    parser.add_argument("--audio-format", default="wav")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--ref-channels", default=None, type=str2int_tuple)
    group.add_argument("--utt2ref-channels", default=None, type=str)
    args = parser.parse_args()

    out_num_samples = Path(args.outdir) / "utt2num_samples"

    if args.ref_channels is not None:

        def utt2ref_channels(x) -> Tuple[int, ...]:
            return args.ref_channels

    elif args.utt2ref_channels is not None:
        utt2ref_channels_dict = read_2column_text(args.utt2ref_channels)

        def utt2ref_channels(x, d=utt2ref_channels_dict) -> Tuple[int, ...]:
            chs_str = d[x]
            return tuple(map(int, chs_str.split()))

    else:
        utt2ref_channels = None

    if args.segments is not None:
        # Note: kaldiio supports only wav-pcm-int16le file.
        loader = kaldiio.load_scp_sequential(args.scp, segments=args.segments)
        with SoundScpWriter(
            args.outdir,
            Path(args.outdir) / f"{args.name}.scp",
            format=args.audio_format,
        ) as writer, out_num_samples.open("w") as fnum_samples:
            for uttid, (rate, wave) in tqdm(loader):
                # wave: (Time,) or (Time, Nmic)
                if wave.ndim == 2 and utt2ref_channels is not None:
                    wave = wave[:, utt2ref_channels(uttid)]
                if args.fs is not None and args.fs != rate:
                    # FIXME(kamo): To use sox?
                    wave = resampy.resample(
                        wave.astype(np.float64), rate, args.fs, axis=0
                    )
                    wave = wave.astype(np.int16)
                    rate = args.fs
                writer[uttid] = rate, wave
                fnum_samples.write(f"{uttid} {len(wave)}\n")
    else:
        wavdir = Path(args.outdir) / f"data_{args.name}"
        wavdir.mkdir(parents=True, exist_ok=True)
        out_wavscp = Path(args.outdir) / f"{args.name}.scp"
        with Path(args.scp).open("r") as fscp, out_wavscp.open(
            "w"
        ) as fout, out_num_samples.open("w") as fnum_samples:
            for line in tqdm(fscp):
                uttid, wavpath = line.strip().split(None, 1)
                if wavpath.endswith("|"):
                    # Streaming input e.g. cat a.wav |
                    with kaldiio.open_like_kaldi(wavpath, "rb") as f:
                        with BytesIO(f.read()) as g:
                            wave, rate = soundfile.read(g, dtype=np.int16)
                    if wave.ndim == 2 and utt2ref_channels is not None:
                        wave = wave[:, utt2ref_channels(uttid)]
                    if args.fs is not None and args.fs != rate:
                        # FIXME(kamo): To use sox?
                        wave = resampy.resample(
                            wave.astype(np.float64), rate, args.fs, axis=0
                        )
                        wave = wave.astype(np.int16)
                        rate = args.fs
                    owavpath = str(wavdir / f"{uttid}.{args.audio_format}")
                    soundfile.write(owavpath, wave, rate)
                    fout.write(f"{uttid} {owavpath}\n")
                else:
                    wave, rate = soundfile.read(wavpath, dtype=np.int16)
                    if wave.ndim == 2 and utt2ref_channels is not None:
                        wave = wave[:, utt2ref_channels(uttid)]
                        save_asis = False
                    elif Path(wavpath).suffix == "." + args.audio_format and (
                        args.fs is None or args.fs == rate
                    ):
                        save_asis = True
                    else:
                        save_asis = False

                    if save_asis:
                        # The line doesn't use a unix-pipe, no channel selection
                        # or resampling is needed, and the file is already in the
                        # target format; only in this case refer to the original
                        # file as is.
                        fout.write(f"{uttid} {wavpath}\n")
                    else:
                        if args.fs is not None and args.fs != rate:
                            # FIXME(kamo): To use sox?
                            wave = resampy.resample(
                                wave.astype(np.float64), rate, args.fs, axis=0
                            )
                            wave = wave.astype(np.int16)
                            rate = args.fs
                        owavpath = str(wavdir / f"{uttid}.{args.audio_format}")
                        soundfile.write(owavpath, wave, rate)
                        fout.write(f"{uttid} {owavpath}\n")
                fnum_samples.write(f"{uttid} {len(wave)}\n")

def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. feats.scp is imported only when kaldiio is available and
    ``frame_shift`` is provided.

    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisionSegment IDs with underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1]
                    if path_or_cmd.endswith("|")
                    else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [sup_string.strip().split() for sup_string in f]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate
                ),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldiio"):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldiio",
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None
                    else utt_id,
                    channels=0,
                )
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must not be None. "
                "Feature import omitted."
            )

    return recording_set, supervision_set, feature_set

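# Usage sketch (not from the original sources): how load_kaldi_data_dir() is
# typically called. The directory path, sampling rate, and frame shift below
# are hypothetical placeholders.
recordings, supervisions, features = load_kaldi_data_dir(
    "data/train",        # a Kaldi data dir containing at least wav.scp
    sampling_rate=16000,
    frame_shift=0.01,    # pass a frame shift so feats.scp (if present) is imported
)
print(len(recordings), "recordings loaded")
if supervisions is not None:
    print(len(supervisions), "supervision segments loaded")
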
def dataloader(alignments, features, batch_size, shuffle=False):
    """Load alignments and features from Kaldi files.

    Args:
        alignments (str): Absolute path of the alignments file
            (obtained from the T2-GMM-HMM task).
        features (str): Absolute path of the feats.scp file
            (obtained from the T2-GMM-HMM task).
        batch_size (int): Batch size.
        shuffle (bool): Shuffle the data if needed.

    Returns:
        id, data, target (generator): Batched features and alignments with utt_id.
    """
    # first load alignments and save them in a dictionary
    align_reader = kaldiio.load_ark(alignments)
    align = {}
    for (utt_id, utt_align) in align_reader:
        align[utt_id] = utt_align

    # randomly read features and generate batches
    if shuffle:
        feats_reader = kaldiio.load_scp(features)
        data = None
    # sequentially read features and generate batches
    else:
        feats_reader = kaldiio.load_scp_sequential(features)
        data = (None, None)

    batch_idx = 0
    feats_buffer = []
    align_buffer = []
    id_buffer = []
    for data in feats_reader:
        if shuffle:
            utt_id = data
            utt_feat = feats_reader[utt_id]
        else:
            utt_id, utt_feat = data
        assert align[utt_id] is not None
        assert align[utt_id].shape[0] == utt_feat.shape[0]
        align_buffer.append(align[utt_id][:, np.newaxis])
        feats_buffer.append(utt_feat)
        id_buffer.append(utt_id)
        batch_idx += 1
        if batch_idx == batch_size:
            # batch_size x max_feature_len x feature_dim
            data = pad_list(feats_buffer, pad_value=0.)
            # batch_size x max_target_len
            target = pad_list(align_buffer, pad_value=-1)[:, :, 0]
            yield id_buffer, data, target
            batch_idx = 0
            feats_buffer = []
            align_buffer = []
            id_buffer = []

    # last batch
    if len(feats_buffer) > 0:
        # batch_size x max_feature_len x feature_dim
        data = pad_list(feats_buffer, pad_value=0.)
        # batch_size x max_target_len
        target = pad_list(align_buffer, pad_value=-1)[:, :, 0]
        yield id_buffer, data, target

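# Usage sketch (not from the original source): iterating over the batches the
# generator yields. The alignment ark and feats.scp paths are hypothetical.
for utt_ids, feats, targets in dataloader(
    "exp/tri_ali/ali.ark",    # hypothetical alignment ark
    "data/train/feats.scp",   # hypothetical feature scp
    batch_size=16,
    shuffle=False,
):
    # feats: (batch, max_frames, feat_dim); targets: (batch, max_frames)
    print(len(utt_ids), feats.shape, targets.shape)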