def __init__(self, batch_size: int, key_file: str, drop_last: bool = False):
    """Build the list of mini-batches from the keys listed in ``key_file``.

    Args:
        batch_size: Number of keys per batch; must be positive.
        key_file: Path of a two-column text file. Only the first column
            (the key) is used; the second column may hold anything.
        drop_last: If true, every batch is a tuple of ``batch_size``
            consecutive keys and the keys left over after the last full
            batch are discarded. If false, all keys are kept and split
            into contiguous list slices of nearly equal size.

    Raises:
        RuntimeError: If ``key_file`` yields no keys.
    """
    assert check_argument_types()
    assert batch_size > 0
    self.batch_size = batch_size
    self.key_file = key_file
    self.drop_last = drop_last

    # Expected layout of key_file:
    #   uttA <anything is o.k>
    #   uttB <anything is o.k>
    utt2any = read_2column_text(key_file)
    if not utt2any:
        logging.warning(f"{key_file} is empty")

    # Only the first column (the keys) matters here.
    keys = list(utt2any)
    num_keys = len(keys)
    if num_keys == 0:
        raise RuntimeError(f"0 lines found: {key_file}")

    # Always create at least one batch, even with fewer keys than batch_size.
    num_batches = max(num_keys // batch_size, 1)

    batches = []
    if self.drop_last:
        # Fixed-size windows; keys past num_batches * batch_size are dropped.
        # Batches are tuples in this branch.
        for index in range(num_batches):
            start = index * batch_size
            batches.append(tuple(keys[start:start + batch_size]))
    else:
        # Spread all keys as evenly as possible over the batches. When
        # there is more than one batch, each holds at least batch_size keys.
        # Batches are list slices in this branch.
        for index in range(num_batches):
            start = index * num_keys // num_batches
            stop = (index + 1) * num_keys // num_batches
            batches.append(keys[start:stop])
    self.batch_list = batches
def test_read_2column_text(tmp_path: Path):
    """Check that a two-column file is parsed into a key -> value dict."""
    p = tmp_path / "dummy.scp"
    with p.open("w") as f:
        # Plain literals: there is nothing to interpolate in these lines.
        f.write("abc /some/path/a.wav\n")
        f.write("def /some/path/b.wav\n")
    d = read_2column_text(p)
    assert d == {"abc": "/some/path/a.wav", "def": "/some/path/b.wav"}
def _select_ref_channels(wave, uttid, utt2ref_channels):
    """Keep only the reference channels of a multi-channel waveform.

    Args:
        wave: Array whose ``ndim`` is 1 or 2; the second axis, when
            present, is indexed with the selected channels.
        uttid: Utterance id passed to ``utt2ref_channels``.
        utt2ref_channels: Callable mapping an utterance id to a tuple of
            channel indices, or None to disable the selection.

    Returns:
        ``(wave, selected)`` where ``selected`` tells whether the channel
        selection was actually applied.
    """
    if wave.ndim == 2 and utt2ref_channels is not None:
        return wave[:, utt2ref_channels(uttid)], True
    return wave, False


def _resample_to_int16(wave, rate, fs):
    """Resample ``wave`` from ``rate`` to ``fs`` and return int16 samples.

    Args:
        wave: Samples with time along axis 0.
        rate: Current sampling rate of ``wave``.
        fs: Requested sampling rate, or None to keep ``rate``.

    Returns:
        ``(wave, rate)``. The inputs are returned untouched when ``fs`` is
        None or already equal to ``rate``.
    """
    if fs is not None and fs != rate:
        # FIXME(kamo): To use sox?
        resampled = resampy.resample(wave.astype(np.float64), rate, fs, axis=0)
        # Resampling may overshoot the int16 range; saturate before the
        # cast so that out-of-range samples do not wrap around.
        limits = np.iinfo(np.int16)
        resampled = np.clip(resampled, limits.min, limits.max)
        return resampled.astype(np.int16), fs
    return wave, rate


def main():
    """Create a wave list ("<name>.scp") and "utt2num_samples" from a wav.scp.

    Command-line arguments:
        scp: Input scp file.
        outdir: Output directory.
        --name: Prefix of the output scp file name (default "wav").
        --segments: Optional segments file; when given, the input is loaded
            through kaldiio and written with SoundScpWriter.
        --fs: Optional target sampling rate; audio with a different rate
            is resampled.
        --audio-format: Output audio format / file suffix (default "wav").
        --ref-channels / --utt2ref-channels: Mutually exclusive ways of
            choosing the channels kept from multi-channel audio, either
            globally or per utterance.

    Without --segments, each scp line is either a command ending with "|"
    (its output is read as audio) or a file path. A file is referenced
    as is when no channel selection applies, its suffix matches the
    requested format and no resampling is needed; otherwise the audio is
    rewritten under "<outdir>/data_<name>/".
    """
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    parser = argparse.ArgumentParser(
        description='Create waves list from "wav.scp"',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("scp")
    parser.add_argument("outdir")
    parser.add_argument(
        "--name",
        default="wav",
        help="Specify the prefix word of output file name " 'such as "wav.scp"',
    )
    parser.add_argument("--segments", default=None)
    parser.add_argument(
        "--fs",
        type=humanfriendly_or_none,
        default=None,
        help="If the sampling rate specified, " "Change the sampling rate.",
    )
    parser.add_argument("--audio-format", default="wav")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--ref-channels", default=None, type=str2int_tuple)
    group.add_argument("--utt2ref-channels", default=None, type=str)
    args = parser.parse_args()

    out_num_samples = Path(args.outdir) / "utt2num_samples"

    # Build the "utterance id -> reference channels" lookup, if requested.
    if args.ref_channels is not None:

        def utt2ref_channels(x) -> Tuple[int, ...]:
            return args.ref_channels

    elif args.utt2ref_channels is not None:
        utt2ref_channels_dict = read_2column_text(args.utt2ref_channels)

        def utt2ref_channels(x, d=utt2ref_channels_dict) -> Tuple[int, ...]:
            chs_str = d[x]
            return tuple(map(int, chs_str.split()))

    else:
        utt2ref_channels = None

    if args.segments is not None:
        # Note: kaldiio supports only wav-pcm-int16le file.
        loader = kaldiio.load_scp_sequential(args.scp, segments=args.segments)
        with SoundScpWriter(
            args.outdir,
            Path(args.outdir) / f"{args.name}.scp",
            format=args.audio_format,
        ) as writer, out_num_samples.open("w") as fnum_samples:
            for uttid, (rate, wave) in tqdm(loader):
                # wave: (Time,) or (Time, Nmic)
                wave, _ = _select_ref_channels(wave, uttid, utt2ref_channels)
                wave, rate = _resample_to_int16(wave, rate, args.fs)
                writer[uttid] = rate, wave
                fnum_samples.write(f"{uttid} {len(wave)}\n")
    else:
        wavdir = Path(args.outdir) / f"data_{args.name}"
        wavdir.mkdir(parents=True, exist_ok=True)
        out_wavscp = Path(args.outdir) / f"{args.name}.scp"

        with Path(args.scp).open("r") as fscp, out_wavscp.open(
            "w"
        ) as fout, out_num_samples.open("w") as fnum_samples:
            for line in tqdm(fscp):
                uttid, wavpath = line.strip().split(None, 1)

                is_pipe = wavpath.endswith("|")
                if is_pipe:
                    # Streaming input e.g. cat a.wav |
                    with kaldiio.open_like_kaldi(wavpath, "rb") as f:
                        with BytesIO(f.read()) as g:
                            wave, rate = soundfile.read(g, dtype=np.int16)
                else:
                    wave, rate = soundfile.read(wavpath, dtype=np.int16)

                wave, channels_selected = _select_ref_channels(
                    wave, uttid, utt2ref_channels
                )

                # The original file can be referenced as is only when
                # nothing about the audio has to change: it is a plain
                # file (not a unix pipe), no channel selection was
                # applied, the suffix matches the requested format and
                # the sampling rate is already the requested one.
                save_asis = (
                    not is_pipe
                    and not channels_selected
                    and Path(wavpath).suffix == "." + args.audio_format
                    and (args.fs is None or args.fs == rate)
                )

                if save_asis:
                    fout.write(f"{uttid} {wavpath}\n")
                else:
                    wave, rate = _resample_to_int16(wave, rate, args.fs)
                    owavpath = str(wavdir / f"{uttid}.{args.audio_format}")
                    soundfile.write(owavpath, wave, rate)
                    fout.write(f"{uttid} {owavpath}\n")

                fnum_samples.write(f"{uttid} {len(wave)}\n")