def test_SoundScpWriter(tmp_path: Path):
    """Write int16 audio with SoundScpWriter and read it back unchanged.

    Also checks that a 3-D array is rejected with ``RuntimeError`` and that
    ``get_path`` reports ``<outdir>/<key>.wav`` for every written key.
    """
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    desired = {"abc": (16, audio1), "def": (16, audio2)}
    scp_path = tmp_path / "wav.scp"

    with SoundScpWriter(tmp_path, scp_path, dtype=np.int16) as writer:
        for key, rate_and_wave in desired.items():
            writer[key] = rate_and_wave

        # Unsupported dimension
        with pytest.raises(RuntimeError):
            y = np.random.randint(-100, 100, [16, 1, 1], dtype=np.int16)
            writer["ghi"] = 16, y

    target = SoundScpReader(scp_path, normalize=False, dtype=np.int16)
    for key, (expected_rate, expected_wave) in desired.items():
        actual_rate, actual_wave = target[key]
        assert actual_rate == expected_rate
        np.testing.assert_array_equal(actual_wave, expected_wave)

    for key in ("abc", "def"):
        assert writer.get_path(key) == str(tmp_path / f"{key}.wav")
def test_SoundScpWriter_normalize(tmp_path: Path):
    """Write normalized float audio and read it back with ``normalize=True``.

    The inputs are int16 samples divided by ``int16 max + 1``; the test
    expects the reader to return exactly the same float64 values. A 3-D
    array must still be rejected with ``RuntimeError``.
    """
    int16_scale = np.iinfo(np.int16).max + 1
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio1 = audio1.astype(np.float64) / int16_scale
    audio2 = audio2.astype(np.float64) / int16_scale
    desired = {"abc": (16, audio1), "def": (16, audio2)}
    scp_path = tmp_path / "wav.scp"

    with SoundScpWriter(tmp_path, scp_path, dtype=np.int16) as writer:
        for key, rate_and_wave in desired.items():
            writer[key] = rate_and_wave

        # Unsupported dimension
        with pytest.raises(RuntimeError):
            y = np.random.randint(-100, 100, [16, 1, 1], dtype=np.int16)
            writer["ghi"] = 16, y

    target = SoundScpReader(scp_path, normalize=True, dtype=np.float64)
    for key, (expected_rate, expected_wave) in desired.items():
        actual_rate, actual_wave = target[key]
        assert actual_rate == expected_rate
        np.testing.assert_array_equal(actual_wave, expected_wave)
def _resample_int16(wave, rate, fs):
    """Resample ``wave`` from ``rate`` to ``fs`` and return ``(wave, rate)``.

    When ``fs`` is None or already equals ``rate`` the inputs are returned
    untouched. Otherwise the signal is resampled along axis 0 in float64 and
    converted back to int16. The resampled values are saturated to the int16
    range before the cast, so interpolation overshoot cannot wrap around.
    """
    if fs is None or fs == rate:
        return wave, rate

    # FIXME(kamo): To use sox?
    resampled = resampy.resample(wave.astype(np.float64), rate, fs, axis=0)
    int16_info = np.iinfo(np.int16)
    resampled = np.clip(resampled, int16_info.min, int16_info.max)
    return resampled.astype(np.int16), fs


def main():
    """Create a wave list ("<name>.scp") and "utt2num_samples" from a wav.scp.

    Command-line arguments:
        scp: Input scp file.
        outdir: Output directory.
        --name: Prefix of the output scp file name.
        --segments: Optional segments file; when given, the input is loaded
            through ``kaldiio.load_scp_sequential`` and every segment is
            written out with ``SoundScpWriter``.
        --fs: Optional target sampling rate; audio whose rate differs is
            resampled.
        --audio-format: Extension used for the written audio files.
        --ref-channels / --utt2ref-channels: Mutually exclusive ways to pick
            the channels kept from multi-channel audio (one tuple for all
            utterances, or a two-column file mapping utterance id to
            channels).

    Without --segments, a file is referenced as is in the output scp only
    when no channel selection applies, its suffix already matches
    --audio-format, and no resampling is required. Every other entry is
    rewritten under "<outdir>/data_<name>/".
    """
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    parser = argparse.ArgumentParser(
        description='Create waves list from "wav.scp"',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("scp")
    parser.add_argument("outdir")
    parser.add_argument(
        "--name",
        default="wav",
        help="Specify the prefix word of output file name " 'such as "wav.scp"',
    )
    parser.add_argument("--segments", default=None)
    parser.add_argument(
        "--fs",
        type=humanfriendly_or_none,
        default=None,
        help="If the sampling rate specified, " "Change the sampling rate.",
    )
    parser.add_argument("--audio-format", default="wav")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--ref-channels", default=None, type=str2int_tuple)
    group.add_argument("--utt2ref-channels", default=None, type=str)
    args = parser.parse_args()

    out_num_samples = Path(args.outdir) / "utt2num_samples"

    # Build a callable mapping an utterance id to the channels to keep,
    # or None when no channel selection was requested.
    if args.ref_channels is not None:

        def utt2ref_channels(x) -> Tuple[int, ...]:
            return args.ref_channels

    elif args.utt2ref_channels is not None:
        utt2ref_channels_dict = read_2column_text(args.utt2ref_channels)

        def utt2ref_channels(x, d=utt2ref_channels_dict) -> Tuple[int, ...]:
            chs_str = d[x]
            return tuple(map(int, chs_str.split()))

    else:
        utt2ref_channels = None

    if args.segments is not None:
        # Note: kaldiio supports only wav-pcm-int16le file.
        loader = kaldiio.load_scp_sequential(args.scp, segments=args.segments)
        with SoundScpWriter(
            args.outdir,
            Path(args.outdir) / f"{args.name}.scp",
            format=args.audio_format,
        ) as writer, out_num_samples.open("w") as fnum_samples:
            for uttid, (rate, wave) in tqdm(loader):
                # wave: (Time,) or (Time, Nmic)
                if wave.ndim == 2 and utt2ref_channels is not None:
                    wave = wave[:, utt2ref_channels(uttid)]

                wave, rate = _resample_int16(wave, rate, args.fs)
                writer[uttid] = rate, wave
                fnum_samples.write(f"{uttid} {len(wave)}\n")
    else:
        wavdir = Path(args.outdir) / f"data_{args.name}"
        wavdir.mkdir(parents=True, exist_ok=True)
        out_wavscp = Path(args.outdir) / f"{args.name}.scp"

        with Path(args.scp).open("r") as fscp, out_wavscp.open(
            "w"
        ) as fout, out_num_samples.open("w") as fnum_samples:
            for line in tqdm(fscp):
                uttid, wavpath = line.strip().split(None, 1)

                from_pipe = wavpath.endswith("|")
                if from_pipe:
                    # Streaming input e.g. cat a.wav |
                    with kaldiio.open_like_kaldi(wavpath, "rb") as f:
                        with BytesIO(f.read()) as g:
                            wave, rate = soundfile.read(g, dtype=np.int16)
                else:
                    wave, rate = soundfile.read(wavpath, dtype=np.int16)

                select_channels = wave.ndim == 2 and utt2ref_channels is not None
                if select_channels:
                    wave = wave[:, utt2ref_channels(uttid)]

                # Neither --segments nor --fs are specified and
                # the line doesn't end with "|",
                # i.e. not using unix-pipe,
                # only in this case,
                # just using the original file as is.
                save_asis = (
                    not from_pipe
                    and not select_channels
                    and Path(wavpath).suffix == "." + args.audio_format
                    and (args.fs is None or args.fs == rate)
                )

                if save_asis:
                    fout.write(f"{uttid} {wavpath}\n")
                else:
                    wave, rate = _resample_int16(wave, rate, args.fs)
                    owavpath = str(wavdir / f"{uttid}.{args.audio_format}")
                    soundfile.write(owavpath, wave, rate)
                    fout.write(f"{uttid} {owavpath}\n")

                fnum_samples.write(f"{uttid} {len(wave)}\n")
def sound_scp(tmp_path):
    """Create a two-utterance "wav.scp" under ``tmp_path`` and return its path.

    Utterances "a" (160000 samples) and "b" (80000 samples) hold random int16
    audio at a rate of 16000 and are stored under ``tmp_path / "data"``.

    Returns:
        The path of the written scp file as a string.
    """
    scp_path = tmp_path / "wav.scp"
    # Use the writer as a context manager so it is closed (and the scp file
    # flushed) before the path is handed to a reader.
    with SoundScpWriter(tmp_path / "data", scp_path) as writer:
        writer["a"] = 16000, np.random.randint(-100, 100, (160000,), dtype=np.int16)
        writer["b"] = 16000, np.random.randint(-100, 100, (80000,), dtype=np.int16)
    return str(scp_path)