def test_SoundScpWriter_normalize(tmp_path: Path):
    """Round-trip float audio through an int16 writer and a normalizing reader.

    Random int16 samples are scaled to float64 by ``int16 max + 1``, written
    with ``dtype=np.int16`` and read back with ``normalize=True``; the values
    read must equal the float arrays that were written.
    """
    int16_scale = np.iinfo(np.int16).max + 1
    raw1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    raw2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio1 = raw1.astype(np.float64) / int16_scale
    audio2 = raw2.astype(np.float64) / int16_scale
    scp_path = tmp_path / "wav.scp"

    with SoundScpWriter(tmp_path, scp_path, dtype=np.int16) as writer:
        writer["abc"] = 16, audio1
        writer["def"] = 16, audio2
        # A 3-dimensional array is an unsupported shape and must be rejected.
        with pytest.raises(RuntimeError):
            y = np.random.randint(-100, 100, [16, 1, 1], dtype=np.int16)
            writer["ghi"] = 16, y

    target = SoundScpReader(scp_path, normalize=True, dtype=np.float64)
    desired = {"abc": (16, audio1), "def": (16, audio2)}

    for key, (expected_rate, expected_wave) in desired.items():
        actual_rate, actual_wave = target[key]
        assert actual_rate == expected_rate
        np.testing.assert_array_equal(actual_wave, expected_wave)
def test_SoundScpWriter(tmp_path: Path):
    """Round-trip int16 audio through SoundScpWriter and SoundScpReader.

    Also checks that a 3-dimensional array is rejected and that
    ``get_path`` reports ``<tmp_path>/<key>.wav`` for each written key.
    """
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    scp_path = tmp_path / "wav.scp"

    with SoundScpWriter(tmp_path, scp_path, dtype=np.int16) as writer:
        writer["abc"] = 16, audio1
        writer["def"] = 16, audio2
        # A 3-dimensional array is an unsupported shape and must be rejected.
        with pytest.raises(RuntimeError):
            y = np.random.randint(-100, 100, [16, 1, 1], dtype=np.int16)
            writer["ghi"] = 16, y

    target = SoundScpReader(scp_path, normalize=False, dtype=np.int16)
    desired = {"abc": (16, audio1), "def": (16, audio2)}

    for key, (expected_rate, expected_wave) in desired.items():
        actual_rate, actual_wave = target[key]
        assert actual_rate == expected_rate
        np.testing.assert_array_equal(actual_wave, expected_wave)

    for key in ("abc", "def"):
        assert writer.get_path(key) == str(tmp_path / f"{key}.wav")
def sound_scp(tmp_path):
    """Create a small scp of random audio and return its path as a string.

    Two utterances, ``"a"`` (160000 samples) and ``"b"`` (80000 samples), are
    written with rate 16000 as random int16 data under ``tmp_path / "data"``
    and indexed by ``tmp_path / "wav.scp"``.

    Args:
        tmp_path: Directory in which the scp file and the audio are created.

    Returns:
        ``str`` path of the generated ``wav.scp``.
    """
    p = tmp_path / "wav.scp"
    # Use the writer as a context manager so that it is closed before the
    # path is handed to a consumer; previously the writer was never closed.
    with SoundScpWriter(tmp_path / "data", p) as w:
        w["a"] = 16000, np.random.randint(-100, 100, (160000,), dtype=np.int16)
        w["b"] = 16000, np.random.randint(-100, 100, (80000,), dtype=np.int16)
    return str(p)
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    model_tag: Optional[str],
    inference_config: Optional[str],
    allow_variable_data_keys: bool,
    segment_size: Optional[float],
    hop_size: Optional[float],
    normalize_segment_scale: bool,
    show_progressbar: bool,
    ref_channel: Optional[int],
    normalize_output_wav: bool,
    enh_s2t_task: bool,
):
    """Enhance every utterance of a dataset and write one wav set per speaker.

    A ``SeparateSpeech`` object is built with ``from_pretrained``, the data is
    iterated with ``EnhancementTask.build_streaming_iterator`` and each output
    stream is stored through a ``SoundScpWriter`` in
    ``<output_dir>/wavs/<n>`` indexed by ``<output_dir>/spk<n>.scp``, with
    ``n`` counted from 1.

    Args:
        output_dir: Destination directory (user-expanded and resolved).
        batch_size: Batch size for the data iterator; values above 1 are
            rejected.
        dtype: Forwarded to both ``SeparateSpeech`` and the data iterator.
        fs: Sampling rate stored together with each written waveform.
        ngpu: ``"cuda"`` is used when >= 1, otherwise ``"cpu"``; values above
            1 are rejected.
        seed: Passed to ``set_all_random_seed``.
        num_workers: Forwarded to the data iterator.
        log_level: Level passed to ``logging.basicConfig``.
        data_path_and_name_and_type: Forwarded to the data iterator.
        key_file: Forwarded to the data iterator.
        train_config, model_file, model_tag, inference_config, segment_size,
        hop_size, normalize_segment_scale, show_progressbar, ref_channel,
        normalize_output_wav, enh_s2t_task: Forwarded unchanged to
            ``SeparateSpeech.from_pretrained``.
        allow_variable_data_keys: Forwarded to the data iterator.

    Raises:
        NotImplementedError: If ``batch_size > 1`` or ``ngpu > 1``.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build separate_speech
    separate_speech_kwargs = dict(
        train_config=train_config,
        model_file=model_file,
        inference_config=inference_config,
        segment_size=segment_size,
        hop_size=hop_size,
        normalize_segment_scale=normalize_segment_scale,
        show_progressbar=show_progressbar,
        ref_channel=ref_channel,
        normalize_output_wav=normalize_output_wav,
        device=device,
        dtype=dtype,
        enh_s2t_task=enh_s2t_task,
    )
    separate_speech = SeparateSpeech.from_pretrained(
        model_tag=model_tag,
        **separate_speech_kwargs,
    )

    # 3. Build data-iterator
    loader = EnhancementTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=EnhancementTask.build_preprocess_fn(
            separate_speech.enh_train_args, False
        ),
        collate_fn=EnhancementTask.build_collate_fn(
            separate_speech.enh_train_args, False
        ),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    output_dir = Path(output_dir).expanduser().resolve()
    writers = []
    # The writers are closed in ``finally`` so that they are released even
    # when constructing a later writer or decoding an utterance fails.
    try:
        for spk_no in range(1, separate_speech.num_spk + 1):
            writers.append(
                SoundScpWriter(
                    f"{output_dir}/wavs/{spk_no}", f"{output_dir}/spk{spk_no}.scp"
                )
            )

        for i, (keys, batch) in enumerate(loader):
            logging.info(f"[{i}] Enhancing {keys}")
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            # Entries whose key ends with "_lengths" are not passed on.
            batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}

            waves = separate_speech(**batch)
            for spk, w in enumerate(waves):
                for b in range(batch_size):
                    writers[spk][keys[b]] = fs, w[b]
    finally:
        for writer in writers:
            writer.close()
def _resample_to_int16(wave, rate, target_fs):
    """Resample ``wave`` to ``target_fs`` when it is set and differs from ``rate``.

    Returns the ``(wave, rate)`` pair; it is returned untouched when no
    resampling is needed, otherwise the wave is cast back to ``np.int16``.
    """
    if target_fs is not None and target_fs != rate:
        # FIXME(kamo): To use sox?
        wave = resampy.resample(wave.astype(np.float64), rate, target_fs, axis=0)
        wave = wave.astype(np.int16)
        rate = target_fs
    return wave, rate


def _ark_write_function(audio_format):
    """Return the kaldiio write-function name for an ``*ark`` audio format."""
    if "flac" in audio_format:
        suf = "flac"
    elif "wav" in audio_format:
        suf = "wav"
    else:
        raise RuntimeError("wav.ark or flac")
    return f"soundfile_{suf}"


def _save_to_ark(fark, fscp, uttid, wave, rate, audio_format):
    """Append one utterance to the ark file ``fark`` and index it in ``fscp``."""
    # NOTE(kamo): Using extended ark format style here.
    # This format is incompatible with Kaldi
    kaldiio.save_ark(
        fark,
        {uttid: (wave, rate)},
        scp=fscp,
        append=True,
        write_function=_ark_write_function(audio_format),
    )


def main():
    """Command-line entry point: create a wave list from a "wav.scp".

    Reads the positional ``scp`` file (optionally cut by ``--segments``),
    optionally selects reference channels and resamples to ``--fs``, then
    writes the audio in ``--audio-format`` under ``outdir`` together with
    ``<name>.scp`` and ``utt2num_samples``.

    Every output handle (ark file, scp file, ``SoundScpWriter``) is registered
    on a single ``ExitStack`` so that it is closed when the conversion
    finishes or fails; previously the ark file, the segment-mode scp file and
    the ``SoundScpWriter`` were never closed.

    Raises:
        RuntimeError: If ``--audio-format`` ends with "ark" but contains
            neither "flac" nor "wav" (raised when the first utterance is
            saved).
    """
    # Imported locally so this block does not depend on a file-level import.
    from contextlib import ExitStack

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=logfmt)
    logging.info(get_commandline_args())

    parser = argparse.ArgumentParser(
        description='Create waves list from "wav.scp"',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("scp")
    parser.add_argument("outdir")
    parser.add_argument(
        "--name",
        default="wav",
        help="Specify the prefix word of output file name " 'such as "wav.scp"',
    )
    parser.add_argument("--segments", default=None)
    parser.add_argument(
        "--fs",
        type=humanfriendly_or_none,
        default=None,
        help="If the sampling rate specified, " "Change the sampling rate.",
    )
    parser.add_argument("--audio-format", default="wav")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--ref-channels", default=None, type=str2int_tuple)
    group.add_argument("--utt2ref-channels", default=None, type=str)
    args = parser.parse_args()

    out_num_samples = Path(args.outdir) / "utt2num_samples"

    if args.ref_channels is not None:

        def utt2ref_channels(x) -> Tuple[int, ...]:
            return args.ref_channels

    elif args.utt2ref_channels is not None:
        utt2ref_channels_dict = read_2column_text(args.utt2ref_channels)

        def utt2ref_channels(x, d=utt2ref_channels_dict) -> Tuple[int, ...]:
            chs_str = d[x]
            return tuple(map(int, chs_str.split()))

    else:
        utt2ref_channels = None

    Path(args.outdir).mkdir(parents=True, exist_ok=True)
    out_wavscp = Path(args.outdir) / f"{args.name}.scp"
    is_ark = args.audio_format.endswith("ark")

    with ExitStack() as stack:
        if args.segments is not None:
            # Note: kaldiio supports only wav-pcm-int16le file.
            loader = kaldiio.load_scp_sequential(args.scp, segments=args.segments)
            if is_ark:
                fark = stack.enter_context(
                    open(Path(args.outdir) / f"data_{args.name}.ark", "wb")
                )
                fscp = stack.enter_context(out_wavscp.open("w"))
            else:
                writer = stack.enter_context(
                    SoundScpWriter(
                        args.outdir,
                        out_wavscp,
                        format=args.audio_format,
                    )
                )
            fnum_samples = stack.enter_context(out_num_samples.open("w"))

            for uttid, (rate, wave) in tqdm(loader):
                # wave: (Time,) or (Time, Nmic)
                if wave.ndim == 2 and utt2ref_channels is not None:
                    wave = wave[:, utt2ref_channels(uttid)]

                wave, rate = _resample_to_int16(wave, rate, args.fs)
                if is_ark:
                    _save_to_ark(fark, fscp, uttid, wave, rate, args.audio_format)
                else:
                    writer[uttid] = rate, wave
                fnum_samples.write(f"{uttid} {len(wave)}\n")
        else:
            if is_ark:
                fark = stack.enter_context(
                    open(Path(args.outdir) / f"data_{args.name}.ark", "wb")
                )
            else:
                wavdir = Path(args.outdir) / f"data_{args.name}"
                wavdir.mkdir(parents=True, exist_ok=True)

            fscp = stack.enter_context(Path(args.scp).open("r"))
            fout = stack.enter_context(out_wavscp.open("w"))
            fnum_samples = stack.enter_context(out_num_samples.open("w"))

            for line in tqdm(fscp):
                uttid, wavpath = line.strip().split(None, 1)

                if wavpath.endswith("|"):
                    # Streaming input e.g. cat a.wav |
                    # The pipe is closed as soon as its bytes have been read.
                    with kaldiio.open_like_kaldi(wavpath, "rb") as f:
                        with BytesIO(f.read()) as g:
                            wave, rate = soundfile.read(g, dtype=np.int16)
                    if wave.ndim == 2 and utt2ref_channels is not None:
                        wave = wave[:, utt2ref_channels(uttid)]
                    save_asis = False
                else:
                    wave, rate = soundfile.read(wavpath, dtype=np.int16)
                    if wave.ndim == 2 and utt2ref_channels is not None:
                        wave = wave[:, utt2ref_channels(uttid)]
                        save_asis = False
                    else:
                        # Neither --segments nor --fs are specified and
                        # the line doesn't end with "|",
                        # i.e. not using unix-pipe,
                        # only in this case,
                        # just using the original file as is.
                        save_asis = (
                            not is_ark
                            and Path(wavpath).suffix == "." + args.audio_format
                            and (args.fs is None or args.fs == rate)
                        )

                if save_asis:
                    fout.write(f"{uttid} {wavpath}\n")
                else:
                    wave, rate = _resample_to_int16(wave, rate, args.fs)
                    if is_ark:
                        _save_to_ark(
                            fark, fout, uttid, wave, rate, args.audio_format
                        )
                    else:
                        owavpath = str(wavdir / f"{uttid}.{args.audio_format}")
                        soundfile.write(owavpath, wave, rate)
                        fout.write(f"{uttid} {owavpath}\n")
                fnum_samples.write(f"{uttid} {len(wave)}\n")
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    model_tag: Optional[str],
    allow_variable_data_keys: bool,
    segment_size: Optional[float],
    hop_size: Optional[float],
    normalize_segment_scale: bool,
    show_progressbar: bool,
    num_spk: Optional[int],
    normalize_output_wav: bool,
    multiply_diar_result: bool,
    enh_s2t_task: bool,
):
    """Run diarization over a dataset and store the per-utterance predictions.

    A ``DiarizeSpeech`` object is built with ``from_pretrained`` and applied
    to every batch from ``DiarizationTask.build_streaming_iterator``. The
    predictions go through an ``NpyScpWriter`` to ``<output_dir>/predictions``
    indexed by ``<output_dir>/diarize.scp``. When ``enh_s2t_task`` is true the
    model additionally returns waveforms, which are written per speaker to
    ``<output_dir>/wavs/<n>`` indexed by ``<output_dir>/spk<n>.scp``.

    Args:
        output_dir: Destination directory for predictions and waveforms.
        batch_size: Batch size for the data iterator; values above 1 are
            rejected.
        dtype: Forwarded to both ``DiarizeSpeech`` and the data iterator.
        fs: Sampling rate stored together with each written waveform.
        ngpu: ``"cuda"`` is used when >= 1, otherwise ``"cpu"``; values above
            1 are rejected.
        seed: Passed to ``set_all_random_seed``.
        num_workers: Forwarded to the data iterator.
        log_level: Level passed to ``logging.basicConfig``.
        data_path_and_name_and_type: Forwarded to the data iterator.
        key_file: Forwarded to the data iterator.
        train_config, model_file, model_tag, segment_size, hop_size,
        normalize_segment_scale, show_progressbar, num_spk,
        normalize_output_wav, multiply_diar_result: Forwarded unchanged to
            ``DiarizeSpeech.from_pretrained``.
        allow_variable_data_keys: Forwarded to the data iterator.
        enh_s2t_task: Forwarded to ``DiarizeSpeech``; also enables writing of
            the returned waveforms.

    Raises:
        NotImplementedError: If ``batch_size > 1`` or ``ngpu > 1``.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build separate_speech
    diarize_speech_kwargs = dict(
        train_config=train_config,
        model_file=model_file,
        segment_size=segment_size,
        hop_size=hop_size,
        normalize_segment_scale=normalize_segment_scale,
        show_progressbar=show_progressbar,
        normalize_output_wav=normalize_output_wav,
        num_spk=num_spk,
        device=device,
        dtype=dtype,
        multiply_diar_result=multiply_diar_result,
        enh_s2t_task=enh_s2t_task,
    )
    diarize_speech = DiarizeSpeech.from_pretrained(
        model_tag=model_tag,
        **diarize_speech_kwargs,
    )

    # 3. Build data-iterator
    loader = DiarizationTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=DiarizationTask.build_preprocess_fn(
            diarize_speech.diar_train_args, False
        ),
        collate_fn=DiarizationTask.build_collate_fn(
            diarize_speech.diar_train_args, False
        ),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    writer = NpyScpWriter(f"{output_dir}/predictions", f"{output_dir}/diarize.scp")
    wav_writers = []
    # All writers are closed in ``finally`` so that they are released even
    # when constructing a wav writer or decoding an utterance fails.
    try:
        if enh_s2t_task:
            # Use the model's speaker count when it is set; otherwise fall
            # back to the mask module's maximum number of speakers.
            if diarize_speech.num_spk is not None:
                num_wav_writers = diarize_speech.num_spk
            else:
                num_wav_writers = diarize_speech.diar_model.mask_module.max_num_spk
            for spk_no in range(1, num_wav_writers + 1):
                wav_writers.append(
                    SoundScpWriter(
                        f"{output_dir}/wavs/{spk_no}",
                        f"{output_dir}/spk{spk_no}.scp",
                    )
                )

        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            # Entries whose key ends with "_lengths" are not passed on.
            batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}

            if enh_s2t_task:
                waves, spk_predictions = diarize_speech(**batch)
                for b in range(batch_size):
                    writer[keys[b]] = spk_predictions[b]
                    for spk, w in enumerate(waves):
                        wav_writers[spk][keys[b]] = fs, w[b]
            else:
                spk_predictions = diarize_speech(**batch)
                for b in range(batch_size):
                    writer[keys[b]] = spk_predictions[b]
    finally:
        for w in wav_writers:
            w.close()
        writer.close()
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    enh_train_config: str,
    enh_model_file: str,
    allow_variable_data_keys: bool,
    normalize_output_wav: bool,
):
    """Run the enhancement frontend of a model and write one wav set per speaker.

    The model is loaded with ``EnhancementTask.build_model_from_file``, every
    batch is passed through ``enh_model.enh_model.forward_rawwav`` under
    ``torch.no_grad()`` and each returned stream is written through a
    ``SoundScpWriter`` to ``<output_dir>/wavs/<n>`` indexed by
    ``<output_dir>/spk<n>.scp``, with ``n`` counted from 1.

    Args:
        output_dir: Destination directory for waveforms and scp files.
        batch_size: Batch size for the data iterator; values above 1 are
            rejected.
        dtype: Forwarded to the data iterator.
        fs: Sampling rate stored together with each written waveform.
        ngpu: ``"cuda"`` is used when >= 1, otherwise ``"cpu"``; values above
            1 are rejected.
        seed: Passed to ``set_all_random_seed``.
        num_workers: Forwarded to the data iterator.
        log_level: Level passed to ``logging.basicConfig``.
        data_path_and_name_and_type: Forwarded to the data iterator.
        key_file: Forwarded to the data iterator.
        enh_train_config: Passed to ``build_model_from_file``.
        enh_model_file: Passed to ``build_model_from_file``.
        allow_variable_data_keys: Forwarded to the data iterator.
        normalize_output_wav: If true, each output is divided by its maximum
            absolute value along dim 1 and multiplied by 0.9 before writing.

    Raises:
        NotImplementedError: If ``batch_size > 1`` or ``ngpu > 1``.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build Enh model
    enh_model, enh_train_args = EnhancementTask.build_model_from_file(
        enh_train_config, enh_model_file, device
    )
    enh_model.eval()

    num_spk = enh_model.num_spk

    # 3. Build data-iterator
    loader = EnhancementTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=EnhancementTask.build_preprocess_fn(enh_train_args, False),
        collate_fn=EnhancementTask.build_collate_fn(enh_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    writers = []
    # The writers are closed in ``finally`` so that they are released even
    # when constructing a later writer or decoding an utterance fails.
    try:
        for spk_no in range(1, num_spk + 1):
            writers.append(
                SoundScpWriter(
                    f"{output_dir}/wavs/{spk_no}", f"{output_dir}/spk{spk_no}.scp"
                )
            )

        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                # a. To device
                batch = to_device(batch, device)
                # b. Forward Enhancement Frontend
                waves, _, _ = enh_model.enh_model.forward_rawwav(
                    batch["speech_mix"], batch["speech_mix_lengths"]
                )
                assert len(waves[0]) == batch_size, len(waves[0])

            # FIXME(Chenda): will be incorrect when
            #  batch size is not 1 or multi-channel case
            if normalize_output_wav:
                waves = [
                    (w / abs(w).max(dim=1, keepdim=True)[0] * 0.9).T.cpu().numpy()
                    for w in waves
                ]  # list[(sample,batch)]
            else:
                waves = [w.T.cpu().numpy() for w in waves]
            # Only the first key is used: batch_size is limited to 1 above.
            for i, w in enumerate(waves):
                writers[i][keys[0]] = fs, w
    finally:
        for writer in writers:
            writer.close()