def test_NpyScpWriter(tmp_path: Path):
    array1 = np.random.randn(1)
    array2 = np.random.randn(1, 1, 10)
    with NpyScpWriter(tmp_path, tmp_path / "feats.scp") as writer:
        writer["abc"] = array1
        writer["def"] = array2
    target = NpyScpReader(tmp_path / "feats.scp")
    desired = {"abc": array1, "def": array2}

    for k in desired:
        t = target[k]
        d = desired[k]
        np.testing.assert_array_equal(t, d)

    assert writer.get_path("abc") == str(tmp_path / "abc.npy")
    assert writer.get_path("def") == str(tmp_path / "def.npy")
@pytest.fixture()
def npy_scp(tmp_path):
    p = tmp_path / "npy.scp"
    w = NpyScpWriter(tmp_path / "data", p)
    w["a"] = np.random.randn(100, 80)
    w["b"] = np.random.randn(150, 80)
    return str(p)
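# A minimal sketch, not part of the original test file, of how the npy_scp fixture
# above could be consumed, assuming it is registered as a pytest fixture and that
# NpyScpReader provides the Mapping-style access used in test_NpyScpWriter. The
# test name and the shape assertions are illustrative assumptions.
def test_npy_scp_reader_roundtrip(npy_scp):
    reader = NpyScpReader(npy_scp)
    assert set(reader.keys()) == {"a", "b"}
    for key in reader:
        array = reader[key]
        # The fixture wrote (100, 80) and (150, 80) arrays
        assert array.ndim == 2 and array.shape[1] == 80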
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    model_tag: Optional[str],
    allow_variable_data_keys: bool,
    segment_size: Optional[float],
    show_progressbar: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build diarize_speech
    diarize_speech_kwargs = dict(
        train_config=train_config,
        model_file=model_file,
        segment_size=segment_size,
        show_progressbar=show_progressbar,
        device=device,
        dtype=dtype,
    )
    diarize_speech = DiarizeSpeech.from_pretrained(
        model_tag=model_tag,
        **diarize_speech_kwargs,
    )

    # 3. Build data-iterator
    loader = DiarizationTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=DiarizationTask.build_preprocess_fn(
            diarize_speech.diar_train_args, False
        ),
        collate_fn=DiarizationTask.build_collate_fn(
            diarize_speech.diar_train_args, False
        ),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    writer = NpyScpWriter(f"{output_dir}/predictions", f"{output_dir}/diarize.scp")

    for keys, batch in loader:
        assert isinstance(batch, dict), type(batch)
        assert all(isinstance(s, str) for s in keys), keys
        _bs = len(next(iter(batch.values())))
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"

        batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
        spk_predictions = diarize_speech(**batch)
        for b in range(batch_size):
            writer[keys[b]] = spk_predictions[b]

    writer.close()
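# Illustrative driver, not in the original script: one way the diarization
# inference() above might be invoked programmatically. Only the parameter names
# come from the signature; every concrete value (paths, the "speech"/"sound"
# data type, the sampling rate) is a placeholder assumption.
def _example_diarization_run():
    inference(
        output_dir="exp/diar_decode",
        batch_size=1,
        dtype="float32",
        fs=8000,
        ngpu=0,
        seed=0,
        num_workers=1,
        log_level="INFO",
        data_path_and_name_and_type=[("dump/test/wav.scp", "speech", "sound")],
        key_file=None,
        train_config="exp/diar_train/config.yaml",
        model_file="exp/diar_train/valid.acc.best.pth",
        model_tag=None,
        allow_variable_data_keys=False,
        segment_size=None,
        show_progressbar=True,
    )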
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_teacher_forcing: bool,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    speed_control_alpha: float,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding."""
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    text2speech = Text2Speech(
        train_config=train_config,
        model_file=model_file,
        threshold=threshold,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        use_teacher_forcing=use_teacher_forcing,
        use_att_constraint=use_att_constraint,
        backward_window=backward_window,
        forward_window=forward_window,
        speed_control_alpha=speed_control_alpha,
        vocoder_conf=vocoder_conf,
        dtype=dtype,
        device=device,
    )

    # 3. Build data-iterator
    if not text2speech.use_speech:
        data_path_and_name_and_type = list(
            filter(lambda x: x[1] != "speech", data_path_and_name_and_type)
        )
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(text2speech.train_args, False),
        collate_fn=TTSTask.build_collate_fn(text2speech.train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    output_dir = Path(output_dir)
    (output_dir / "norm").mkdir(parents=True, exist_ok=True)
    (output_dir / "denorm").mkdir(parents=True, exist_ok=True)
    (output_dir / "speech_shape").mkdir(parents=True, exist_ok=True)
    (output_dir / "wav").mkdir(parents=True, exist_ok=True)
    (output_dir / "att_ws").mkdir(parents=True, exist_ok=True)
    (output_dir / "probs").mkdir(parents=True, exist_ok=True)
    (output_dir / "durations").mkdir(parents=True, exist_ok=True)
    (output_dir / "focus_rates").mkdir(parents=True, exist_ok=True)

    # Lazy load to avoid the backend error
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator

    with NpyScpWriter(
        output_dir / "norm",
        output_dir / "norm/feats.scp",
    ) as norm_writer, NpyScpWriter(
        output_dir / "denorm", output_dir / "denorm/feats.scp"
    ) as denorm_writer, open(
        output_dir / "speech_shape/speech_shape", "w"
    ) as shape_writer, open(
        output_dir / "durations/durations", "w"
    ) as duration_writer, open(
        output_dir / "focus_rates/focus_rates", "w"
    ) as focus_rate_writer:
        for idx, (keys, batch) in enumerate(loader, 1):
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert _bs == 1, _bs

            # Change to single sequence and remove *_length
            # because inference() requires 1-seq, not mini-batch.
            batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}

            start_time = time.perf_counter()
            wav, outs, outs_denorm, probs, att_ws, duration, focus_rate = text2speech(
                **batch
            )

            key = keys[0]
            insize = next(iter(batch.values())).size(0) + 1
            logging.info(
                "inference speed = {:.1f} frames / sec.".format(
                    int(outs.size(0)) / (time.perf_counter() - start_time)
                )
            )
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(f"output length reaches maximum length ({key}).")

            norm_writer[key] = outs.cpu().numpy()
            shape_writer.write(f"{key} " + ",".join(map(str, outs.shape)) + "\n")

            denorm_writer[key] = outs_denorm.cpu().numpy()

            if duration is not None:
                # Save duration and focus rates
                duration_writer.write(
                    f"{key} " + " ".join(map(str, duration.cpu().numpy())) + "\n"
                )
                focus_rate_writer.write(f"{key} {float(focus_rate):.5f}\n")

            if att_ws is not None:
                # Plot attention weight
                att_ws = att_ws.cpu().numpy()

                if att_ws.ndim == 2:
                    att_ws = att_ws[None][None]
                elif att_ws.ndim != 4:
                    raise RuntimeError(f"Must be 2 or 4 dimension: {att_ws.ndim}")

                w, h = plt.figaspect(att_ws.shape[0] / att_ws.shape[1])
                fig = plt.Figure(
                    figsize=(
                        w * 1.3 * min(att_ws.shape[0], 2.5),
                        h * 1.3 * min(att_ws.shape[1], 2.5),
                    )
                )
                fig.suptitle(f"{key}")
                axes = fig.subplots(att_ws.shape[0], att_ws.shape[1])
                if len(att_ws) == 1:
                    axes = [[axes]]
                for ax, att_w in zip(axes, att_ws):
                    for ax_, att_w_ in zip(ax, att_w):
                        ax_.imshow(att_w_.astype(np.float32), aspect="auto")
                        ax_.set_xlabel("Input")
                        ax_.set_ylabel("Output")
                        ax_.xaxis.set_major_locator(MaxNLocator(integer=True))
                        ax_.yaxis.set_major_locator(MaxNLocator(integer=True))

                fig.set_tight_layout({"rect": [0, 0.03, 1, 0.95]})
                fig.savefig(output_dir / f"att_ws/{key}.png")
                fig.clf()

            if probs is not None:
                # Plot stop token prediction
                probs = probs.cpu().numpy()

                fig = plt.Figure()
                ax = fig.add_subplot(1, 1, 1)
                ax.plot(probs)
                ax.set_title(f"{key}")
                ax.set_xlabel("Output")
                ax.set_ylabel("Stop probability")
                ax.set_ylim(0, 1)
                ax.grid(which="both")

                fig.set_tight_layout(True)
                fig.savefig(output_dir / f"probs/{key}.png")
                fig.clf()

            # TODO(kamo): Write scp
            if wav is not None:
                sf.write(
                    f"{output_dir}/wav/{key}.wav", wav.numpy(), text2speech.fs, "PCM_16"
                )

    # remove duration related files if attention is not provided
    if att_ws is None:
        shutil.rmtree(output_dir / "att_ws")
        shutil.rmtree(output_dir / "durations")
        shutil.rmtree(output_dir / "focus_rates")
    if probs is None:
        shutil.rmtree(output_dir / "probs")
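# Post-hoc sanity check, not part of the original script: read back the
# normalized features written by the loop above and compare them against the
# recorded speech_shape entries. It reuses NpyScpReader from the tests above and
# assumes the "<key> d1,d2,..." format produced by shape_writer; the helper name
# is illustrative.
def _verify_tts_outputs(output_dir: Path):
    reader = NpyScpReader(output_dir / "norm" / "feats.scp")
    with (output_dir / "speech_shape" / "speech_shape").open() as f:
        for line in f:
            key, shape_str = line.rstrip("\n").split(" ", 1)
            expected = tuple(int(x) for x in shape_str.split(","))
            assert reader[key].shape == expected, (key, reader[key].shape, expected)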
def collect_stats(
    model: AbsESPnetModel,
    train_iter: DataLoader and Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
    valid_iter: DataLoader and Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
    output_dir: Path,
    ngpu: Optional[int],
    log_interval: Optional[int],
    write_collected_feats: bool,
) -> None:
    """Perform on collect_stats mode.

    Running for deriving the shape information from data
    and gathering statistics.
    This method is used before executing train().
    """
    assert check_argument_types()

    npy_scp_writers = {}
    for itr, mode in zip([train_iter, valid_iter], ["train", "valid"]):
        if log_interval is None:
            try:
                log_interval = max(len(itr) // 20, 10)
            except TypeError:
                log_interval = 100

        sum_dict = defaultdict(lambda: 0)
        sq_dict = defaultdict(lambda: 0)
        count_dict = defaultdict(lambda: 0)

        with DatadirWriter(output_dir / mode) as datadir_writer:
            for iiter, (keys, batch) in enumerate(itr, 1):
                batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")

                # 1. Write shape file
                for name in batch:
                    if name.endswith("_lengths"):
                        continue
                    for i, (key, data) in enumerate(zip(keys, batch[name])):
                        if f"{name}_lengths" in batch:
                            lg = int(batch[f"{name}_lengths"][i])
                            data = data[:lg]
                        datadir_writer[f"{name}_shape"][key] = ",".join(
                            map(str, data.shape)
                        )

                # 2. Extract feats
                if ngpu <= 1:
                    data = model.collect_feats(**batch)
                else:
                    # Note that data_parallel can parallelize only "forward()"
                    data = data_parallel(
                        ForwardAdaptor(model, "collect_feats"),
                        (),
                        range(ngpu),
                        module_kwargs=batch,
                    )

                # 3. Calculate sum and square sum
                for key, v in data.items():
                    for i, (uttid, seq) in enumerate(zip(keys, v.cpu().numpy())):
                        # Truncate zero-padding region
                        if f"{key}_lengths" in data:
                            length = data[f"{key}_lengths"][i]
                            # seq: (Length, Dim, ...)
                            seq = seq[:length]
                        else:
                            # seq: (Dim, ...) -> (1, Dim, ...)
                            seq = seq[None]
                        # Accumulate value, its square, and count
                        sum_dict[key] += seq.sum(0)
                        sq_dict[key] += (seq ** 2).sum(0)
                        count_dict[key] += len(seq)

                        # 4. [Option] Write derived features as npy format file.
                        if write_collected_feats:
                            # Instantiate NpyScpWriter for the first iteration
                            if (key, mode) not in npy_scp_writers:
                                p = output_dir / mode / "collect_feats"
                                npy_scp_writers[(key, mode)] = NpyScpWriter(
                                    p / f"data_{key}", p / f"{key}.scp"
                                )
                            # Save array as npy file
                            npy_scp_writers[(key, mode)][uttid] = seq

                if iiter % log_interval == 0:
                    logging.info(f"Niter: {iiter}")

        for key in sum_dict:
            np.savez(
                output_dir / mode / f"{key}_stats.npz",
                count=count_dict[key],
                sum=sum_dict[key],
                sum_square=sq_dict[key],
            )

        # batch_keys and stats_keys are used by aggregate_stats_dirs.py
        with (output_dir / mode / "batch_keys").open("w", encoding="utf-8") as f:
            f.write(
                "\n".join(filter(lambda x: not x.endswith("_lengths"), batch)) + "\n"
            )
        with (output_dir / mode / "stats_keys").open("w", encoding="utf-8") as f:
            f.write("\n".join(sum_dict) + "\n")
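# Sketch of a downstream consumer, not in the original module: converting the
# count/sum/sum_square arrays saved by np.savez above into a mean and standard
# deviation, the quantities a global-MVN style normalizer would typically need.
# The key names mirror the np.savez call; the helper name and the variance floor
# are illustrative assumptions.
def _load_mean_std(stats_file: Path):
    stats = np.load(stats_file)
    count = stats["count"]
    mean = stats["sum"] / count
    var = stats["sum_square"] / count - mean * mean
    # Floor the variance to keep sqrt numerically safe
    std = np.sqrt(np.maximum(var, 1.0e-20))
    return mean, std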
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding."""
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    model, train_args = TTSTask.build_model_from_file(train_config, model_file, device)
    model.to(dtype=getattr(torch, dtype)).eval()
    tts = model.tts
    normalize = model.normalize
    logging.info(f"Normalization:\n{normalize}")
    logging.info(f"TTS:\n{tts}")

    # 3. Build data-iterator
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(train_args, False),
        collate_fn=TTSTask.build_collate_fn(train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Build converter from spectrogram to waveform
    if model.feats_extract is not None:
        vocoder_conf.update(model.feats_extract.get_parameters())
    if "n_fft" in vocoder_conf and "n_shift" in vocoder_conf and "fs" in vocoder_conf:
        spc2wav = Spectrogram2Waveform(**vocoder_conf)
        logging.info(f"Vocoder: {spc2wav}")
    else:
        spc2wav = None
        logging.info("Vocoder is not used because vocoder_conf is not sufficient")

    # 5. Start for-loop
    output_dir = Path(output_dir)
    (output_dir / "norm").mkdir(parents=True, exist_ok=True)
    (output_dir / "denorm").mkdir(parents=True, exist_ok=True)
    (output_dir / "wav").mkdir(parents=True, exist_ok=True)
    (output_dir / "att_ws").mkdir(parents=True, exist_ok=True)
    (output_dir / "probs").mkdir(parents=True, exist_ok=True)

    with NpyScpWriter(
        output_dir / "norm",
        output_dir / "norm/feats.scp",
    ) as f, NpyScpWriter(
        output_dir / "denorm", output_dir / "denorm/feats.scp"
    ) as g:
        for idx, (keys, batch) in enumerate(loader, 1):
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = to_device(batch, device)

            key = keys[0]
            # Change to single sequence and remove *_length
            # because inference() requires 1-seq, not mini-batch.
            _data = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}

            start_time = time.perf_counter()

            _decode_conf = {
                "threshold": threshold,
                "maxlenratio": maxlenratio,
                "minlenratio": minlenratio,
            }
            if isinstance(tts, Tacotron2):
                _decode_conf.update(
                    {
                        "use_att_constraint": use_att_constraint,
                        "forward_window": forward_window,
                        "backward_window": backward_window,
                    }
                )
            outs, probs, att_ws = tts.inference(**_data, **_decode_conf)

            insize = next(iter(_data.values())).size(0) + 1
            logging.info(
                "inference speed = {:.1f} frames / sec.".format(
                    int(outs.size(0)) / (time.perf_counter() - start_time)
                )
            )
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(f"output length reaches maximum length ({key}).")
            f[key] = outs.cpu().numpy()

            # NOTE: normalize.inverse is in-place operation
            outs_denorm = normalize.inverse(outs[None])[0][0]
            g[key] = outs_denorm.cpu().numpy()

            # Lazy load to avoid the backend error
            matplotlib.use("Agg")
            import matplotlib.pyplot as plt
            from matplotlib.ticker import MaxNLocator

            # Plot attention weight
            att_ws = att_ws.cpu().numpy()

            if att_ws.ndim == 2:
                att_ws = att_ws[None][None]
            elif att_ws.ndim != 4:
                raise RuntimeError(f"Must be 2 or 4 dimension: {att_ws.ndim}")

            w, h = plt.figaspect(att_ws.shape[0] / att_ws.shape[1])
            fig = plt.Figure(
                figsize=(
                    w * 1.3 * min(att_ws.shape[0], 2.5),
                    h * 1.3 * min(att_ws.shape[1], 2.5),
                )
            )
            fig.suptitle(f"{key}")
            axes = fig.subplots(att_ws.shape[0], att_ws.shape[1])
            if len(att_ws) == 1:
                axes = [[axes]]
            for ax, att_w in zip(axes, att_ws):
                for ax_, att_w_ in zip(ax, att_w):
                    ax_.imshow(att_w_.astype(np.float32), aspect="auto")
                    ax_.set_xlabel("Input")
                    ax_.set_ylabel("Output")
                    ax_.xaxis.set_major_locator(MaxNLocator(integer=True))
                    ax_.yaxis.set_major_locator(MaxNLocator(integer=True))

            fig.tight_layout(rect=[0, 0.03, 1, 0.95])
            fig.savefig(output_dir / f"att_ws/{key}.png")
            fig.clf()

            # Plot stop token prediction
            probs = probs.cpu().numpy()

            fig = plt.Figure()
            ax = fig.add_subplot(1, 1, 1)
            ax.plot(probs)
            ax.set_title(f"{key}")
            ax.set_xlabel("Output")
            ax.set_ylabel("Stop probability")
            ax.set_ylim(0, 1)
            ax.grid(which="both")

            fig.tight_layout()
            fig.savefig(output_dir / f"probs/{key}.png")
            fig.clf()

            # TODO(kamo): Write scp
            if spc2wav is not None:
                wav = spc2wav(outs_denorm.cpu().numpy())
                sf.write(f"{output_dir}/wav/{key}.wav", wav, spc2wav.fs, "PCM_16")
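# Illustrative vocoder_conf, not from the original file: the branch above only
# builds Spectrogram2Waveform when "fs", "n_fft", and "n_shift" are all present
# (some of these usually arrive via model.feats_extract.get_parameters()). The
# concrete values below are placeholder assumptions for a 22.05 kHz setup; any
# extra keys in the dict are forwarded to Spectrogram2Waveform unchanged.
example_vocoder_conf = {
    "fs": 22050,
    "n_fft": 1024,
    "n_shift": 256,
}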