def __call__(
    self,
    text: Union[str, torch.Tensor, np.ndarray],
    speech: Union[torch.Tensor, np.ndarray] = None,
    durations: Union[torch.Tensor, np.ndarray] = None,
    spembs: Union[torch.Tensor, np.ndarray] = None,
    sids: Union[torch.Tensor, np.ndarray] = None,
    lids: Union[torch.Tensor, np.ndarray] = None,
    decode_conf: Optional[Dict[str, Any]] = None,
) -> Dict[str, torch.Tensor]:
    """Run text-to-speech."""
    assert check_argument_types()

    # check inputs
    if self.use_speech and speech is None:
        raise RuntimeError("missing required argument: 'speech'")

    # prepare batch
    if isinstance(text, str):
        text = self.preprocess_fn("<dummy>", dict(text=text))["text"]
    batch = dict(text=text)
    if speech is not None:
        batch.update(speech=speech)
    if durations is not None:
        batch.update(durations=durations)
    if spembs is not None:
        batch.update(spembs=spembs)
    if sids is not None:
        batch.update(sids=sids)
    if lids is not None:
        batch.update(lids=lids)
    batch = to_device(batch, self.device)

    # overwrite the decode configs if provided
    cfg = self.decode_conf
    if decode_conf is not None:
        cfg = self.decode_conf.copy()
        cfg.update(decode_conf)

    # inference
    if self.always_fix_seed:
        set_all_random_seed(self.seed)
    output_dict = self.model.inference(**batch, **cfg)

    # calculate additional metrics
    if output_dict.get("att_w") is not None:
        duration, focus_rate = self.duration_calculator(output_dict["att_w"])
        output_dict.update(duration=duration, focus_rate=focus_rate)

    # apply vocoder (mel-to-wav)
    if self.vocoder is not None:
        if output_dict.get("feat_gen_denorm") is not None:
            input_feat = output_dict["feat_gen_denorm"]
        else:
            input_feat = output_dict["feat_gen"]
        wav = self.vocoder(input_feat)
        output_dict.update(wav=wav)

    return output_dict
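# Hedged usage sketch (added): a minimal example of calling the __call__ above.
# The import path assumes this method belongs to
# espnet2.bin.tts_inference.Text2Speech; "config.yaml" and "model.pth" are
# placeholder paths for a trained ESPnet2 TTS model.
def example_text2speech_usage():
    from espnet2.bin.tts_inference import Text2Speech

    tts = Text2Speech(train_config="config.yaml", model_file="model.pth")
    output = tts("Hello world")
    feats = output["feat_gen"]  # generated features
    wav = output.get("wav")  # present only when a vocoder is attached
    return feats, wav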
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    blank_symbol: str,
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device)
    asr_model.eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 3. Build language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device)
        scorers["lm"] = lm.lm

    # 4. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
    )
    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(asr_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 6. [Optional] Build text converter: e.g. bpe-sym -> text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    # 7. Start for-loop
    # FIXME(kamo): The output format should be discussed
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                # a. To device
                batch = to_device(batch, device)

                # b. Forward the encoder
                enc, _ = asr_model.encode(**batch)
                assert len(enc) == batch_size, len(enc)

                # c. Pass the encoder result to the beam search
                nbest_hyps = beam_search(
                    x=enc[0], maxlenratio=maxlenratio, minlenratio=minlenratio)
                nbest_hyps = nbest_hyps[:nbest]

            # Only supporting batch_size==1
            key = keys[0]
            for n in range(1, nbest + 1):
                hyp = nbest_hyps[n - 1]
                assert isinstance(hyp, Hypothesis), type(hyp)

                # remove sos/eos and get results
                token_int = hyp.yseq[1:-1].tolist()

                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0, token_int))

                # change integer-ids to tokens
                token = converter.ids2tokens(token_int)

                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each file
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if tokenizer is not None:
                    text = tokenizer.tokens2text(token)
                    ibest_writer["text"][key] = text
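# Hedged sketch (added): the hypothesis post-processing used in the loop above,
# isolated as a helper. It assumes yseq is [sos, t1, ..., tn, eos] and that the
# blank id is 0, exactly as the loop does; the helper name is hypothetical.
def _postprocess_hypothesis(hyp, converter, tokenizer=None):
    # strip sos/eos, then drop blank ids
    token_int = [t for t in hyp.yseq[1:-1].tolist() if t != 0]
    token = converter.ids2tokens(token_int)
    text = tokenizer.tokens2text(token) if tokenizer is not None else None
    return text, token, token_int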
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    model_tag: Optional[str],
    allow_variable_data_keys: bool,
    segment_size: Optional[float],
    show_progressbar: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build diarize_speech
    diarize_speech_kwargs = dict(
        train_config=train_config,
        model_file=model_file,
        segment_size=segment_size,
        show_progressbar=show_progressbar,
        device=device,
        dtype=dtype,
    )
    diarize_speech = DiarizeSpeech.from_pretrained(
        model_tag=model_tag,
        **diarize_speech_kwargs,
    )

    # 3. Build data-iterator
    loader = DiarizationTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=DiarizationTask.build_preprocess_fn(
            diarize_speech.diar_train_args, False),
        collate_fn=DiarizationTask.build_collate_fn(
            diarize_speech.diar_train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    writer = NpyScpWriter(f"{output_dir}/predictions",
                          f"{output_dir}/diarize.scp")

    for keys, batch in loader:
        assert isinstance(batch, dict), type(batch)
        assert all(isinstance(s, str) for s in keys), keys
        _bs = len(next(iter(batch.values())))
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"
        batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}

        spk_predictions = diarize_speech(**batch)
        for b in range(batch_size):
            writer[keys[b]] = spk_predictions[b]

    writer.close()
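# Hedged sketch (added): reading back the predictions written above.
# NpyScpReader is assumed to be the reader counterpart of NpyScpWriter in
# espnet2.fileio.npy_scp; the helper name is hypothetical.
def example_read_diarization(output_dir: str):
    from espnet2.fileio.npy_scp import NpyScpReader

    reader = NpyScpReader(f"{output_dir}/diarize.scp")
    for key in reader.keys():
        pred = reader[key]  # per-frame speaker activity predictions
        print(key, pred.shape)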
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_teacher_forcing: bool,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    speed_control_alpha: float,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding."""
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    text2speech = Text2Speech(
        train_config=train_config,
        model_file=model_file,
        threshold=threshold,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        use_teacher_forcing=use_teacher_forcing,
        use_att_constraint=use_att_constraint,
        backward_window=backward_window,
        forward_window=forward_window,
        speed_control_alpha=speed_control_alpha,
        vocoder_conf=vocoder_conf,
        dtype=dtype,
        device=device,
    )

    # 3. Build data-iterator
    if not text2speech.use_speech:
        data_path_and_name_and_type = list(
            filter(lambda x: x[1] != "speech", data_path_and_name_and_type))
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(text2speech.train_args, False),
        collate_fn=TTSTask.build_collate_fn(text2speech.train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    output_dir = Path(output_dir)
    (output_dir / "norm").mkdir(parents=True, exist_ok=True)
    (output_dir / "denorm").mkdir(parents=True, exist_ok=True)
    (output_dir / "speech_shape").mkdir(parents=True, exist_ok=True)
    (output_dir / "wav").mkdir(parents=True, exist_ok=True)
    (output_dir / "att_ws").mkdir(parents=True, exist_ok=True)
    (output_dir / "probs").mkdir(parents=True, exist_ok=True)
    (output_dir / "durations").mkdir(parents=True, exist_ok=True)
    (output_dir / "focus_rates").mkdir(parents=True, exist_ok=True)

    # Lazy load to avoid the backend error
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator

    with NpyScpWriter(output_dir / "norm",
                      output_dir / "norm/feats.scp") as norm_writer, \
            NpyScpWriter(output_dir / "denorm",
                         output_dir / "denorm/feats.scp") as denorm_writer, \
            open(output_dir / "speech_shape/speech_shape", "w") as shape_writer, \
            open(output_dir / "durations/durations", "w") as duration_writer, \
            open(output_dir / "focus_rates/focus_rates", "w") as focus_rate_writer:
        for idx, (keys, batch) in enumerate(loader, 1):
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert _bs == 1, _bs

            # Change to a single sequence and remove *_lengths
            # because inference() requires 1-seq, not a mini-batch.
            batch = {
                k: v[0]
                for k, v in batch.items() if not k.endswith("_lengths")
            }

            start_time = time.perf_counter()
            wav, outs, outs_denorm, probs, att_ws, duration, focus_rate = \
                text2speech(**batch)

            key = keys[0]
            insize = next(iter(batch.values())).size(0) + 1
            logging.info("inference speed = {:.1f} frames / sec.".format(
                int(outs.size(0)) / (time.perf_counter() - start_time)))
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(f"output length reaches maximum length ({key}).")

            norm_writer[key] = outs.cpu().numpy()
            shape_writer.write(f"{key} " + ",".join(map(str, outs.shape)) + "\n")

            denorm_writer[key] = outs_denorm.cpu().numpy()

            if duration is not None:
                # Save durations and focus rates
                duration_writer.write(
                    f"{key} " + " ".join(map(str, duration.cpu().numpy())) + "\n")
                focus_rate_writer.write(f"{key} {float(focus_rate):.5f}\n")

                # Plot attention weights
                att_ws = att_ws.cpu().numpy()
                if att_ws.ndim == 2:
                    att_ws = att_ws[None][None]
                elif att_ws.ndim != 4:
                    raise RuntimeError(f"Must be 2 or 4 dimension: {att_ws.ndim}")
                w, h = plt.figaspect(att_ws.shape[0] / att_ws.shape[1])
                fig = plt.Figure(figsize=(
                    w * 1.3 * min(att_ws.shape[0], 2.5),
                    h * 1.3 * min(att_ws.shape[1], 2.5),
                ))
                fig.suptitle(f"{key}")
                axes = fig.subplots(att_ws.shape[0], att_ws.shape[1])
                if len(att_ws) == 1:
                    axes = [[axes]]
                for ax, att_w in zip(axes, att_ws):
                    for ax_, att_w_ in zip(ax, att_w):
                        ax_.imshow(att_w_.astype(np.float32), aspect="auto")
                        ax_.set_xlabel("Input")
                        ax_.set_ylabel("Output")
                        ax_.xaxis.set_major_locator(MaxNLocator(integer=True))
                        ax_.yaxis.set_major_locator(MaxNLocator(integer=True))
                fig.set_tight_layout({"rect": [0, 0.03, 1, 0.95]})
                fig.savefig(output_dir / f"att_ws/{key}.png")
                fig.clf()

            if probs is not None:
                # Plot stop token prediction
                probs = probs.cpu().numpy()
                fig = plt.Figure()
                ax = fig.add_subplot(1, 1, 1)
                ax.plot(probs)
                ax.set_title(f"{key}")
                ax.set_xlabel("Output")
                ax.set_ylabel("Stop probability")
                ax.set_ylim(0, 1)
                ax.grid(which="both")
                fig.set_tight_layout(True)
                fig.savefig(output_dir / f"probs/{key}.png")
                fig.clf()

            # TODO(kamo): Write scp
            if wav is not None:
                sf.write(f"{output_dir}/wav/{key}.wav", wav.numpy(),
                         text2speech.fs, "PCM_16")

    # remove duration-related files if attention is not provided
    if att_ws is None:
        shutil.rmtree(output_dir / "att_ws")
        shutil.rmtree(output_dir / "durations")
        shutil.rmtree(output_dir / "focus_rates")
    if probs is None:
        shutil.rmtree(output_dir / "probs")
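# For reference (added), the function above lays out output_dir as follows:
#   norm/feats.scp, norm/*.npy        normalized generated features
#   denorm/feats.scp, denorm/*.npy    denormalized generated features
#   speech_shape/speech_shape         "<key> <T>,<D>" per utterance
#   wav/<key>.wav                     waveforms, when a vocoder is attached
#   att_ws/<key>.png                  attention plots (removed when unavailable)
#   durations/durations               per-token durations (removed when unavailable)
#   focus_rates/focus_rates           focus rates (removed when unavailable)
#   probs/<key>.png                   stop-token plots (removed when unavailable)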
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    model_tag: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
    maskctc_n_iterations: int,
    maskctc_threshold_probability: float,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text_kwargs = dict(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        batch_size=batch_size,
        dtype=dtype,
        maskctc_n_iterations=maskctc_n_iterations,
        maskctc_threshold_probability=maskctc_threshold_probability,
    )
    speech2text = Speech2Text.from_pretrained(
        model_tag=model_tag,
        **speech2text_kwargs,
    )

    # 3. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = {
                k: v[0]
                for k, v in batch.items() if not k.endswith("_lengths")
            }

            try:
                results = speech2text(**batch)
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["<space>"], [2], hyp]]

            # Only supporting batch_size==1
            key = keys[0]
            (text, token, token_int, hyp) = results[0]

            # Create a directory: outdir/1best_recog
            ibest_writer = writer["1best_recog"]

            # Write the result to each file
            ibest_writer["token"][key] = " ".join(token)
            ibest_writer["token_int"][key] = " ".join(map(str, token_int))
            ibest_writer["score"][key] = str(hyp.score)

            if text is not None:
                ibest_writer["text"][key] = text
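# Hedged sketch (added): building the same Mask-CTC Speech2Text directly,
# mirroring step 2 above. The paths and the iteration/threshold values are
# placeholders, not recommended settings.
def example_maskctc_usage(speech):
    s2t = Speech2Text.from_pretrained(
        asr_train_config="config.yaml",
        asr_model_file="model.pth",
        maskctc_n_iterations=10,
        maskctc_threshold_probability=0.99,
    )
    text, token, token_int, hyp = s2t(speech)[0]
    return text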
def run(
    cls,
    model: AbsESPnetModel,
    optimizers: Sequence[torch.optim.Optimizer],
    schedulers: Sequence[Optional[AbsScheduler]],
    train_iter_factory: AbsIterFactory,
    valid_iter_factory: AbsIterFactory,
    plot_attention_iter_factory: Optional[AbsIterFactory],
    reporter: Reporter,
    scaler: Optional[GradScaler],
    output_dir: Path,
    max_epoch: int,
    seed: int,
    patience: Optional[int],
    keep_nbest_models: int,
    early_stopping_criterion: Sequence[str],
    best_model_criterion: Sequence[Sequence[str]],
    val_scheduler_criterion: Sequence[str],
    trainer_options,
    distributed_option: DistributedOption,
) -> None:
    """Perform training. This method performs the main process of training."""
    assert check_argument_types()
    # NOTE(kamo): Don't check the type more strictly here because
    # trainer_options is only required to be a dataclass.
    assert is_dataclass(trainer_options), type(trainer_options)

    start_epoch = reporter.get_epoch() + 1
    if start_epoch == max_epoch + 1:
        logging.warning(
            f"The training has already reached max_epoch: {start_epoch}")

    if distributed_option.distributed:
        dp_model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=(
                # Multi-process with a single GPU per process
                [torch.cuda.current_device()]
                if distributed_option.ngpu == 1
                # Single process with multiple GPUs
                else None),
            output_device=(torch.cuda.current_device()
                           if distributed_option.ngpu == 1 else None),
        )
    elif distributed_option.ngpu > 1:
        dp_model = torch.nn.parallel.DataParallel(
            model,
            device_ids=list(range(distributed_option.ngpu)),
        )
    else:
        # NOTE(kamo): DataParallel also should work with ngpu=1,
        # but for debuggability it's better to keep this block.
        dp_model = model

    if not distributed_option.distributed or distributed_option.dist_rank == 0:
        summary_writer = SummaryWriter(str(output_dir / "tensorboard"))
    else:
        summary_writer = None

    start_time = time.perf_counter()
    for iepoch in range(start_epoch, max_epoch + 1):
        if iepoch != start_epoch:
            logging.info(
                "{}/{}epoch started. Estimated time to finish: {}".format(
                    iepoch,
                    max_epoch,
                    humanfriendly.format_timespan(
                        (time.perf_counter() - start_time) /
                        (iepoch - start_epoch) * (max_epoch - iepoch + 1)),
                ))
        else:
            logging.info(f"{iepoch}/{max_epoch}epoch started")
        set_all_random_seed(seed + iepoch)

        reporter.set_epoch(iepoch)
        # 1. Train and validate for one epoch
        with reporter.observe("train") as sub_reporter:
            all_steps_are_invalid = cls.train_one_epoch(
                model=dp_model,
                optimizers=optimizers,
                schedulers=schedulers,
                iterator=train_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                scaler=scaler,
                summary_writer=summary_writer,
                options=trainer_options,
            )

        with reporter.observe("valid") as sub_reporter:
            cls.validate_one_epoch(
                model=dp_model,
                iterator=valid_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                options=trainer_options,
            )

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # att_plot doesn't support distributed
            if plot_attention_iter_factory is not None:
                with reporter.observe("att_plot") as sub_reporter:
                    cls.plot_attention(
                        model=model,
                        output_dir=output_dir / "att_ws",
                        summary_writer=summary_writer,
                        iterator=plot_attention_iter_factory.build_iter(iepoch),
                        reporter=sub_reporter,
                        options=trainer_options,
                    )

        # 2. LR scheduler step
        for scheduler in schedulers:
            if isinstance(scheduler, AbsValEpochStepScheduler):
                scheduler.step(reporter.get_value(*val_scheduler_criterion))
            elif isinstance(scheduler, AbsEpochStepScheduler):
                scheduler.step()

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # 3. Report the results
            logging.info(reporter.log_message())
            reporter.matplotlib_plot(output_dir / "images")
            reporter.tensorboard_add_scalar(summary_writer)

            # 4. Save/Update the checkpoint
            torch.save(
                {
                    "model": model.state_dict(),
                    "reporter": reporter.state_dict(),
                    "optimizers": [o.state_dict() for o in optimizers],
                    "schedulers": [
                        s.state_dict() if s is not None else None
                        for s in schedulers
                    ],
                    "scaler": scaler.state_dict() if scaler is not None else None,
                },
                output_dir / "checkpoint.pth",
            )

            # 5. Save the model and update the link to the best model
            torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")

            # Create a symlink latest.pth -> {iepoch}epoch.pth
            p = output_dir / "latest.pth"
            if p.is_symlink() or p.exists():
                p.unlink()
            p.symlink_to(f"{iepoch}epoch.pth")

            _improved = []
            for _phase, k, _mode in best_model_criterion:
                # e.g. _phase, k, _mode = "train", "loss", "min"
                if reporter.has(_phase, k):
                    best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                    # Create symlinks if it's the best result
                    if best_epoch == iepoch:
                        p = output_dir / f"{_phase}.{k}.best.pth"
                        if p.is_symlink() or p.exists():
                            p.unlink()
                        p.symlink_to(f"{iepoch}epoch.pth")
                        _improved.append(f"{_phase}.{k}")
            if len(_improved) == 0:
                logging.info("There are no improvements in this epoch")
            else:
                logging.info("The best model has been updated: " +
                             ", ".join(_improved))

            # 6. Remove model files excluding the n-best epochs and the latest epoch
            _removed = []
            # Get the union of the n-best sets among multiple criteria
            nbests = set().union(*[
                set(reporter.sort_epochs(ph, k, m)[:keep_nbest_models])
                for ph, k, m in best_model_criterion if reporter.has(ph, k)
            ])
            for e in range(1, iepoch):
                p = output_dir / f"{e}epoch.pth"
                if p.exists() and e not in nbests:
                    p.unlink()
                    _removed.append(str(p))
            if len(_removed) != 0:
                logging.info("The model files were removed: " + ", ".join(_removed))

        # 7. If no update has happened, stop the training
        if all_steps_are_invalid:
            logging.warning(
                f"The gradients at all steps are invalid in this epoch. "
                f"Something seems wrong. This training was stopped at {iepoch}epoch")
            break

        # 8. Check early stopping
        if patience is not None:
            if reporter.check_early_stopping(patience, *early_stopping_criterion):
                break
    else:
        logging.info(f"The training was finished at {max_epoch} epochs")
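# Hedged sketch (added): restoring training state from the checkpoint written
# in step 4 above. The key names mirror the dict saved there; map_location and
# Reporter.load_state_dict are assumptions, and the helper name is hypothetical.
def example_restore_checkpoint(output_dir, model, optimizers, schedulers,
                               reporter, scaler):
    states = torch.load(output_dir / "checkpoint.pth", map_location="cpu")
    model.load_state_dict(states["model"])
    reporter.load_state_dict(states["reporter"])
    for o, state in zip(optimizers, states["optimizers"]):
        o.load_state_dict(state)
    for s, state in zip(schedulers, states["schedulers"]):
        if s is not None and state is not None:
            s.load_state_dict(state)
    if scaler is not None and states["scaler"] is not None:
        scaler.load_state_dict(states["scaler"])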
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
    sim_chunk_length: int,
    disable_repetition_detection: bool,
    encoded_feat_length_limit: int,
    decoder_text_length_limit: int,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text = Speech2TextStreaming(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        dtype=dtype,
        beam_size=beam_size,
        ctc_weight=ctc_weight,
        lm_weight=lm_weight,
        penalty=penalty,
        nbest=nbest,
        disable_repetition_detection=disable_repetition_detection,
        decoder_text_length_limit=decoder_text_length_limit,
        encoded_feat_length_limit=encoded_feat_length_limit,
    )

    # 3. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    # FIXME(kamo): The output format should be discussed
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = {
                k: v[0]
                for k, v in batch.items() if not k.endswith("_lengths")
            }
            assert len(batch.keys()) == 1

            try:
                if sim_chunk_length == 0:
                    # N-best list of (text, token, token_int, hyp_object)
                    results = speech2text(**batch)
                else:
                    # Simulate streaming by feeding fixed-size chunks and
                    # finalizing on the remaining tail.
                    speech = batch["speech"]
                    num_chunks = len(speech) // sim_chunk_length
                    for i in range(num_chunks):
                        speech2text(
                            speech=speech[i * sim_chunk_length:(i + 1) *
                                          sim_chunk_length],
                            is_final=False,
                        )
                    results = speech2text(
                        speech[num_chunks * sim_chunk_length:len(speech)],
                        is_final=True)
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["<space>"], [2], hyp]] * nbest

            # Only supporting batch_size==1
            key = keys[0]
            for n, (text, token, token_int, hyp) in zip(
                    range(1, nbest + 1), results):
                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each file
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if text is not None:
                    ibest_writer["text"][key] = text
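# Worked example (added) for the chunk simulation above: with
# len(speech) == 35000 and sim_chunk_length == 16000, num_chunks == 2, so the
# chunks [0:16000) and [16000:32000) are fed with is_final=False and the tail
# [32000:35000) is fed with is_final=True to obtain the final result.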
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding."""
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    model, train_args = TTSTask.build_model_from_file(train_config, model_file,
                                                      device)
    model.to(dtype=getattr(torch, dtype)).eval()
    tts = model.tts
    normalize = model.normalize
    logging.info(f"Normalization:\n{normalize}")
    logging.info(f"TTS:\n{tts}")

    # 3. Build data-iterator
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(train_args, False),
        collate_fn=TTSTask.build_collate_fn(train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Build converter from spectrogram to waveform
    if model.feats_extract is not None:
        vocoder_conf.update(model.feats_extract.get_parameters())
    if "n_fft" in vocoder_conf and "n_shift" in vocoder_conf and "fs" in vocoder_conf:
        spc2wav = Spectrogram2Waveform(**vocoder_conf)
        logging.info(f"Vocoder: {spc2wav}")
    else:
        spc2wav = None
        logging.info("Vocoder is not used because vocoder_conf is not sufficient")

    # 5. Start for-loop
    output_dir = Path(output_dir)
    (output_dir / "norm").mkdir(parents=True, exist_ok=True)
    (output_dir / "denorm").mkdir(parents=True, exist_ok=True)
    (output_dir / "wav").mkdir(parents=True, exist_ok=True)

    # FIXME(kamo): I think we shouldn't depend on kaldi-format any more.
    # How about numpy or HDF5?
    # >>> with NpyScpWriter() as f:
    with kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(
            o=output_dir / "norm/feats")) as f, \
            kaldiio.WriteHelper("ark,scp:{o}.ark,{o}.scp".format(
                o=output_dir / "denorm/feats")) as g:
        for idx, (keys, batch) in enumerate(loader, 1):
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = to_device(batch, device)

            key = keys[0]
            # Change to a single sequence and remove *_lengths
            # because inference() requires 1-seq, not a mini-batch.
            _data = {
                k: v[0]
                for k, v in batch.items() if not k.endswith("_lengths")
            }

            start_time = time.perf_counter()

            # TODO(kamo): Now att_ws is not used.
            outs, probs, att_ws = tts.inference(
                **_data,
                threshold=threshold,
                maxlenratio=maxlenratio,
                minlenratio=minlenratio,
            )
            outs_denorm = normalize.inverse(outs[None])[0][0]
            insize = next(iter(_data.values())).size(0)
            logging.info("inference speed = {} msec / frame.".format(
                (time.perf_counter() - start_time) * 1000 / int(outs.size(0))))
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(f"output length reaches maximum length ({key}).")
            f[key] = outs.cpu().numpy()
            g[key] = outs_denorm.cpu().numpy()

            # TODO(kamo): Write scp
            if spc2wav is not None:
                wav = spc2wav(outs_denorm.cpu().numpy())
                sf.write(f"{output_dir}/wav/{key}.wav", wav, spc2wav.fs, "PCM_16")
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding."""
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    model, train_args = TTSTask.build_model_from_file(train_config, model_file,
                                                      device)
    model.to(dtype=getattr(torch, dtype)).eval()
    tts = model.tts
    normalize = model.normalize
    logging.info(f"Normalization:\n{normalize}")
    logging.info(f"TTS:\n{tts}")

    # 3. Build data-iterator
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(train_args, False),
        collate_fn=TTSTask.build_collate_fn(train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Build converter from spectrogram to waveform
    if model.feats_extract is not None:
        vocoder_conf.update(model.feats_extract.get_parameters())
    if "n_fft" in vocoder_conf and "n_shift" in vocoder_conf and "fs" in vocoder_conf:
        spc2wav = Spectrogram2Waveform(**vocoder_conf)
        logging.info(f"Vocoder: {spc2wav}")
    else:
        spc2wav = None
        logging.info("Vocoder is not used because vocoder_conf is not sufficient")

    # 5. Start for-loop
    output_dir = Path(output_dir)
    (output_dir / "norm").mkdir(parents=True, exist_ok=True)
    (output_dir / "denorm").mkdir(parents=True, exist_ok=True)
    (output_dir / "wav").mkdir(parents=True, exist_ok=True)
    (output_dir / "att_ws").mkdir(parents=True, exist_ok=True)
    (output_dir / "probs").mkdir(parents=True, exist_ok=True)

    with NpyScpWriter(output_dir / "norm",
                      output_dir / "norm/feats.scp") as f, \
            NpyScpWriter(output_dir / "denorm",
                         output_dir / "denorm/feats.scp") as g:
        for idx, (keys, batch) in enumerate(loader, 1):
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = to_device(batch, device)

            key = keys[0]
            # Change to a single sequence and remove *_lengths
            # because inference() requires 1-seq, not a mini-batch.
            _data = {
                k: v[0]
                for k, v in batch.items() if not k.endswith("_lengths")
            }

            start_time = time.perf_counter()

            _decode_conf = {
                "threshold": threshold,
                "maxlenratio": maxlenratio,
                "minlenratio": minlenratio,
            }
            if isinstance(tts, Tacotron2):
                _decode_conf.update({
                    "use_att_constraint": use_att_constraint,
                    "forward_window": forward_window,
                    "backward_window": backward_window,
                })
            outs, probs, att_ws = tts.inference(**_data, **_decode_conf)

            insize = next(iter(_data.values())).size(0) + 1
            logging.info("inference speed = {:.1f} frames / sec.".format(
                int(outs.size(0)) / (time.perf_counter() - start_time)))
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(f"output length reaches maximum length ({key}).")
            f[key] = outs.cpu().numpy()

            # NOTE: normalize.inverse is an in-place operation
            outs_denorm = normalize.inverse(outs[None])[0][0]
            g[key] = outs_denorm.cpu().numpy()

            # Lazy load to avoid the backend error
            matplotlib.use("Agg")
            import matplotlib.pyplot as plt
            from matplotlib.ticker import MaxNLocator

            # Plot attention weights
            att_ws = att_ws.cpu().numpy()
            if att_ws.ndim == 2:
                att_ws = att_ws[None][None]
            elif att_ws.ndim != 4:
                raise RuntimeError(f"Must be 2 or 4 dimension: {att_ws.ndim}")
            w, h = plt.figaspect(att_ws.shape[0] / att_ws.shape[1])
            fig = plt.Figure(figsize=(
                w * 1.3 * min(att_ws.shape[0], 2.5),
                h * 1.3 * min(att_ws.shape[1], 2.5),
            ))
            fig.suptitle(f"{key}")
            axes = fig.subplots(att_ws.shape[0], att_ws.shape[1])
            if len(att_ws) == 1:
                axes = [[axes]]
            for ax, att_w in zip(axes, att_ws):
                for ax_, att_w_ in zip(ax, att_w):
                    ax_.imshow(att_w_.astype(np.float32), aspect="auto")
                    ax_.set_xlabel("Input")
                    ax_.set_ylabel("Output")
                    ax_.xaxis.set_major_locator(MaxNLocator(integer=True))
                    ax_.yaxis.set_major_locator(MaxNLocator(integer=True))
            fig.tight_layout(rect=[0, 0.03, 1, 0.95])
            fig.savefig(output_dir / f"att_ws/{key}.png")
            fig.clf()

            # Plot stop token prediction
            probs = probs.cpu().numpy()
            fig = plt.Figure()
            ax = fig.add_subplot(1, 1, 1)
            ax.plot(probs)
            ax.set_title(f"{key}")
            ax.set_xlabel("Output")
            ax.set_ylabel("Stop probability")
            ax.set_ylim(0, 1)
            ax.grid(which="both")
            fig.tight_layout()
            fig.savefig(output_dir / f"probs/{key}.png")
            fig.clf()

            # TODO(kamo): Write scp
            if spc2wav is not None:
                wav = spc2wav(outs_denorm.cpu().numpy())
                sf.write(f"{output_dir}/wav/{key}.wav", wav, spc2wav.fs, "PCM_16")
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
    streaming: bool,
):
    assert check_argument_types()
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text = k2Speech2Text(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        dtype=dtype,
        beam_size=beam_size,
        ctc_weight=ctc_weight,
        lm_weight=lm_weight,
        penalty=penalty,
        nbest=nbest,
        streaming=streaming,
    )

    # 3. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    with DatadirWriter(output_dir) as writer:
        for batch_idx, (keys, batch) in enumerate(loader):
            if batch_idx % 10 == 0:
                logging.info(f"Processing batch {batch_idx}")
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            # 1-best list of (text, token, token_int, score)
            results = speech2text(batch)

            for key_idx, (text, token, token_int, score) in enumerate(results):
                key = keys[key_idx]
                best_writer = writer["1best_recog"]

                # Write the result to each file
                best_writer["token"][key] = " ".join(token)
                best_writer["token_int"][key] = " ".join(map(str, token_int))
                best_writer["score"][key] = str(score)

                if text is not None:
                    best_writer["text"][key] = text
def main_worker(cls, args: argparse.Namespace):
    assert check_argument_types()

    # 0. Init distributed process
    distributed_option = build_dataclass(DistributedOption, args)
    # Setting distributed_option.dist_rank, etc.
    distributed_option.init_options()

    # NOTE(kamo): Don't use logging before invoking logging.basicConfig()
    if not distributed_option.distributed or distributed_option.dist_rank == 0:
        if not distributed_option.distributed:
            _rank = ""
        else:
            _rank = (f":{distributed_option.dist_rank}/"
                     f"{distributed_option.dist_world_size}")

        # NOTE(kamo):
        # logging.basicConfig() is invoked in main_worker() instead of main()
        # because it can be invoked only once in a process.
        # FIXME(kamo): Should we use logging.getLogger()?
        logging.basicConfig(
            level=args.log_level,
            format=f"[{os.uname()[1].split('.')[0]}{_rank}]"
            f" %(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        # Suppress logging if RANK != 0
        logging.basicConfig(
            level="ERROR",
            format=f"[{os.uname()[1].split('.')[0]}"
            f":{distributed_option.dist_rank}/{distributed_option.dist_world_size}]"
            f" %(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    # Invoking torch.distributed.init_process_group
    distributed_option.init_torch_distributed()

    # 1. Set random-seed
    set_all_random_seed(args.seed)
    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    torch.backends.cudnn.deterministic = args.cudnn_deterministic
    if args.detect_anomaly:
        logging.info("Invoking torch.autograd.set_detect_anomaly(True)")
        torch.autograd.set_detect_anomaly(args.detect_anomaly)

    # 2. Build model
    model = cls.build_model(args=args)
    if not isinstance(model, AbsESPnetModel):
        raise RuntimeError(
            f"model must inherit {AbsESPnetModel.__name__}, but got {type(model)}")
    model = model.to(
        dtype=getattr(torch, args.train_dtype),
        device="cuda" if args.ngpu > 0 else "cpu",
    )
    for t in args.freeze_param:
        for k, p in model.named_parameters():
            if k.startswith(t + ".") or k == t:
                logging.info(f"Setting {k}.requires_grad = False")
                p.requires_grad = False

    # 3. Build optimizer
    optimizers = cls.build_optimizers(args, model=model)

    # 4. Build schedulers
    schedulers = []
    for i, optim in enumerate(optimizers, 1):
        suf = "" if i == 1 else str(i)
        name = getattr(args, f"scheduler{suf}")
        conf = getattr(args, f"scheduler{suf}_conf")
        if name is not None:
            cls_ = scheduler_classes.get(name)
            if cls_ is None:
                raise ValueError(
                    f"must be one of {list(scheduler_classes)}: {name}")
            scheduler = cls_(optim, **conf)
        else:
            scheduler = None
        schedulers.append(scheduler)

    logging.info(pytorch_cudnn_version())
    logging.info(model_summary(model))
    for i, (o, s) in enumerate(zip(optimizers, schedulers), 1):
        suf = "" if i == 1 else str(i)
        logging.info(f"Optimizer{suf}:\n{o}")
        logging.info(f"Scheduler{suf}: {s}")

    # 5. Dump "args" to config.yaml
    # NOTE(kamo): "args" should be saved after object building is done
    # because the builders are allowed to modify "args".
    output_dir = Path(args.output_dir)
    if not distributed_option.distributed or distributed_option.dist_rank == 0:
        output_dir.mkdir(parents=True, exist_ok=True)
        with (output_dir / "config.yaml").open("w", encoding="utf-8") as f:
            logging.info(
                f'Saving the configuration in {output_dir / "config.yaml"}')
            yaml_no_alias_safe_dump(vars(args), f, indent=4, sort_keys=False)

    # 6. Load pretrained model
    for p in args.init_param:
        logging.info(f"Loading pretrained params from {p}")
        load_pretrained_model(
            model=model,
            init_param=p,
            # NOTE(kamo): "cuda" for torch.load always indicates cuda:0
            # in PyTorch<=1.4
            map_location=f"cuda:{torch.cuda.current_device()}"
            if args.ngpu > 0 else "cpu",
        )

    if args.dry_run:
        pass
    elif args.collect_stats:
        # Perform in collect_stats mode. This mode has two roles:
        # - Derive the length and dimension of all input data
        # - Accumulate feats, squared values, and lengths for whitening
        logging.info(args)

        if args.valid_batch_size is None:
            args.valid_batch_size = args.batch_size

        if len(args.train_shape_file) != 0:
            train_key_file = args.train_shape_file[0]
        else:
            train_key_file = None
        if len(args.train_pseudo_shape_file) != 0:
            train_pseudo_key_file = args.train_pseudo_shape_file[0]
        else:
            train_pseudo_key_file = None
        if len(args.valid_shape_file) != 0:
            valid_key_file = args.valid_shape_file[0]
        else:
            valid_key_file = None

        collect_stats(
            model=model,
            train_iter=cls.build_streaming_iterator(
                data_path_and_name_and_type=args.train_data_path_and_name_and_type,
                key_file=train_key_file,
                batch_size=args.batch_size,
                dtype=args.train_dtype,
                num_workers=args.num_workers,
                allow_variable_data_keys=args.allow_variable_data_keys,
                ngpu=args.ngpu,
                preprocess_fn=cls.build_preprocess_fn(args, train=False),
                collate_fn=cls.build_collate_fn(args, train=False),
            ),
            train_pseudo_iter=cls.build_streaming_iterator(
                data_path_and_name_and_type=args.
                train_pseudo_data_path_and_name_and_type,
                key_file=train_pseudo_key_file,
                batch_size=args.batch_size,
                dtype=args.train_dtype,
                num_workers=args.num_workers,
                allow_variable_data_keys=args.allow_variable_data_keys,
                ngpu=args.ngpu,
                preprocess_fn=cls.build_preprocess_fn(args, train=False),
                collate_fn=cls.build_collate_fn(args, train=False),
            ),
            valid_iter=cls.build_streaming_iterator(
                data_path_and_name_and_type=args.valid_data_path_and_name_and_type,
                key_file=valid_key_file,
                batch_size=args.valid_batch_size,
                dtype=args.train_dtype,
                num_workers=args.num_workers,
                allow_variable_data_keys=args.allow_variable_data_keys,
                ngpu=args.ngpu,
                preprocess_fn=cls.build_preprocess_fn(args, train=False),
                collate_fn=cls.build_collate_fn(args, train=False),
            ),
            output_dir=output_dir,
            ngpu=args.ngpu,
            log_interval=args.log_interval,
            write_collected_feats=args.write_collected_feats,
        )
    else:
        # 7. Build iterator factories
        assert not args.multiple_iterator
        train_iter_factory = cls.build_iter_factory(
            args=args,
            distributed_option=distributed_option,
            mode="train",
        )
        train_pseudo_iter_factory = cls.build_iter_factory(
            args=args,
            distributed_option=distributed_option,
            mode="pseudo",
        )
        valid_iter_factory = cls.build_iter_factory(
            args=args,
            distributed_option=distributed_option,
            mode="valid",
        )
        if args.num_att_plot != 0:
            plot_attention_iter_factory = cls.build_iter_factory(
                args=args,
                distributed_option=distributed_option,
                mode="plot_att",
            )
        else:
            plot_attention_iter_factory = None

        # 8. Start training
        if args.use_wandb:
            if (not distributed_option.distributed
                    or distributed_option.dist_rank == 0):
                if args.wandb_project is None:
                    project = ("ESPnet_" + cls.__name__ +
                               str(Path(".").resolve()).replace("/", "_"))
                else:
                    project = args.wandb_project
                if args.wandb_id is None:
                    wandb_id = str(output_dir).replace("/", "_")
                else:
                    wandb_id = args.wandb_id
                wandb.init(
                    project=project,
                    dir=output_dir,
                    id=wandb_id,
                    resume="allow",
                )
                wandb.config.update(args)
            else:
                # wandb also supports grouping for distributed training,
                # but we only log aggregated data, so it's enough to perform
                # this on the rank-0 node.
                args.use_wandb = False

        # Don't give args to trainer.run() directly!!!
        # Instead, define an "Options" object and build it here.
        trainer_options = cls.trainer.build_options(args)

        cls.trainer.run(
            model=model,
            optimizers=optimizers,
            schedulers=schedulers,
            train_iter_factory=train_iter_factory,
            train_pseudo_iter_factory=train_pseudo_iter_factory,
            valid_iter_factory=valid_iter_factory,
            plot_attention_iter_factory=plot_attention_iter_factory,
            trainer_options=trainer_options,
            distributed_option=distributed_option,
        )
def test_set_all_random_seed():
    set_all_random_seed(0)
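# Hedged test sketch (added): assumes set_all_random_seed seeds Python's
# random, NumPy, and PyTorch generators, as its name suggests.
def test_set_all_random_seed_is_reproducible():
    import random

    import numpy as np
    import torch

    set_all_random_seed(0)
    first = (random.random(), float(np.random.rand()), float(torch.rand(1)))
    set_all_random_seed(0)
    second = (random.random(), float(np.random.rand()), float(torch.rand(1)))
    assert first == second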
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    enh_train_config: str,
    enh_model_file: str,
    allow_variable_data_keys: bool,
    normalize_output_wav: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build Enh model
    enh_model, enh_train_args = EnhancementTask.build_model_from_file(
        enh_train_config, enh_model_file, device)
    enh_model.eval()

    num_spk = enh_model.num_spk

    # 3. Build data-iterator
    loader = EnhancementTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=EnhancementTask.build_preprocess_fn(enh_train_args, False),
        collate_fn=EnhancementTask.build_collate_fn(enh_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    writers = []
    for i in range(num_spk):
        writers.append(
            SoundScpWriter(f"{output_dir}/wavs/{i + 1}",
                           f"{output_dir}/spk{i + 1}.scp"))

    for keys, batch in loader:
        assert isinstance(batch, dict), type(batch)
        assert all(isinstance(s, str) for s in keys), keys
        _bs = len(next(iter(batch.values())))
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"

        with torch.no_grad():
            # a. To device
            batch = to_device(batch, device)
            # b. Forward Enhancement Frontend
            waves, _, _ = enh_model.enh_model.forward_rawwav(
                batch["speech_mix"], batch["speech_mix_lengths"])
            assert len(waves[0]) == batch_size, len(waves[0])

        # FIXME(Chenda): will be incorrect when batch size is not 1
        # or in the multi-channel case
        if normalize_output_wav:
            waves = [
                (w / abs(w).max(dim=1, keepdim=True)[0] * 0.9).T.cpu().numpy()
                for w in waves
            ]  # list[(sample, batch)]
        else:
            waves = [w.T.cpu().numpy() for w in waves]

        for i, w in enumerate(waves):
            writers[i][keys[0]] = fs, w

    for writer in writers:
        writer.close()
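# Worked example (added) of the normalization above: a separated wave with
# peak |w| == 2.0 is scaled by 0.9 / 2.0 == 0.45, so every output sample lies
# in [-0.9, 0.9] and clipping is avoided when writing 16-bit PCM.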
def run(
    cls,
    model: AbsESPnetModel,
    optimizers: Sequence[torch.optim.Optimizer],
    schedulers: Sequence[Optional[AbsScheduler]],
    train_iter_factory: AbsIterFactory,
    valid_iter_factory: AbsIterFactory,
    plot_attention_iter_factory: Optional[AbsIterFactory],
    trainer_options,
    distributed_option: DistributedOption,
) -> None:
    """Perform training. This method performs the main process of training."""
    assert check_argument_types()
    # NOTE(kamo): Don't check the type more strictly here because
    # trainer_options is only required to be a dataclass.
    assert is_dataclass(trainer_options), type(trainer_options)
    assert len(optimizers) == len(schedulers), (len(optimizers), len(schedulers))

    if isinstance(trainer_options.keep_nbest_models, int):
        keep_nbest_models = [trainer_options.keep_nbest_models]
    else:
        if len(trainer_options.keep_nbest_models) == 0:
            logging.warning("No keep_nbest_models is given. Change to [1]")
            trainer_options.keep_nbest_models = [1]
        keep_nbest_models = trainer_options.keep_nbest_models

    output_dir = Path(trainer_options.output_dir)
    reporter = Reporter()
    if trainer_options.use_amp:
        if LooseVersion(torch.__version__) < LooseVersion("1.6.0"):
            raise RuntimeError(
                "Require torch>=1.6.0 for Automatic Mixed Precision")
        if trainer_options.sharded_ddp:
            if fairscale is None:
                raise RuntimeError(
                    "Requiring fairscale. Do 'pip install fairscale'")
            scaler = fairscale.optim.grad_scaler.ShardedGradScaler()
        else:
            scaler = GradScaler()
    else:
        scaler = None

    if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
        cls.resume(
            checkpoint=output_dir / "checkpoint.pth",
            model=model,
            optimizers=optimizers,
            schedulers=schedulers,
            reporter=reporter,
            scaler=scaler,
            ngpu=trainer_options.ngpu,
        )

    start_epoch = reporter.get_epoch() + 1
    if start_epoch == trainer_options.max_epoch + 1:
        logging.warning(
            f"The training has already reached max_epoch: {start_epoch}")

    if distributed_option.distributed:
        if trainer_options.sharded_ddp:
            dp_model = fairscale.nn.data_parallel.ShardedDataParallel(
                module=model,
                sharded_optimizer=optimizers,
            )
        else:
            dp_model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=(
                    # Multi-process with a single GPU per process
                    [torch.cuda.current_device()]
                    if distributed_option.ngpu == 1
                    # Single process with multiple GPUs
                    else None),
                output_device=(torch.cuda.current_device()
                               if distributed_option.ngpu == 1 else None),
                find_unused_parameters=trainer_options.unused_parameters,
            )
    elif distributed_option.ngpu > 1:
        dp_model = torch.nn.parallel.DataParallel(
            model,
            device_ids=list(range(distributed_option.ngpu)),
        )
    else:
        # NOTE(kamo): DataParallel also should work with ngpu=1,
        # but for debuggability it's better to keep this block.
        dp_model = model

    if trainer_options.use_tensorboard and (
            not distributed_option.distributed
            or distributed_option.dist_rank == 0):
        from torch.utils.tensorboard import SummaryWriter

        train_summary_writer = SummaryWriter(
            str(output_dir / "tensorboard" / "train"))
        valid_summary_writer = SummaryWriter(
            str(output_dir / "tensorboard" / "valid"))
    else:
        train_summary_writer = None
        valid_summary_writer = None

    start_time = time.perf_counter()
    for iepoch in range(start_epoch, trainer_options.max_epoch + 1):
        if iepoch != start_epoch:
            logging.info(
                "{}/{}epoch started. Estimated time to finish: {}".format(
                    iepoch,
                    trainer_options.max_epoch,
                    humanfriendly.format_timespan(
                        (time.perf_counter() - start_time) /
                        (iepoch - start_epoch) *
                        (trainer_options.max_epoch - iepoch + 1)),
                ))
        else:
            logging.info(f"{iepoch}/{trainer_options.max_epoch}epoch started")
        set_all_random_seed(trainer_options.seed + iepoch)

        reporter.set_epoch(iepoch)
        # 1. Train and validate for one epoch
        with reporter.observe("train") as sub_reporter:
            all_steps_are_invalid = cls.train_one_epoch(
                model=dp_model,
                optimizers=optimizers,
                schedulers=schedulers,
                iterator=train_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                scaler=scaler,
                summary_writer=train_summary_writer,
                options=trainer_options,
                distributed_option=distributed_option,
            )

        with reporter.observe("valid") as sub_reporter:
            cls.validate_one_epoch(
                model=dp_model,
                iterator=valid_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                options=trainer_options,
                distributed_option=distributed_option,
            )

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # att_plot doesn't support distributed
            if plot_attention_iter_factory is not None:
                with reporter.observe("att_plot") as sub_reporter:
                    cls.plot_attention(
                        model=model,
                        output_dir=output_dir / "att_ws",
                        summary_writer=train_summary_writer,
                        iterator=plot_attention_iter_factory.build_iter(iepoch),
                        reporter=sub_reporter,
                        options=trainer_options,
                    )

        # 2. LR scheduler step
        for scheduler in schedulers:
            if isinstance(scheduler, AbsValEpochStepScheduler):
                scheduler.step(
                    reporter.get_value(*trainer_options.val_scheduler_criterion))
            elif isinstance(scheduler, AbsEpochStepScheduler):
                scheduler.step()

        if trainer_options.sharded_ddp:
            for optimizer in optimizers:
                if isinstance(optimizer, fairscale.optim.oss.OSS):
                    optimizer.consolidate_state_dict()

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # 3. Report the results
            logging.info(reporter.log_message())
            if trainer_options.use_matplotlib:
                reporter.matplotlib_plot(output_dir / "images")
            if train_summary_writer is not None:
                reporter.tensorboard_add_scalar(train_summary_writer, key1="train")
                reporter.tensorboard_add_scalar(valid_summary_writer, key1="valid")
            if trainer_options.use_wandb:
                reporter.wandb_log()

            # 4. Save/Update the checkpoint
            torch.save(
                {
                    "model": model.state_dict(),
                    "reporter": reporter.state_dict(),
                    "optimizers": [o.state_dict() for o in optimizers],
                    "schedulers": [
                        s.state_dict() if s is not None else None
                        for s in schedulers
                    ],
                    "scaler": scaler.state_dict() if scaler is not None else None,
                },
                output_dir / "checkpoint.pth",
            )

            # 5. Save and log the model and update the link to the best model
            torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")

            # Create a symlink latest.pth -> {iepoch}epoch.pth
            p = output_dir / "latest.pth"
            if p.is_symlink() or p.exists():
                p.unlink()
            p.symlink_to(f"{iepoch}epoch.pth")

            _improved = []
            best_epoch = None
            for _phase, k, _mode in trainer_options.best_model_criterion:
                # e.g. _phase, k, _mode = "train", "loss", "min"
                if reporter.has(_phase, k):
                    best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                    # Create symlinks if it's the best result
                    if best_epoch == iepoch:
                        p = output_dir / f"{_phase}.{k}.best.pth"
                        if p.is_symlink() or p.exists():
                            p.unlink()
                        p.symlink_to(f"{iepoch}epoch.pth")
                        _improved.append(f"{_phase}.{k}")
            if len(_improved) == 0:
                logging.info("There are no improvements in this epoch")
            else:
                logging.info("The best model has been updated: " +
                             ", ".join(_improved))

            log_model = (trainer_options.wandb_model_log_interval > 0
                         and iepoch % trainer_options.wandb_model_log_interval == 0)
            if log_model and trainer_options.use_wandb:
                import wandb

                logging.info("Logging the model on this epoch")
                artifact = wandb.Artifact(
                    name=f"model_{wandb.run.id}",
                    type="model",
                    metadata={"improved": _improved},
                )
                artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
                aliases = [
                    f"epoch-{iepoch}",
                    "best" if best_epoch == iepoch else "",
                ]
                wandb.log_artifact(artifact, aliases=aliases)

            # 6. Remove model files excluding the n-best epochs and the latest epoch
            _removed = []
            # Get the union of the n-best sets among multiple criteria
            nbests = set().union(*[
                set(reporter.sort_epochs(ph, k, m)[:max(keep_nbest_models)])
                for ph, k, m in trainer_options.best_model_criterion
                if reporter.has(ph, k)
            ])

            # Generate the n-best averaged model
            if (trainer_options.nbest_averaging_interval > 0 and
                    iepoch % trainer_options.nbest_averaging_interval == 0):
                average_nbest_models(
                    reporter=reporter,
                    output_dir=output_dir,
                    best_model_criterion=trainer_options.best_model_criterion,
                    nbest=keep_nbest_models,
                    suffix=f"till{iepoch}epoch",
                )

            for e in range(1, iepoch):
                p = output_dir / f"{e}epoch.pth"
                if p.exists() and e not in nbests:
                    p.unlink()
                    _removed.append(str(p))
            if len(_removed) != 0:
                logging.info("The model files were removed: " + ", ".join(_removed))

        # 7. If no update has happened, stop the training
        if all_steps_are_invalid:
            logging.warning(
                f"The gradients at all steps are invalid in this epoch. "
                f"Something seems wrong. This training was stopped at {iepoch}epoch")
            break

        # 8. Check early stopping
        if trainer_options.patience is not None:
            if reporter.check_early_stopping(
                    trainer_options.patience,
                    *trainer_options.early_stopping_criterion):
                break
    else:
        logging.info(
            f"The training was finished at {trainer_options.max_epoch} epochs")

    # Generate the n-best averaged model
    if not distributed_option.distributed or distributed_option.dist_rank == 0:
        average_nbest_models(
            reporter=reporter,
            output_dir=output_dir,
            best_model_criterion=trainer_options.best_model_criterion,
            nbest=keep_nbest_models,
        )
def calc_perplexity(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    log_base: Optional[float],
    allow_variable_data_keys: bool,
):
    assert check_argument_types()
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build LM
    model, train_args = LMTask.build_model_from_file(train_config, model_file,
                                                     device)
    # Wrap the model to make model.nll() data-parallel
    wrapped_model = ForwardAdaptor(model, "nll")
    wrapped_model.to(dtype=getattr(torch, dtype)).eval()
    logging.info(f"Model:\n{model}")

    # 3. Build data-iterator
    loader = LMTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=LMTask.build_preprocess_fn(train_args, False),
        collate_fn=LMTask.build_collate_fn(train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    with DatadirWriter(output_dir) as writer:
        total_nll = 0.0
        total_ntokens = 0
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                batch = to_device(batch, device)
                if ngpu <= 1:
                    # NOTE(kamo): data_parallel also should work with ngpu=1,
                    # but for debuggability it's better to keep this block.
                    nll, lengths = wrapped_model(**batch)
                else:
                    nll, lengths = data_parallel(
                        wrapped_model, (), range(ngpu), module_kwargs=batch)

            assert _bs == len(nll) == len(lengths), (_bs, len(nll), len(lengths))
            # nll: (B, L) -> (B,)
            nll = nll.detach().cpu().numpy().sum(1)
            # lengths: (B,)
            lengths = lengths.detach().cpu().numpy()
            total_nll += nll.sum()
            total_ntokens += lengths.sum()

            for key, _nll, ntoken in zip(keys, nll, lengths):
                if log_base is None:
                    utt_ppl = np.exp(_nll / ntoken)
                else:
                    utt_ppl = log_base ** (_nll / ntoken / np.log(log_base))

                # Write the PPL of each utterance for debugging or analysis
                writer["utt2ppl"][key] = str(utt_ppl)
                writer["utt2ntokens"][key] = str(ntoken)

    if log_base is None:
        ppl = np.exp(total_nll / total_ntokens)
    else:
        ppl = log_base ** (total_nll / total_ntokens / np.log(log_base))

    with (Path(output_dir) / "ppl").open("w", encoding="utf-8") as f:
        f.write(f"{ppl}\n")
    with (Path(output_dir) / "base").open("w", encoding="utf-8") as f:
        if log_base is None:
            _log_base = np.e
        else:
            _log_base = log_base
        f.write(f"{_log_base}\n")
    logging.info(f"PPL={ppl}")
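# Note (added): the two PPL branches above agree because, for nll in nats,
# b ** (x / ln(b)) == e ** x for any base b > 1, so the reported perplexity is
# base-independent; only the value written to the "base" file differs.
# A quick numeric check with a hypothetical per-token nll of 1.7:
# >>> import numpy as np
# >>> x = 1.7
# >>> np.isclose(np.exp(x), 10 ** (x / np.log(10)))
# True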
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    model_tag: Optional[str],
    inference_config: Optional[str],
    allow_variable_data_keys: bool,
    segment_size: Optional[float],
    hop_size: Optional[float],
    normalize_segment_scale: bool,
    show_progressbar: bool,
    ref_channel: Optional[int],
    normalize_output_wav: bool,
    enh_s2t_task: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build separate_speech
    separate_speech_kwargs = dict(
        train_config=train_config,
        model_file=model_file,
        inference_config=inference_config,
        segment_size=segment_size,
        hop_size=hop_size,
        normalize_segment_scale=normalize_segment_scale,
        show_progressbar=show_progressbar,
        ref_channel=ref_channel,
        normalize_output_wav=normalize_output_wav,
        device=device,
        dtype=dtype,
        enh_s2t_task=enh_s2t_task,
    )
    separate_speech = SeparateSpeech.from_pretrained(
        model_tag=model_tag,
        **separate_speech_kwargs,
    )

    # 3. Build data-iterator
    loader = EnhancementTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=EnhancementTask.build_preprocess_fn(
            separate_speech.enh_train_args, False
        ),
        collate_fn=EnhancementTask.build_collate_fn(
            separate_speech.enh_train_args, False
        ),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    output_dir = Path(output_dir).expanduser().resolve()
    writers = []
    for i in range(separate_speech.num_spk):
        writers.append(
            SoundScpWriter(f"{output_dir}/wavs/{i + 1}", f"{output_dir}/spk{i + 1}.scp")
        )

    for i, (keys, batch) in enumerate(loader):
        logging.info(f"[{i}] Enhancing {keys}")
        assert isinstance(batch, dict), type(batch)
        assert all(isinstance(s, str) for s in keys), keys
        _bs = len(next(iter(batch.values())))
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"

        batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
        waves = separate_speech(**batch)
        for spk, w in enumerate(waves):
            for b in range(batch_size):
                writers[spk][keys[b]] = fs, w[b]

    for writer in writers:
        writer.close()
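For reference, the writer assignments above produce Kaldi-style scp entries: `writers[spk][key] = fs, wav` writes a wav file under `wavs/{spk + 1}` and records a `key -> path` line in `spk{spk + 1}.scp`. A minimal usage sketch with dummy audio, assuming the `espnet2.fileio.sound_scp` import path used by ESPnet:

import numpy as np
from espnet2.fileio.sound_scp import SoundScpWriter

fs = 16000
wav = np.random.randn(fs).astype(np.float32)  # 1 s of dummy audio

# One writer per output speaker, mirroring the loop above
writer = SoundScpWriter("exp/enh/wavs/1", "exp/enh/spk1.scp")
writer["utt1"] = fs, wav  # writes utt1.wav and appends "utt1 <path>" to spk1.scp
writer.close()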
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    lm_weight: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    asr_train_config: Optional[str],
    asr_model_file: Optional[str],
    beam_search_config: Optional[dict],
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    model_tag: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    key_file: Optional[str],
    allow_variable_data_keys: bool,
    quantize_asr_model: Optional[bool],
    quantize_modules: Optional[List[str]],
    quantize_dtype: Optional[str],
    streaming: Optional[bool],
    chunk_size: Optional[int],
    left_context: Optional[int],
    right_context: Optional[int],
    display_partial_hypotheses: bool,
) -> None:
    """Transducer model inference.

    Args:
        output_dir: Output directory path.
        batch_size: Batch decoding size.
        dtype: Data type.
        beam_size: Beam size.
        ngpu: Number of GPUs.
        seed: Random number generator seed.
        lm_weight: Weight of the language model.
        nbest: Number of final hypotheses.
        num_workers: Number of workers.
        log_level: Verbosity level for logs.
        data_path_and_name_and_type: Path, name, and type of the decoding data.
        asr_train_config: ASR model training config path.
        asr_model_file: ASR model path.
        beam_search_config: Beam search config path.
        lm_train_config: Language model training config path.
        lm_file: Language model path.
        model_tag: Model tag.
        token_type: Type of token units.
        bpemodel: BPE model path.
        key_file: Key file path.
        allow_variable_data_keys: Whether to allow variable data keys.
        quantize_asr_model: Whether to apply dynamic quantization to the ASR model.
        quantize_modules: List of module names to apply dynamic quantization on.
        quantize_dtype: Dynamic quantization data type.
        streaming: Whether to perform chunk-by-chunk inference.
        chunk_size: Number of frames in a chunk AFTER subsampling.
        left_context: Number of frames in the left context AFTER subsampling.
        right_context: Number of frames in the right context AFTER subsampling.
        display_partial_hypotheses: Whether to display partial hypotheses.

    """
    assert check_argument_types()

    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text_kwargs = dict(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        beam_search_config=beam_search_config,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        dtype=dtype,
        beam_size=beam_size,
        lm_weight=lm_weight,
        nbest=nbest,
        quantize_asr_model=quantize_asr_model,
        quantize_modules=quantize_modules,
        quantize_dtype=quantize_dtype,
        streaming=streaming,
        chunk_size=chunk_size,
        left_context=left_context,
        right_context=right_context,
    )
    speech2text = Speech2Text.from_pretrained(
        model_tag=model_tag,
        **speech2text_kwargs,
    )

    # 3. Build data-iterator
    loader = ASRTransducerTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTransducerTask.build_preprocess_fn(
            speech2text.asr_train_args, False
        ),
        collate_fn=ASRTransducerTask.build_collate_fn(
            speech2text.asr_train_args, False
        ),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            # Single-utterance decoding: drop *_lengths and unbatch
            batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
            assert len(batch.keys()) == 1

            try:
                if speech2text.streaming:
                    speech = batch["speech"]
                    _steps = len(speech) // speech2text._raw_ctx
                    _end = 0
                    for i in range(_steps):
                        _end = (i + 1) * speech2text._raw_ctx
                        speech2text.streaming_decode(
                            speech[i * speech2text._raw_ctx : _end], is_final=False
                        )
                    final_hyps = speech2text.streaming_decode(
                        speech[_end : len(speech)], is_final=True
                    )
                else:
                    final_hyps = speech2text(**batch)

                results = speech2text.hypotheses_to_results(final_hyps)
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")
                hyp = Hypothesis(score=0.0, yseq=[], dec_state=None)
                results = [[" ", ["<space>"], [2], hyp]] * nbest

            key = keys[0]
            for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
                ibest_writer = writer[f"{n}best_recog"]

                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if text is not None:
                    ibest_writer["text"][key] = text
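The streaming branch above slices the waveform into fixed-size chunks of `speech2text._raw_ctx` samples and feeds the remainder with `is_final=True`; the chunk boundaries are easy to get wrong, so here is a minimal sketch of the same slicing with a stub decoder (the chunk size of 8 and the 20-sample "waveform" are arbitrary):

raw_ctx = 8               # stands in for speech2text._raw_ctx
speech = list(range(20))  # stands in for the waveform tensor

def streaming_decode(chunk, is_final=False):
    # Stub standing in for speech2text.streaming_decode
    print(f"chunk={chunk} is_final={is_final}")

_steps = len(speech) // raw_ctx  # 2 full chunks here
_end = 0
for i in range(_steps):
    _end = (i + 1) * raw_ctx
    streaming_decode(speech[i * raw_ctx : _end], is_final=False)
streaming_decode(speech[_end:], is_final=True)  # trailing 4 samples flush the decoder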