def build_vocoder_from_file(
    cls,
    vocoder_config_file: Union[Path, str] = None,
    vocoder_file: Union[Path, str] = None,
    model: Optional[ESPnetTTSModel] = None,
    device: str = "cpu",
):
    """Build a vocoder from a config file and/or a vocoder checkpoint.

    Args:
        vocoder_config_file: Optional path to a YAML file holding vocoder
            settings. Used as the Griffin-Lim configuration when
            ``vocoder_file`` is not given; otherwise forwarded to the
            pretrained vocoder wrapper.
        vocoder_file: Optional path to a vocoder checkpoint. Only files
            ending in ``.pkl`` are accepted.
        model: Optional TTS model. When it has a feature extractor, the
            extractor's parameters are merged into the Griffin-Lim
            configuration (overriding values read from the config file).
        device: Device the pretrained vocoder is moved to.

    Returns:
        A ``Spectrogram2Waveform`` instance, a pretrained vocoder moved to
        ``device``, or ``None`` when the Griffin-Lim configuration lacks
        ``n_fft``, ``n_shift`` or ``fs``.

    Raises:
        ValueError: If ``vocoder_file`` is given but does not end in ``.pkl``.
    """
    if vocoder_file is None:
        # If vocoder file is not provided, use griffin-lim as a vocoder
        vocoder_conf = {}
        if vocoder_config_file is not None:
            vocoder_config_file = Path(vocoder_config_file)
            with vocoder_config_file.open("r", encoding="utf-8") as f:
                # safe_load yields None for an empty document; keep a dict
                # so the membership checks and update() below stay valid.
                vocoder_conf = yaml.safe_load(f) or {}

        # `model` defaults to None, so it must be checked before use.
        if model is not None and model.feats_extract is not None:
            vocoder_conf.update(model.feats_extract.get_parameters())

        required_keys = ("n_fft", "n_shift", "fs")
        if all(key in vocoder_conf for key in required_keys):
            return Spectrogram2Waveform(**vocoder_conf)

        logging.warning("Vocoder is not available. Skipped its building.")
        return None

    if str(vocoder_file).endswith(".pkl"):
        # If the extension is ".pkl", the model is trained with parallel_wavegan
        vocoder = ParallelWaveGANPretrainedVocoder(vocoder_file, vocoder_config_file)
        return vocoder.to(device)

    raise ValueError(f"{vocoder_file} is not supported format.")
def build_vocoder_from_file(
    cls,
    vocoder_config_file: Union[Path, str] = None,
    vocoder_file: Union[Path, str] = None,
    model: Optional[ESPnetTTSModel] = None,
    device: str = "cpu",
):
    """Build a vocoder from a config file, a checkpoint, or a pretrained tag.

    Args:
        vocoder_config_file: Optional path to a YAML file holding vocoder
            settings. Used as the Griffin-Lim configuration when
            ``vocoder_file`` is not given; forwarded to the pretrained
            vocoder wrapper when a local ``.pkl`` checkpoint is given.
        vocoder_file: Optional path to a vocoder checkpoint. If the path
            does not exist, the value is treated as the tag of a pretrained
            ``parallel_wavegan`` model and downloaded.
        model: Optional TTS model. When it has a feature extractor, the
            extractor's parameters are merged into the Griffin-Lim
            configuration (overriding values read from the config file).
        device: Device the pretrained vocoder is moved to.

    Returns:
        A ``Spectrogram2Waveform`` instance, a pretrained vocoder moved to
        ``device``, or ``None`` when the Griffin-Lim configuration lacks
        ``n_fft``, ``n_shift`` or ``fs``.

    Raises:
        ImportError: If a pretrained tag is given but ``parallel_wavegan``
            is not installed.
        AssertionError: If the installed ``parallel_wavegan`` is not newer
            than 0.5.1.
        ValueError: If ``vocoder_file`` exists but does not end in ``.pkl``.
    """
    if vocoder_file is None:
        # If vocoder file is not provided, use griffin-lim as a vocoder
        vocoder_conf = {}
        if vocoder_config_file is not None:
            vocoder_config_file = Path(vocoder_config_file)
            with vocoder_config_file.open("r", encoding="utf-8") as f:
                # safe_load yields None for an empty document; keep a dict
                # so the membership checks and update() below stay valid.
                vocoder_conf = yaml.safe_load(f) or {}

        # `model` defaults to None, so it must be checked before use.
        if model is not None and model.feats_extract is not None:
            vocoder_conf.update(model.feats_extract.get_parameters())

        required_keys = ("n_fft", "n_shift", "fs")
        if all(key in vocoder_conf for key in required_keys):
            return Spectrogram2Waveform(**vocoder_conf)

        logging.warning("Vocoder is not available. Skipped its building.")
        return None

    if not Path(vocoder_file).exists():
        # Assume that vocoder file is the tag of pretrained model
        try:
            from parallel_wavegan.utils import download_pretrained_model
        except ImportError:
            logging.error(
                "`parallel_wavegan` is not installed. "
                "Please install via `pip install -U parallel_wavegan`."
            )
            raise

        from parallel_wavegan import __version__

        # NOTE(kan-bayashi): Filelock download is supported from 0.5.2
        assert LooseVersion(__version__) > LooseVersion("0.5.1"), (
            "Please install the latest parallel_wavegan "
            "via `pip install -U parallel_wavegan`."
        )
        logging.info(
            f"{vocoder_file} does not exist. "
            f"We assume that {vocoder_file} is tag of the pretrained model."
        )
        vocoder = ParallelWaveGANPretrainedVocoder(
            download_pretrained_model(vocoder_file)
        )
        return vocoder.to(device)

    if str(vocoder_file).endswith(".pkl"):
        # If the extension is ".pkl", the model is trained with parallel_wavegan
        vocoder = ParallelWaveGANPretrainedVocoder(vocoder_file, vocoder_config_file)
        return vocoder.to(device)

    raise ValueError(f"{vocoder_file} is not supported format.")
def __init__(
    self,
    train_config: Optional[Union[Path, str]],
    model_file: Optional[Union[Path, str]] = None,
    threshold: float = 0.5,
    minlenratio: float = 0.0,
    maxlenratio: float = 10.0,
    use_teacher_forcing: bool = False,
    use_att_constraint: bool = False,
    backward_window: int = 1,
    forward_window: int = 3,
    speed_control_alpha: float = 1.0,
    vocoder_conf: dict = None,
    dtype: str = "float32",
    device: str = "cpu",
):
    """Load a TTS model and prepare its decoding and vocoder settings.

    Args:
        train_config: Training config passed to
            ``TTSTask.build_model_from_file``.
        model_file: Model parameter file passed to
            ``TTSTask.build_model_from_file``.
        threshold: Decoding option, used only for Tacotron2 / Transformer.
        minlenratio: Decoding option, used only for Tacotron2 / Transformer.
        maxlenratio: Decoding option, used only for Tacotron2 / Transformer.
        use_teacher_forcing: Decoding option, always forwarded.
        use_att_constraint: Decoding option, used only for Tacotron2.
        backward_window: Decoding option, used only for Tacotron2.
        forward_window: Decoding option, used only for Tacotron2.
        speed_control_alpha: Forwarded as ``alpha`` for FastSpeech /
            FastSpeech2.
        vocoder_conf: Optional keyword arguments for
            ``Spectrogram2Waveform``. The dict is copied, so the caller's
            object is never modified.
        dtype: Name of the ``torch`` dtype the model is cast to.
        device: Device the model is built on.
    """
    assert check_argument_types()

    model, train_args = TTSTask.build_model_from_file(
        train_config, model_file, device
    )
    model.to(dtype=getattr(torch, dtype)).eval()

    self.device = device
    self.dtype = dtype
    self.train_args = train_args
    self.model = model
    self.tts = model.tts
    self.normalize = model.normalize
    self.feats_extract = model.feats_extract
    self.duration_calculator = DurationCalculator()
    self.preprocess_fn = TTSTask.build_preprocess_fn(train_args, False)
    self.use_teacher_forcing = use_teacher_forcing

    logging.info(f"Normalization:\n{self.normalize}")
    logging.info(f"TTS:\n{self.tts}")

    # Collect only the decoding options that apply to the loaded model type.
    decode_config = {}
    if isinstance(self.tts, (Tacotron2, Transformer)):
        decode_config.update(
            {
                "threshold": threshold,
                "maxlenratio": maxlenratio,
                "minlenratio": minlenratio,
            }
        )
    if isinstance(self.tts, Tacotron2):
        decode_config.update(
            {
                "use_att_constraint": use_att_constraint,
                "forward_window": forward_window,
                "backward_window": backward_window,
            }
        )
    if isinstance(self.tts, (FastSpeech, FastSpeech2)):
        decode_config.update({"alpha": speed_control_alpha})
    decode_config.update({"use_teacher_forcing": use_teacher_forcing})
    self.decode_config = decode_config

    # Work on a private copy: the feature-extractor parameters are merged in
    # below and must not leak back into the dict owned by the caller.
    vocoder_conf = {} if vocoder_conf is None else dict(vocoder_conf)
    if self.feats_extract is not None:
        vocoder_conf.update(self.feats_extract.get_parameters())

    required_keys = ("n_fft", "n_shift", "fs")
    if all(key in vocoder_conf for key in required_keys):
        self.spc2wav = Spectrogram2Waveform(**vocoder_conf)
        logging.info(f"Vocoder: {self.spc2wav}")
    else:
        self.spc2wav = None
        logging.info("Vocoder is not used because vocoder_conf is not sufficient")
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding.

    For every utterance yielded by the data iterator this writes, under
    ``output_dir``: normalized features (``norm/``), denormalized features
    (``denorm/``), an attention plot (``att_ws/{key}.png``), a stop-token
    probability plot (``probs/{key}.png``) and, when a vocoder could be
    configured, a waveform (``wav/{key}.wav``).

    Args:
        output_dir: Directory that receives all outputs.
        batch_size: Must be 1 or less; batch decoding is not implemented.
        dtype: Name of the ``torch`` dtype used for the model and the data.
        ngpu: 0 selects the CPU, 1 selects CUDA; larger values are rejected.
        seed: Random seed.
        num_workers: Forwarded to the streaming iterator.
        log_level: Logging level passed to ``logging.basicConfig``.
        data_path_and_name_and_type: Data specification forwarded to the
            streaming iterator.
        key_file: Forwarded to the streaming iterator.
        train_config: Training config used to build the model.
        model_file: Model parameter file used to build the model.
        threshold: Decoding option forwarded to ``tts.inference``.
        minlenratio: Decoding option forwarded to ``tts.inference``.
        maxlenratio: Decoding option forwarded to ``tts.inference``.
        use_att_constraint: Decoding option, forwarded for Tacotron2 only.
        backward_window: Decoding option, forwarded for Tacotron2 only.
        forward_window: Decoding option, forwarded for Tacotron2 only.
        allow_variable_data_keys: Forwarded to the streaming iterator.
        vocoder_conf: Keyword arguments for ``Spectrogram2Waveform``. The
            dict is copied, so the caller's object is never modified.

    Raises:
        NotImplementedError: If ``batch_size > 1`` or ``ngpu > 1``.
        RuntimeError: If the attention weights are neither 2- nor
            4-dimensional.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    model, train_args = TTSTask.build_model_from_file(train_config, model_file, device)
    model.to(dtype=getattr(torch, dtype)).eval()
    tts = model.tts
    normalize = model.normalize
    logging.info(f"Normalization:\n{normalize}")
    logging.info(f"TTS:\n{tts}")

    # 3. Build data-iterator
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(train_args, False),
        collate_fn=TTSTask.build_collate_fn(train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Build converter from spectrogram to waveform
    # Work on a private copy so the caller's dict is left untouched.
    vocoder_conf = dict(vocoder_conf)
    if model.feats_extract is not None:
        vocoder_conf.update(model.feats_extract.get_parameters())
    if all(key in vocoder_conf for key in ("n_fft", "n_shift", "fs")):
        spc2wav = Spectrogram2Waveform(**vocoder_conf)
        logging.info(f"Vocoder: {spc2wav}")
    else:
        spc2wav = None
        logging.info("Vocoder is not used because vocoder_conf is not sufficient")

    # The decoding options do not depend on the utterance; build them once.
    decode_conf = {
        "threshold": threshold,
        "maxlenratio": maxlenratio,
        "minlenratio": minlenratio,
    }
    if isinstance(tts, Tacotron2):
        decode_conf.update(
            {
                "use_att_constraint": use_att_constraint,
                "forward_window": forward_window,
                "backward_window": backward_window,
            }
        )

    # Lazy load to avoid the backend error (done once, not per utterance)
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator

    # 5. Start for-loop
    output_dir = Path(output_dir)
    for subdir in ("norm", "denorm", "wav", "att_ws", "probs"):
        (output_dir / subdir).mkdir(parents=True, exist_ok=True)

    with NpyScpWriter(
        output_dir / "norm",
        output_dir / "norm/feats.scp",
    ) as norm_writer, NpyScpWriter(
        output_dir / "denorm",
        output_dir / "denorm/feats.scp",
    ) as denorm_writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = to_device(batch, device)

            key = keys[0]
            # Change to single sequence and remove *_length
            # because inference() requires 1-seq, not mini-batch.
            _data = {
                k: v[0] for k, v in batch.items() if not k.endswith("_lengths")
            }

            start_time = time.perf_counter()
            outs, probs, att_ws = tts.inference(**_data, **decode_conf)
            insize = next(iter(_data.values())).size(0) + 1
            logging.info(
                "inference speed = {:.1f} frames / sec.".format(
                    int(outs.size(0)) / (time.perf_counter() - start_time)
                )
            )
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(f"output length reaches maximum length ({key}).")

            # The normalized features must be written before denormalizing.
            norm_writer[key] = outs.cpu().numpy()

            # NOTE: normalize.inverse is in-place operation
            outs_denorm = normalize.inverse(outs[None])[0][0]
            denorm_writer[key] = outs_denorm.cpu().numpy()

            # Plot attention weight
            att_ws = att_ws.cpu().numpy()
            if att_ws.ndim == 2:
                # Promote a single 2-D map to a 1 x 1 grid of maps.
                att_ws = att_ws[None][None]
            elif att_ws.ndim != 4:
                raise RuntimeError(f"Must be 2 or 4 dimension: {att_ws.ndim}")

            n_rows, n_cols = att_ws.shape[0], att_ws.shape[1]
            w, h = plt.figaspect(n_rows / n_cols)
            fig = plt.Figure(
                figsize=(
                    w * 1.3 * min(n_rows, 2.5),
                    h * 1.3 * min(n_cols, 2.5),
                )
            )
            fig.suptitle(f"{key}")
            # squeeze=False always returns a 2-D array of axes, so grids with
            # a single row or a single column iterate like any other grid.
            axes = fig.subplots(n_rows, n_cols, squeeze=False)
            for ax_row, att_row in zip(axes, att_ws):
                for ax_, att_w_ in zip(ax_row, att_row):
                    ax_.imshow(att_w_.astype(np.float32), aspect="auto")
                    ax_.set_xlabel("Input")
                    ax_.set_ylabel("Output")
                    ax_.xaxis.set_major_locator(MaxNLocator(integer=True))
                    ax_.yaxis.set_major_locator(MaxNLocator(integer=True))

            fig.tight_layout(rect=[0, 0.03, 1, 0.95])
            fig.savefig(output_dir / f"att_ws/{key}.png")
            fig.clf()

            # Plot stop token prediction
            probs = probs.cpu().numpy()
            fig = plt.Figure()
            ax = fig.add_subplot(1, 1, 1)
            ax.plot(probs)
            ax.set_title(f"{key}")
            ax.set_xlabel("Output")
            ax.set_ylabel("Stop probability")
            ax.set_ylim(0, 1)
            ax.grid(which="both")

            fig.tight_layout()
            fig.savefig(output_dir / f"probs/{key}.png")
            fig.clf()

            # TODO(kamo): Write scp
            if spc2wav is not None:
                wav = spc2wav(outs_denorm.cpu().numpy())
                sf.write(f"{output_dir}/wav/{key}.wav", wav, spc2wav.fs, "PCM_16")
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    threshold: float,
    minlenratio: float,
    maxlenratio: float,
    use_att_constraint: bool,
    backward_window: int,
    forward_window: int,
    allow_variable_data_keys: bool,
    vocoder_conf: dict,
):
    """Perform TTS model decoding.

    For every utterance yielded by the data iterator this writes, under
    ``output_dir``: normalized features (``norm/feats.ark`` / ``.scp``),
    denormalized features (``denorm/feats.ark`` / ``.scp``) and, when a
    vocoder could be configured, a waveform (``wav/{key}.wav``).

    Args:
        output_dir: Directory that receives all outputs.
        batch_size: Must be 1 or less; batch decoding is not implemented.
        dtype: Name of the ``torch`` dtype used for the model and the data.
        ngpu: 0 selects the CPU, 1 selects CUDA; larger values are rejected.
        seed: Random seed.
        num_workers: Forwarded to the streaming iterator.
        log_level: Logging level passed to ``logging.basicConfig``.
        data_path_and_name_and_type: Data specification forwarded to the
            streaming iterator.
        key_file: Forwarded to the streaming iterator.
        train_config: Training config used to build the model.
        model_file: Model parameter file used to build the model.
        threshold: Decoding option forwarded to ``tts.inference``.
        minlenratio: Decoding option forwarded to ``tts.inference``.
        maxlenratio: Decoding option forwarded to ``tts.inference``.
        use_att_constraint: Accepted but not used by this function.
        backward_window: Accepted but not used by this function.
        forward_window: Accepted but not used by this function.
        allow_variable_data_keys: Forwarded to the streaming iterator.
        vocoder_conf: Keyword arguments for ``Spectrogram2Waveform``. The
            dict is copied, so the caller's object is never modified.

    Raises:
        NotImplementedError: If ``batch_size > 1`` or ``ngpu > 1``.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    model, train_args = TTSTask.build_model_from_file(train_config, model_file, device)
    model.to(dtype=getattr(torch, dtype)).eval()
    tts = model.tts
    normalize = model.normalize
    logging.info(f"Normalization:\n{normalize}")
    logging.info(f"TTS:\n{tts}")

    # 3. Build data-iterator
    loader = TTSTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=TTSTask.build_preprocess_fn(train_args, False),
        collate_fn=TTSTask.build_collate_fn(train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Build converter from spectrogram to waveform
    # Work on a private copy so the caller's dict is left untouched.
    vocoder_conf = dict(vocoder_conf)
    if model.feats_extract is not None:
        vocoder_conf.update(model.feats_extract.get_parameters())
    if all(key in vocoder_conf for key in ("n_fft", "n_shift", "fs")):
        spc2wav = Spectrogram2Waveform(**vocoder_conf)
        logging.info(f"Vocoder: {spc2wav}")
    else:
        spc2wav = None
        logging.info("Vocoder is not used because vocoder_conf is not sufficient")

    # 5. Start for-loop
    output_dir = Path(output_dir)
    for subdir in ("norm", "denorm", "wav"):
        (output_dir / subdir).mkdir(parents=True, exist_ok=True)

    # FIXME(kamo): I think we shouldn't depend on kaldi-format any more.
    #  How about numpy or HDF5?
    #  >>> with NpyScpWriter() as f:
    norm_spec = "ark,scp:{o}.ark,{o}.scp".format(o=output_dir / "norm/feats")
    denorm_spec = "ark,scp:{o}.ark,{o}.scp".format(o=output_dir / "denorm/feats")
    with kaldiio.WriteHelper(norm_spec) as norm_writer, kaldiio.WriteHelper(
        denorm_spec
    ) as denorm_writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = to_device(batch, device)

            key = keys[0]
            # Change to single sequence and remove *_length
            # because inference() requires 1-seq, not mini-batch.
            _data = {
                k: v[0] for k, v in batch.items() if not k.endswith("_lengths")
            }

            start_time = time.perf_counter()
            # TODO(kamo): Now att_ws is not used.
            outs, probs, att_ws = tts.inference(
                **_data,
                threshold=threshold,
                maxlenratio=maxlenratio,
                minlenratio=minlenratio,
            )
            # Stop the clock right after decoding so that only the model's
            # inference is measured.
            elapsed = time.perf_counter() - start_time

            insize = next(iter(_data.values())).size(0)
            num_frames = int(outs.size(0))
            # seconds -> milliseconds, then divide by the number of frames;
            # max() avoids a ZeroDivisionError on an empty output.
            logging.info(
                "inference speed = {} msec / frame.".format(
                    elapsed * 1000 / max(num_frames, 1)
                )
            )
            logging.info(f"{key} (size:{insize}->{outs.size(0)})")
            if outs.size(0) == insize * maxlenratio:
                logging.warning(f"output length reaches maximum length ({key}).")

            # NOTE(review): normalize.inverse appears to operate in place
            # (to be confirmed), so the normalized features are written
            # before denormalizing.
            norm_writer[key] = outs.cpu().numpy()
            outs_denorm = normalize.inverse(outs[None])[0][0]
            denorm_writer[key] = outs_denorm.cpu().numpy()

            # TODO(kamo): Write scp
            if spc2wav is not None:
                wav = spc2wav(outs_denorm.cpu().numpy())
                sf.write(f"{output_dir}/wav/{key}.wav", wav, spc2wav.fs, "PCM_16")