def __init__(
    self,
    fs: Union[int, str] = 16000,
    frontend_conf: Optional[dict] = get_default_kwargs(Frontend),
    download_dir: str = None,
    multilayer_feature: bool = False,
):
    """Initialize the upstream-based frontend.

    Args:
        fs: Sampling rate in Hz, or a human-friendly string such as "16k".
        frontend_conf: Configuration forwarded to ``self._get_upstream``.
        download_dir: If given, directory used by ``torch.hub`` for downloads.
        multilayer_feature: Whether features from multiple upstream layers
            are used (consumed elsewhere in the class).
    """
    assert check_argument_types()
    super().__init__()

    # Accept human-friendly rates such as "16k".
    if isinstance(fs, str):
        fs = humanfriendly.parse_size(fs)

    if download_dir is not None:
        torch.hub.set_dir(download_dir)

    self.multilayer_feature = multilayer_feature
    self.upstream, self.featurizer = self._get_upstream(frontend_conf)

    # Disable layerdrop so feature extraction is deterministic.
    # BUG FIX: the original compared against "HuberModel" (typo), which made
    # this branch unreachable for HuBERT upstreams; it is "HubertModel".
    if getattr(
        self.upstream, "model", None
    ) is not None and self.upstream.model.__class__.__name__ in [
        "Wav2Vec2Model",
        "HubertModel",
    ]:
        self.upstream.model.encoder.layerdrop = 0.0

    # Snapshot of the pretrained weights so they can be restored later.
    self.pretrained_params = copy.deepcopy(self.upstream.state_dict())
    self.output_dim = self.featurizer.output_dim
def enh_s2t_config_file(tmp_path: Path, token_list):
    """Write a default EnhS2T configuration under ``tmp_path`` and return its path."""
    config_path = tmp_path / "enh_s2t" / "config.yaml"

    # Let the task itself emit its default configuration (dry run: no training).
    EnhS2TTask.main(
        cmd=[
            "--dry_run",
            "true",
            "--output_dir",
            str(tmp_path / "enh_s2t"),
            "--token_list",
            str(token_list),
            "--token_type",
            "char",
        ]
    )

    with open(config_path, "r") as f:
        args = yaml.safe_load(f)

    # When the stft encoder config was left empty, fill in its explicit defaults.
    if args["enh_encoder"] == "stft" and len(args["enh_encoder_conf"]) == 0:
        args["enh_encoder_conf"] = get_default_kwargs(STFTEncoder)

    with open(config_path, "w") as f:
        yaml_no_alias_safe_dump(args, f, indent=4, sort_keys=False)

    return config_path
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add TTS-task command-line arguments to ``parser``.

    Args:
        parser: Argument parser that receives task and preprocess options.
    """
    # NOTE(kamo): Use '_' instead of '-' to avoid confusion
    assert check_argument_types()
    group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead of it, do as
    required = parser.get_default("required")
    required += ["token_list"]

    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    group.add_argument(
        "--odim",
        type=int_or_none,
        default=None,
        help="The number of dimension of output feature",
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetTTSModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=False,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word"],
        help="The text will be tokenized " "in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece",
    )
    # NOTE(review): registered on ``parser`` rather than ``group`` — presumably
    # intentional (matches sibling tasks), but it falls outside the help groups.
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def __init__(
    self,
    fs: Union[int, str] = 16000,
    n_fft: int = 512,
    win_length: int = None,
    hop_length: int = 128,
    window: Optional[str] = "hann",
    center: bool = True,
    normalized: bool = False,
    onesided: bool = True,
    n_mels: int = 80,
    fmin: int = None,
    fmax: int = None,
    htk: bool = False,
    frontend_conf: Optional[dict] = get_default_kwargs(Frontend),
    apply_stft: bool = True,
):
    """Build the default frontend: STFT -> optional enhancement -> log-mel."""
    assert check_argument_types()
    super().__init__()

    # Accept human-friendly rates such as "16k".
    if isinstance(fs, str):
        fs = humanfriendly.parse_size(fs)

    # Deepcopy (In general, dict shouldn't be used as default arg)
    frontend_conf = copy.deepcopy(frontend_conf)
    self.hop_length = hop_length

    # STFT stage is optional; when disabled, raw input is passed downstream.
    self.stft = (
        Stft(
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            center=center,
            window=window,
            normalized=normalized,
            onesided=onesided,
        )
        if apply_stft
        else None
    )
    self.apply_stft = apply_stft

    # Optional speech-enhancement frontend operating on the spectrum bins.
    self.frontend = (
        Frontend(idim=n_fft // 2 + 1, **frontend_conf)
        if frontend_conf is not None
        else None
    )

    self.logmel = LogMel(
        fs=fs,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        htk=htk,
    )
    self.n_mels = n_mels
    self.frontend_type = "default"
def add_task_arguments(cls, parser: argparse.ArgumentParser): group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used # to provide --print_config mode. Instead of it, do as # required = parser.get_default("required") group.add_argument( "--init", type=lambda x: str_or_none(x.lower()), default=None, help="The initialization method", choices=[ "chainer", "xavier_uniform", "xavier_normal", "kaiming_uniform", "kaiming_normal", None, ], ) group.add_argument( "--model_conf", action=NestedDictAction, default=get_default_kwargs(ESPnetEnhancementModel), help="The keyword arguments for model class.", ) group.add_argument( "--criterions", action=NestedDictAction, default=[ { "name": "si_snr", "conf": {}, "wrapper": "fixed_order", "wrapper_conf": {}, }, ], help="The criterions binded with the loss wrappers.", ) group = parser.add_argument_group(description="Preprocess related") group.add_argument( "--use_preprocessor", type=str2bool, default=False, help="Apply preprocessing to data or not", ) for class_choices in cls.class_choices_list: # Append --<name> and --<name>_conf. # e.g. --encoder and --encoder_conf class_choices.add_arguments(group)
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add diarization-task command-line arguments to ``parser``.

    Args:
        parser: Argument parser that receives task and preprocess options.
    """
    group = parser.add_argument_group(description="Task related")
    group.add_argument(
        "--num_spk",
        type=int_or_none,
        default=None,
        # Typo fixed: "number fo speakers" -> "number of speakers"
        # (matches the --diar_num_spk wording used by the enh_s2t task).
        help="The number of speakers (for each recording) used in system training",
    )
    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--input_size",
        type=int_or_none,
        default=None,
        help="The number of input dimension of the feature",
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetDiarizationModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=True,
        help="Apply preprocessing to data or not",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def __init__(
    self,
    fs: Union[int, str] = 16000,
    frontend_conf: Optional[dict] = get_default_kwargs(Frontend),
    download_dir: str = None,
    multilayer_feature: bool = False,
):
    """S3PRL-based frontend: wraps an S3PRL upstream model and its featurizer.

    Args:
        fs: Sampling rate in Hz, or a human-friendly string such as "16k".
            S3PRL upstreams expect 16 kHz; other rates only trigger a warning.
        frontend_conf: Expects keys "upstream" (required) and optionally
            "path_or_url" and "tile_factor".
        download_dir: If given, directory used by S3PRL for model downloads.
        multilayer_feature: Whether features from multiple upstream layers
            are used (consumed elsewhere in the class).
    """
    # Import lazily so ESPnet remains usable without the optional S3PRL dep.
    try:
        import s3prl
        from s3prl.nn import Featurizer, S3PRLUpstream
    except Exception as e:
        print("Error: S3PRL is not properly installed.")
        print("Please install S3PRL: cd ${MAIN_ROOT}/tools && make s3prl.done")
        raise e

    assert check_argument_types()
    super().__init__()
    if isinstance(fs, str):
        fs = humanfriendly.parse_size(fs)
    if fs != 16000:
        logging.warning(
            "All the upstream models in S3PRL now only support 16 kHz audio."
        )

    if download_dir is not None:
        s3prl.util.download.set_dir(download_dir)

    # The requested upstream must be one S3PRL can actually build.
    assert frontend_conf.get("upstream", None) in S3PRLUpstream.available_names()
    upstream = S3PRLUpstream(
        frontend_conf.get("upstream"),
        path_or_url=frontend_conf.get("path_or_url", None),
    )
    upstream.eval()

    # Disable layerdrop so feature extraction is deterministic.
    if getattr(
        upstream, "model", None
    ) is not None and upstream.model.__class__.__name__ in [
        "Wav2Vec2Model",
        "HubertModel",
    ]:
        upstream.model.encoder.layerdrop = 0.0
    featurizer = Featurizer(upstream)

    self.multilayer_feature = multilayer_feature
    self.upstream, self.featurizer = upstream, featurizer
    # Snapshot of the pretrained weights so they can be restored later.
    self.pretrained_params = copy.deepcopy(self.upstream.state_dict())
    self.frontend_type = "s3prl"
    # Frame shift of the extracted features, in input samples.
    self.hop_length = self.featurizer.downsample_rate
    self.tile_factor = frontend_conf.get("tile_factor", 1)
def __init__(
    self,
    fs: Union[int, str] = 16000,
    frontend_conf: Optional[dict] = get_default_kwargs(Frontend),
    download_dir: str = None,
    multilayer_feature: bool = False,
):
    """Set up the upstream model and featurizer for this frontend."""
    assert check_argument_types()
    super().__init__()

    # Accept human-friendly rates such as "16k".
    if isinstance(fs, str):
        fs = humanfriendly.parse_size(fs)

    # Redirect torch.hub downloads when a cache directory is given.
    if download_dir is not None:
        torch.hub.set_dir(download_dir)

    self.multilayer_feature = multilayer_feature
    upstream, featurizer = self._get_upstream(frontend_conf)
    self.upstream = upstream
    self.featurizer = featurizer
    # Keep a snapshot of the pretrained weights so they can be restored later.
    self.pretrained_params = copy.deepcopy(upstream.state_dict())
    self.output_dim = featurizer.output_dim
def add_task_arguments(cls, parser: argparse.ArgumentParser): group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used # to provide --print_config mode. Instead of it, do as required = parser.get_default("required") group.add_argument( "--init", type=lambda x: str_or_none(x.lower()), default=None, help="The initialization method", choices=[ "chainer", "xavier_uniform", "xavier_normal", "kaiming_uniform", "kaiming_normal", None, ], ) group.add_argument( "--input_size", type=int_or_none, default=None, help="The number of input dimension of the feature", ) group.add_argument( "--model_conf", action=NestedDictAction, default=get_default_kwargs(HynetImgrModel), help="The keyword arguments for model class.", ) for class_choices in cls.class_choices_list: # Append --<name> and --<name>_conf. # e.g. --encoder and --encoder_conf class_choices.add_arguments(group)
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add ASR + curriculum-learning command-line arguments to ``parser``.

    Args:
        parser: Argument parser that receives task and preprocess options.
    """
    group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead of it, do as
    required = parser.get_default("required")
    required += ["token_list"]

    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--input_size",
        type=int_or_none,
        default=None,
        help="The number of input dimension of the feature",
    )
    group.add_argument(
        "--ctc_conf",
        action=NestedDictAction,
        default=get_default_kwargs(CTC),
        help="The keyword arguments for CTC class.",
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetASRModel),
        help="The keyword arguments for model class.",
    )
    # BUG FIX: was type=str with a bool default, so any non-empty string
    # (including "false") was truthy downstream; str2bool is the file-wide
    # convention for boolean flags.
    group.add_argument(
        "--use_curriculum",
        type=str2bool,
        default=False,
        help="Use curriculum learning algorithms.",
    )
    group.add_argument(
        "--curriculum_algo",
        type=str,
        default="exp3s",
        help="MAB algorithm to use for curriculum learning",
    )
    group.add_argument(
        "--gain_type",
        type=str,
        default="PG",
        help="Loss-based gain for Curriculum Learning. "
        "Prediction Gain (PG) or Self-Prediction Gain (SPG)",
    )
    # BUG FIX: argparse's type=bool converts any non-empty string to True
    # ("--refill_task false" would enable it); use str2bool instead.
    group.add_argument(
        "--refill_task",
        type=str2bool,
        default=True,
        help="Refill task after the task is exhausted",
    )
    group.add_argument(
        "--gen_log_dir",
        type=str,
        default="curriculum_log",
        help="Directory to store curriculum generator logs",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=True,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word", "phn"],
        help="The text will be tokenized " "in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece",
    )
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply text cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=[None, "g2p_en", "pyopenjtalk", "pyopenjtalk_kana"],
        default=None,
        help="Specify g2p method if --token_type=phn",
    )
    parser.add_argument(
        "--speech_volume_normalize",
        type=float_or_none,
        default=None,
        help="Scale the maximum amplitude to the given value.",
    )
    parser.add_argument(
        "--rir_scp",
        type=str_or_none,
        default=None,
        help="The file path of rir scp file.",
    )
    # Typo fixed in help text: "THe" -> "The".
    parser.add_argument(
        "--rir_apply_prob",
        type=float,
        default=1.0,
        help="The probability for applying RIR convolution.",
    )
    parser.add_argument(
        "--noise_scp",
        type=str_or_none,
        default=None,
        help="The file path of noise scp file.",
    )
    parser.add_argument(
        "--noise_apply_prob",
        type=float,
        default=1.0,
        help="The probability applying Noise adding.",
    )
    parser.add_argument(
        "--noise_db_range",
        type=str,
        default="13_15",
        help="The range of noise decibel level.",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add joint enhancement + S2T task command-line arguments to ``parser``.

    Args:
        parser: Argument parser that receives task and preprocess options.
    """
    group = parser.add_argument_group(description="Task related")
    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    group.add_argument(
        "--src_token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token (for source language)",
    )
    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--input_size",
        type=int_or_none,
        default=None,
        help="The number of input dimension of the feature",
    )
    group.add_argument(
        "--ctc_conf",
        action=NestedDictAction,
        default=get_default_kwargs(CTC),
        help="The keyword arguments for CTC class.",
    )
    # Default enhancement criterion: SI-SNR with a fixed-order loss wrapper.
    group.add_argument(
        "--enh_criterions",
        action=NestedDictAction,
        default=[
            {
                "name": "si_snr",
                "conf": {},
                "wrapper": "fixed_order",
                "wrapper_conf": {},
            },
        ],
        help="The criterions binded with the loss wrappers.",
    )
    group.add_argument(
        "--diar_num_spk",
        type=int_or_none,
        default=None,
        help="The number of speakers (for each recording) for diar submodel class",
    )
    group.add_argument(
        "--diar_input_size",
        type=int_or_none,
        default=None,
        help="The number of input dimension of the feature",
    )
    group.add_argument(
        "--enh_model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetEnhancementModel),
        help="The keyword arguments for enh submodel class.",
    )
    group.add_argument(
        "--asr_model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetASRModel),
        help="The keyword arguments for asr submodel class.",
    )
    # NOTE(review): the st submodel defaults are taken from
    # ESPnetEnhancementModel, not an ST model class — looks like a
    # copy-paste; confirm whether ESPnetSTModel defaults were intended.
    group.add_argument(
        "--st_model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetEnhancementModel),
        help="The keyword arguments for st submodel class.",
    )
    group.add_argument(
        "--diar_model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetDiarizationModel),
        help="The keyword arguments for diar submodel class.",
    )
    group.add_argument(
        "--subtask_series",
        type=str,
        nargs="+",
        default=("enh", "asr"),
        choices=["enh", "asr", "st", "diar"],
        help="The series of subtasks in the pipeline.",
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetEnhS2TModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=False,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word", "phn"],
        help="The text will be tokenized " "in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece",
    )
    group.add_argument(
        "--src_token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word", "phn"],
        help="The source text will be tokenized " "in the specified level token",
    )
    group.add_argument(
        "--src_bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece (for source language)",
    )
    group.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    group.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply text cleaning",
    )
    group.add_argument(
        "--g2p",
        type=str_or_none,
        choices=g2p_choices,
        default=None,
        help="Specify g2p method if --token_type=phn",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add semi-supervised ASR (pseudo-label) command-line arguments to ``parser``.

    Args:
        parser: Argument parser that receives task and preprocess options.
    """
    group = parser.add_argument_group(description="Task related")
    required = parser.get_default("required")
    required += ["token_list"]

    # Extra data streams holding pseudo-labeled training data.
    group.add_argument(
        "--train_pseudo_shape_file", type=str, action="append", default=[]
    )
    group.add_argument(
        "--train_pseudo_data_path_and_name_and_type",
        type=str2triple_str,
        action="append",
        default=[],
        help="Give three words splitted by comma. It's used for the training data.",
    )
    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    group.add_argument(
        "--pis_ratio",
        type=float,
        default=1,
        help="The multi-iteration ratio for SSL",
    )
    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--input_size",
        type=int_or_none,
        default=None,
        help="The number of input dimension of the feature",
    )
    group.add_argument(
        "--ctc_conf",
        action=NestedDictAction,
        default=get_default_kwargs(CTC),
        help="The keyword arguments for CTC class.",
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetASRModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=True,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word", "phn"],
        help="The text will be tokenized " "in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece",
    )
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply text cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=[None, "g2p_en", "pyopenjtalk", "pyopenjtalk_kana"],
        default=None,
        help="Specify g2p method if --token_type=phn",
    )
    parser.add_argument(
        "--speech_volume_normalize",
        type=float_or_none,
        default=None,
        help="Scale the maximum amplitude to the given value.",
    )
    parser.add_argument(
        "--rir_scp",
        type=str_or_none,
        default=None,
        help="The file path of rir scp file.",
    )
    parser.add_argument(
        "--rir_apply_prob",
        type=float,
        default=1.0,
        help="THe probability for applying RIR convolution.",
    )
    parser.add_argument(
        "--noise_scp",
        type=str_or_none,
        default=None,
        help="The file path of noise scp file.",
    )
    parser.add_argument(
        "--noise_apply_prob",
        type=float,
        default=1.0,
        help="The probability applying Noise adding.",
    )
    parser.add_argument(
        "--noise_db_range",
        type=str,
        default="13_15",
        help="The range of noise decibel level.",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    rnn_type: str = "lstm",
    num_layers: int = 1,
    hidden_size: int = 320,
    sampling_probability: float = 0.0,
    dropout: float = 0.0,
    context_residual: bool = False,
    replace_sos: bool = False,
    num_encs: int = 1,
    att_conf: dict = get_default_kwargs(build_attention_list),
):
    """RNN (LSTM/GRU) attention decoder.

    Args:
        vocab_size: Output vocabulary size; the last index doubles as sos/eos.
        encoder_output_size: Dimension of encoder projections fed to attention.
        rnn_type: "lstm" or "gru"; anything else raises ValueError.
        num_layers: Number of stacked decoder cells.
        hidden_size: Hidden (and embedding) dimension of each cell.
        sampling_probability: Scheduled-sampling probability (used elsewhere).
        dropout: Dropout rate for embeddings and vertical connections.
        context_residual: If True, concatenate attention context to the output
            layer input.
        replace_sos: Replace <sos> per target language (multilingual MT).
        num_encs: Number of encoders feeding this decoder.
        att_conf: Keyword arguments for ``build_attention_list``.
    """
    # FIXME(kamo): The parts of num_spk should be refactored more more more
    assert check_argument_types()
    if rnn_type not in {"lstm", "gru"}:
        raise ValueError(f"Not supported: rnn_type={rnn_type}")

    super().__init__()
    eprojs = encoder_output_size
    self.encoder_output_size = eprojs
    self.dtype = rnn_type
    self.dunits = hidden_size
    self.dlayers = num_layers
    self.context_residual = context_residual
    # sos and eos share the last vocabulary index.
    self.sos = vocab_size - 1
    self.eos = vocab_size - 1
    self.odim = vocab_size
    self.sampling_probability = sampling_probability
    self.dropout = dropout
    self.num_encs = num_encs

    # for multilingual translation
    self.replace_sos = replace_sos

    self.embed = torch.nn.Embedding(vocab_size, hidden_size)
    self.dropout_emb = torch.nn.Dropout(p=dropout)

    self.decoder = torch.nn.ModuleList()
    self.dropout_dec = torch.nn.ModuleList()
    # First cell consumes the embedding concatenated with the attention context.
    self.decoder += [
        torch.nn.LSTMCell(hidden_size + eprojs, hidden_size)
        if self.dtype == "lstm"
        else torch.nn.GRUCell(hidden_size + eprojs, hidden_size)
    ]
    self.dropout_dec += [torch.nn.Dropout(p=dropout)]
    # Remaining cells are hidden-to-hidden.
    for _ in range(1, self.dlayers):
        self.decoder += [
            torch.nn.LSTMCell(hidden_size, hidden_size)
            if self.dtype == "lstm"
            else torch.nn.GRUCell(hidden_size, hidden_size)
        ]
        self.dropout_dec += [torch.nn.Dropout(p=dropout)]
        # NOTE: dropout is applied only for the vertical connections
        # see https://arxiv.org/pdf/1409.2329.pdf
    if context_residual:
        self.output = torch.nn.Linear(hidden_size + eprojs, vocab_size)
    else:
        self.output = torch.nn.Linear(hidden_size, vocab_size)

    self.att_list = build_attention_list(
        eprojs=eprojs, dunits=hidden_size, **att_conf
    )
def get_parser():
    """Get argument parser for TTS decoding."""
    parser = config_argparse.ArgumentParser(
        description="TTS Decode",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Note(kamo): Use "_" instead of "-" as separator.
    # "-" is confusing if written in yaml.
    parser.add_argument(
        "--log_level",
        type=lambda x: x.upper(),
        default="INFO",
        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
        help="The verbose level of logging",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The path of output directory",
    )
    parser.add_argument(
        "--ngpu",
        type=int,
        default=0,
        help="The number of gpus. 0 indicates CPU mode",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Random seed",
    )
    parser.add_argument(
        "--dtype",
        default="float32",
        choices=["float16", "float32", "float64"],
        help="Data type",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=1,
        help="The number of workers used for DataLoader",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=1,
        help="The batch size for inference",
    )

    group = parser.add_argument_group("Input data related")
    group.add_argument(
        "--data_path_and_name_and_type",
        type=str2triple_str,
        required=True,
        action="append",
    )
    group.add_argument(
        "--key_file",
        type=str_or_none,
    )
    group.add_argument(
        "--allow_variable_data_keys",
        type=str2bool,
        default=False,
    )

    group = parser.add_argument_group("The model configuration related")
    group.add_argument(
        "--train_config",
        type=str,
        help="Training configuration file.",
    )
    group.add_argument(
        "--model_file",
        type=str,
        help="Model parameter file.",
    )

    group = parser.add_argument_group("Decoding related")
    group.add_argument(
        "--maxlenratio",
        type=float,
        default=10.0,
        help="Maximum length ratio in decoding",
    )
    group.add_argument(
        "--minlenratio",
        type=float,
        default=0.0,
        help="Minimum length ratio in decoding",
    )
    group.add_argument(
        "--threshold",
        type=float,
        default=0.5,
        help="Threshold value in decoding",
    )
    group.add_argument(
        "--use_att_constraint",
        type=str2bool,
        default=False,
        help="Whether to use attention constraint",
    )
    group.add_argument(
        "--backward_window",
        type=int,
        default=1,
        help="Backward window value in attention constraint",
    )
    group.add_argument(
        "--forward_window",
        type=int,
        default=3,
        help="Forward window value in attention constraint",
    )
    group.add_argument(
        "--use_teacher_forcing",
        type=str2bool,
        default=False,
        help="Whether to use teacher forcing",
    )
    # NOTE(review): registered on ``parser`` rather than the "Decoding
    # related" group — kept as in the original; confirm intended grouping.
    parser.add_argument(
        "--speed_control_alpha",
        type=float,
        default=1.0,
        help="Alpha in FastSpeech to change the speed of generated speech",
    )

    # Typo fixed in user-facing help: "Grriffin-Lim" -> "Griffin-Lim".
    group = parser.add_argument_group("Griffin-Lim related")
    group.add_argument(
        "--vocoder_conf",
        action=NestedDictAction,
        default=get_default_kwargs(Spectrogram2Waveform),
        help="The configuration for Griffin-Lim",
    )
    return parser
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add speech-translation-task command-line arguments to ``parser``.

    Args:
        parser: Argument parser that receives task and preprocess options.
    """
    group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead of it, do as
    required = parser.get_default("required")
    required += ["src_token_list", "token_list"]

    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token (for target language)",
    )
    group.add_argument(
        "--src_token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token (for source language)",
    )
    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--input_size",
        type=int_or_none,
        default=None,
        help="The number of input dimension of the feature",
    )
    group.add_argument(
        "--ctc_conf",
        action=NestedDictAction,
        default=get_default_kwargs(CTC),
        help="The keyword arguments for CTC class.",
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetSTModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=True,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word", "phn"],
        help="The target text will be tokenized " "in the specified level token",
    )
    group.add_argument(
        "--src_token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word", "phn"],
        help="The source text will be tokenized " "in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece (for target language)",
    )
    group.add_argument(
        "--src_bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece (for source language)",
    )
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply text cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=g2p_choices,
        default=None,
        help="Specify g2p method if --token_type=phn",
    )
    parser.add_argument(
        "--speech_volume_normalize",
        type=float_or_none,
        default=None,
        help="Scale the maximum amplitude to the given value.",
    )
    parser.add_argument(
        "--rir_scp",
        type=str_or_none,
        default=None,
        help="The file path of rir scp file.",
    )
    parser.add_argument(
        "--rir_apply_prob",
        type=float,
        default=1.0,
        help="THe probability for applying RIR convolution.",
    )
    parser.add_argument(
        "--noise_scp",
        type=str_or_none,
        default=None,
        help="The file path of noise scp file.",
    )
    parser.add_argument(
        "--noise_apply_prob",
        type=float,
        default=1.0,
        help="The probability applying Noise adding.",
    )
    parser.add_argument(
        "--noise_db_range",
        type=str,
        default="13_15",
        help="The range of noise decibel level.",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add TTS-task command-line arguments to ``parser``.

    Args:
        parser: Argument parser that receives task and preprocess options.
    """
    # NOTE(kamo): Use '_' instead of '-' to avoid confusion
    assert check_argument_types()
    group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead of it, do as
    required = parser.get_default("required")
    required += ["token_list"]

    # NOTE(review): several help strings below say "train" where sibling
    # tasks say "text" — presumably a search/replace artifact; confirm.
    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A train mapping int-id to token",
    )
    group.add_argument(
        "--odim",
        type=int_or_none,
        default=None,
        help="The number of dimension of output feature",
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetTTSModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=True,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="phn",
        choices=["bpe", "char", "word", "phn"],
        help="The train will be tokenized in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece",
    )
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply train cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=[
            None,
            "g2p_en",
            "g2p_en_no_space",
            "pyopenjtalk",
            "pyopenjtalk_kana",
            "pyopenjtalk_accent",
            "pyopenjtalk_accent_with_pause",
            "pypinyin_g2p",
            "pypinyin_g2p_phone",
            "espeak_ng_arabic",
        ],
        default=None,
        help="Specify g2p method if --token_type=phn",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def test_get_defaut_kwargs(func, desired: Any):
    """Check that ``get_default_kwargs`` extracts the expected default mapping."""
    actual = get_default_kwargs(func)
    assert actual == desired
def add_task_arguments(cls, parser: argparse.ArgumentParser): group = parser.add_argument_group(description="Task related") # NOTE(kamo): add_arguments(..., required=True) can't be used # to provide --print_config mode. Instead of it, do as required = parser.get_default("required") group.add_argument( "--model_conf", action=NestedDictAction, default=get_default_kwargs(CepNet), help="The keyword arguments for model class.", ) group.add_argument( "--init", type=lambda x: str_or_none(x.lower()), default=None, help="The initialization method", choices=[ "chainer", "xavier_uniform", "xavier_normal", "kaiming_uniform", "kaiming_normal", None, ], ) group.add_argument( "--input_size", type=int_or_none, default=None, help="The number of input dimension of the feature", ) group = parser.add_argument_group(description="Preprocess related") group.add_argument( "--use_preprocessor", type=str2bool, default=True, help="Apply preprocessing to data or not", ) parser.add_argument( "--speech_volume_normalize", type=float_or_none, default=None, help="Scale the maximum amplitude to the given value.", ) parser.add_argument( "--rir_scp", type=str_or_none, default=None, help="The file path of rir scp file.", ) parser.add_argument( "--rir_apply_prob", type=float, default=1.0, help="THe probability for applying RIR convolution.", ) parser.add_argument( "--noise_scp", type=str_or_none, default=None, help="The file path of noise scp file.", ) parser.add_argument( "--noise_apply_prob", type=float, default=1.0, help="The probability applying Noise adding.", ) parser.add_argument( "--noise_db_range", type=str, default="13_15", help="The range of noise decibel level.", ) for class_choices in cls.class_choices_list: # Append --<name> and --<name>_conf. # e.g. --encoder and --encoder_conf class_choices.add_arguments(group)
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add LM-task-specific command line arguments to ``parser``.

    Adds a "Task related" group (token list, init, model configuration)
    and a "Preprocess related" group (tokenization, cleaner, g2p), then
    lets every registered ``ClassChoices`` append its own ``--<name>`` /
    ``--<name>_conf`` pair.

    Returns:
        The same ``parser`` instance, for chaining.
    """
    # NOTE(kamo): Use '_' instead of '-' to avoid confusion
    assert check_argument_types()
    group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead of it, do as
    required = parser.get_default("required")
    required += ["token_list"]

    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetLanguageModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=True,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word"],
        # FIX: help was an empty string
        help="The text will be tokenized in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        # FIX: typo "file fo sentencepiece"
        help="The model file of sentencepiece",
    )
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply text cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=[
            None,
            "g2p_en",
            "g2p_en_no_space",
            "pyopenjtalk",
            "pyopenjtalk_kana",
            "pyopenjtalk_accent",
            "pyopenjtalk_accent_with_pause",
            "pypinyin_g2p",
            "pypinyin_g2p_phone",
            "espeak_ng_arabic",
            "espeak_ng_german",
            "espeak_ng_french",
            "espeak_ng_spanish",
            "espeak_ng_russian",
            "g2pk",
            "g2pk_no_space",
        ],
        default=None,
        help="Specify g2p method if --token_type=phn",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)

    assert check_return_type(parser)
    return parser
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Register joint Enh+ASR task options on ``parser``.

    Two argument groups are populated: "Task related" (token list,
    initialization, CTC / ASR / Enh model configurations) and
    "Preprocess related" (tokenization, cleaner, g2p).  Finally each
    entry of ``cls.class_choices_list`` contributes its own
    ``--<name>`` / ``--<name>_conf`` pair.
    """
    task_group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead of it, do as
    required = parser.get_default("required")
    required += ["token_list"]

    task_group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    task_group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    task_group.add_argument(
        "--input_size",
        type=int_or_none,
        default=None,
        help="The number of input dimension of the feature",
    )
    task_group.add_argument(
        "--ctc_conf",
        action=NestedDictAction,
        default=get_default_kwargs(CTC),
        help="The keyword arguments for CTC class.",
    )
    task_group.add_argument(
        "--asr_model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetASRModel),
        help="The keyword arguments for model class.",
    )
    task_group.add_argument(
        "--enh_model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetEnhancementModel),
        help="The keyword arguments for model class.",
    )

    preprocess_group = parser.add_argument_group(
        description="Preprocess related"
    )
    preprocess_group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=False,
        help="Apply preprocessing to data or not",
    )
    preprocess_group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word", "phn"],
        help="The text will be tokenized in the specified level token",
    )
    preprocess_group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece",
    )
    # The remaining options are attached to the parser itself rather
    # than to a group (matching the file's convention).
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply text cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=[None, "g2p_en", "pyopenjtalk", "pyopenjtalk_kana"],
        default=None,
        help="Specify g2p method if --token_type=phn",
    )

    # Append --<name> and --<name>_conf.
    # e.g. --encoder and --encoder_conf
    for class_choices in cls.class_choices_list:
        class_choices.add_arguments(preprocess_group)
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add enhancement-task-specific command line arguments to ``parser``.

    Adds a "Task related" group (init, model configuration, criterions)
    and a "Preprocess related" group (volume normalization, RIR/noise
    augmentation, speaker/noise counts), then lets every registered
    ``ClassChoices`` append its own ``--<name>`` / ``--<name>_conf``
    pair.
    """
    group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead of it, do as
    # required = parser.get_default("required")

    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetEnhancementModel),
        help="The keyword arguments for model class.",
    )
    group.add_argument(
        "--criterions",
        action=NestedDictAction,
        default=[
            {
                "name": "si_snr",
                "conf": {},
                "wrapper": "fixed_order",
                "wrapper_conf": {},
            },
        ],
        # FIX: grammar ("binded" -> "bound")
        help="The criterions bound with the loss wrappers.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=False,
        help="Whether to apply preprocessing to data or not",
    )
    group.add_argument(
        "--speech_volume_normalize",
        type=str_or_none,
        default=None,
        help="Scale the maximum amplitude to the given value or range. "
        "e.g. --speech_volume_normalize 1.0 scales it to 1.0.\n"
        "--speech_volume_normalize 0.5_1.0 scales it to a random number in "
        "the range [0.5, 1.0)",
    )
    group.add_argument(
        "--rir_scp",
        type=str_or_none,
        default=None,
        help="The file path of rir scp file.",
    )
    group.add_argument(
        "--rir_apply_prob",
        type=float,
        default=1.0,
        # FIX: help typo "THe probability ..."
        help="The probability for applying RIR convolution.",
    )
    group.add_argument(
        "--noise_scp",
        type=str_or_none,
        default=None,
        help="The file path of noise scp file.",
    )
    group.add_argument(
        "--noise_apply_prob",
        type=float,
        default=1.0,
        # FIX: grammar of "The probability applying Noise adding."
        help="The probability for applying noise addition.",
    )
    group.add_argument(
        "--noise_db_range",
        type=str,
        default="13_15",
        help="The range of signal-to-noise ratio (SNR) level in decibel.",
    )
    group.add_argument(
        "--short_noise_thres",
        type=float,
        default=0.5,
        help="If len(noise) / len(speech) is smaller than this threshold during "
        "dynamic mixing, a warning will be displayed.",
    )
    group.add_argument(
        "--use_reverberant_ref",
        type=str2bool,
        default=False,
        help="Whether to use reverberant speech references "
        "instead of anechoic ones",
    )
    group.add_argument(
        "--num_spk",
        type=int,
        default=1,
        help="Number of speakers in the input signal.",
    )
    group.add_argument(
        "--num_noise_type",
        type=int,
        default=1,
        help="Number of noise types.",
    )
    group.add_argument(
        "--sample_rate",
        type=int,
        default=8000,
        help="Sampling rate of the data (in Hz).",
    )
    group.add_argument(
        "--force_single_channel",
        type=str2bool,
        default=False,
        help="Whether to force all data to be single-channel.",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add LM-task-specific command line arguments to ``parser``.

    Adds a "Task related" group (token list, init, model configuration)
    and a "Preprocess related" group (tokenization), then lets every
    registered ``ClassChoices`` append its own ``--<name>`` /
    ``--<name>_conf`` pair.

    Returns:
        The same ``parser`` instance, for chaining.
    """
    # NOTE(kamo): Use '_' instead of '-' to avoid confusion
    assert check_argument_types()
    group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead of it, do as
    required = parser.get_default("required")
    required += ["token_list"]

    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetLanguageModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=False,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word"],
        # FIX: help was an empty string
        help="The text will be tokenized in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        # FIX: typo "file fo sentencepiece"
        help="The model file of sentencepiece",
    )
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)

    assert check_return_type(parser)
    return parser
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    """Add Transducer task arguments.

    Args:
        cls: ASRTransducerTask object.
        parser: Transducer arguments parser.
    """
    group = parser.add_argument_group(description="Task related")

    required = parser.get_default("required")
    required += ["token_list"]

    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    group.add_argument(
        "--input_size",
        type=int_or_none,
        default=None,
        help="The number of input dimension of the feature",
    )
    group.add_argument(
        "--init",
        type=lambda x: str_or_none(x.lower()),
        default=None,
        help="The initialization method",
        choices=[
            "chainer",
            "chainer_espnet1",
            "xavier_uniform",
            "xavier_normal",
            "kaiming_uniform",
            "kaiming_normal",
            None,
        ],
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetASRTransducerModel),
        help="The keyword arguments for model class.",
    )
    group.add_argument(
        "--encoder_conf",
        action=NestedDictAction,
        default={},
        help="The keyword arguments for encoder class.",
    )
    group.add_argument(
        "--joint_network_conf",
        action=NestedDictAction,
        default={},
        help="The keyword arguments for joint network class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=True,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="bpe",
        choices=["bpe", "char", "word", "phn"],
        help="The text will be tokenized in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece",
    )
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply text cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=g2p_choices,
        default=None,
        help="Specify g2p method if --token_type=phn",
    )
    parser.add_argument(
        "--speech_volume_normalize",
        type=float_or_none,
        default=None,
        help="Scale the maximum amplitude to the given value.",
    )
    parser.add_argument(
        "--rir_scp",
        type=str_or_none,
        default=None,
        help="The file path of rir scp file.",
    )
    parser.add_argument(
        "--rir_apply_prob",
        type=float,
        default=1.0,
        help="The probability for applying RIR convolution.",
    )
    parser.add_argument(
        "--noise_scp",
        type=str_or_none,
        default=None,
        help="The file path of noise scp file.",
    )
    parser.add_argument(
        "--noise_apply_prob",
        type=float,
        default=1.0,
        # FIX: grammar of "The probability applying Noise adding."
        help="The probability for applying noise addition.",
    )
    parser.add_argument(
        "--noise_db_range",
        type=str,
        default="13_15",
        help="The range of noise decibel level.",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --decoder and --decoder_conf
        class_choices.add_arguments(group)