def __init__(self,
             name: str,
             encoders: List[TemporalStateful],
             vocabulary: Vocabulary,
             data_id: str,
             max_output_len: int = None,
             hidden_dim: int = None,
             activation: Callable = tf.nn.relu,
             dropout_keep_prob: float = 1.0,
             add_start_symbol: bool = False,
             add_end_symbol: bool = False,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    check_argument_types()
    ModelPart.__init__(self, name, reuse, save_checkpoint,
                       load_checkpoint, initializers)

    self.encoders = encoders
    self.vocabulary = vocabulary
    self.data_id = data_id
    self.max_output_len = max_output_len
    self.hidden_dim = hidden_dim
    self.activation = activation
    self.dropout_keep_prob = dropout_keep_prob
    self.add_start_symbol = add_start_symbol
    self.add_end_symbol = add_end_symbol
def beam_search_runner_range(
        output_series: str,
        decoder: BeamSearchDecoder,
        max_rank: int = None,
        postprocess: Callable[[List[str]], List[str]] = None) -> List[
            BeamSearchRunner]:
    """Return beam search runners for a range of ranks from 1 to max_rank.

    This means there are max_rank output series where the n-th series
    contains the n-th best hypothesis from the beam search.

    Args:
        output_series: Prefix of output series.
        decoder: Beam search decoder shared by all runners.
        max_rank: Maximum rank of the hypotheses.
        postprocess: Series-level postprocess applied on output.

    Returns:
        List of beam search runners getting hypotheses with rank from 1 to
        max_rank.
    """
    check_argument_types()

    if max_rank is None:
        max_rank = decoder.beam_size

    if max_rank > decoder.beam_size:
        raise ValueError(
            ("The maximum rank ({}) cannot be "
             "bigger than beam size {}.").format(
                 max_rank, decoder.beam_size))

    return [BeamSearchRunner("{}.rank{:03d}".format(output_series, r),
                             decoder, r, postprocess)
            for r in range(1, max_rank + 1)]
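# Hypothetical usage sketch (not part of the original source; `bs_decoder`
# stands for an already-configured BeamSearchDecoder and the series prefix
# is illustrative):
#
#     runners = beam_search_runner_range("target", bs_decoder, max_rank=5)
#     # -> five runners writing series "target.rank001" .. "target.rank005"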
def __init__(self,
             name: str,
             input_sequence: Attendable,
             hidden_size: int,
             num_heads: int,
             output_size: int = None,
             state_proj_size: int = None,
             dropout_keep_prob: float = 1.0,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Initialize an instance of the encoder."""
    check_argument_types()
    ModelPart.__init__(self, name, reuse, save_checkpoint,
                       load_checkpoint, initializers)

    self.input_sequence = input_sequence
    self.hidden_size = hidden_size
    self.num_heads = num_heads
    self.output_size = output_size
    self.state_proj_size = state_proj_size
    self.dropout_keep_prob = dropout_keep_prob

    if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
        raise ValueError("Dropout keep prob must be inside (0,1].")
def __init__(self,
             name: str,
             encoder: TemporalStateful,
             vocabulary: Vocabulary,
             data_id: str,
             max_length: int = None,
             merge_repeated_targets: bool = False,
             merge_repeated_outputs: bool = True,
             beam_width: int = 1,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    check_argument_types()
    ModelPart.__init__(self, name, reuse, save_checkpoint,
                       load_checkpoint, initializers)

    self.encoder = encoder
    self.vocabulary = vocabulary
    self.data_id = data_id
    self.max_length = max_length
    self.merge_repeated_targets = merge_repeated_targets
    self.merge_repeated_outputs = merge_repeated_outputs
    self.beam_width = beam_width
def __init__(self,
             name: str,
             encoders: List[TemporalStateful],
             embedded_sequence: EmbeddedSequence,
             data_id: str,
             max_output_len: int = None,
             hidden_dim: int = None,
             activation: Callable = tf.nn.relu,
             train_embeddings: bool = True,
             dropout_keep_prob: float = 1.0,
             add_start_symbol: bool = False,
             add_end_symbol: bool = False,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    check_argument_types()
    SequenceLabeler.__init__(
        self, name, encoders, embedded_sequence.vocabulary, data_id,
        max_output_len,
        hidden_dim=hidden_dim,
        activation=activation,
        dropout_keep_prob=dropout_keep_prob,
        add_start_symbol=add_start_symbol,
        add_end_symbol=add_end_symbol,
        reuse=reuse,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint,
        initializers=initializers)

    self.embedded_sequence = embedded_sequence
    self.train_embeddings = train_embeddings
def __init__(self,
             output_series: str,
             decoder: SupportedDecoder,
             postprocess: Postprocessor = None) -> None:
    check_argument_types()
    BaseRunner[SupportedDecoder].__init__(self, output_series, decoder)
    self.postprocess = postprocess
def __init__(self,
             decoders: List[Any],
             decoder_weights: List[ObjectiveWeight] = None,
             l1_weight: float = 0.,
             l2_weight: float = 0.,
             clip_norm: float = None,
             optimizer: tf.train.Optimizer = None,
             var_scopes: List[str] = None,
             var_collection: str = None) -> None:
    check_argument_types()

    if decoder_weights is None:
        decoder_weights = [None for _ in decoders]

    if len(decoder_weights) != len(decoders):
        raise ValueError(
            "decoder_weights (length {}) do not match decoders (length {})"
            .format(len(decoder_weights), len(decoders)))

    objectives = [CostObjective(dec, w)
                  for dec, w in zip(decoders, decoder_weights)]

    GenericTrainer.__init__(
        self,
        objectives=objectives,
        l1_weight=l1_weight,
        l2_weight=l2_weight,
        clip_norm=clip_norm,
        optimizer=optimizer,
        var_scopes=var_scopes,
        var_collection=var_collection)
def pooling(
        prev_layer: tf.Tensor,
        prev_mask: tf.Tensor,
        specification: MaxPoolSpec,
        layer_num: int) -> Tuple[tf.Tensor, tf.Tensor]:
    try:
        check_argument_types()
    except TypeError as err:
        raise ValueError((
            "Specification of a max-pooling layer (number {} in config) "
            'needs to have 4 members: "M", pool size, stride, padding, '
            "was {}").format(layer_num, specification)) from err
    pool_type, pool_size, stride, pad = specification

    if pool_type == "M":
        pool_fn = tf.layers.max_pooling2d
    elif pool_type == "A":
        pool_fn = tf.layers.average_pooling2d
    else:
        raise ValueError(
            ("Unsupported type of pooling: {}, use 'M' for max-pooling or "
             "'A' for average pooling.").format(pool_type))

    if pad not in ["same", "valid"]:
        raise ValueError(
            "Padding must be 'same' or 'valid', was '{}' in layer {}."
            .format(pad, layer_num + 1))

    with tf.variable_scope("layer_{}_max_pool".format(layer_num)):
        next_layer = pool_fn(prev_layer, pool_size, stride, padding=pad)
        next_mask = tf.layers.max_pooling2d(
            prev_mask, pool_size, stride, padding=pad)

    return next_layer, next_mask
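# For reference, a pooling specification as consumed above is a 4-tuple
# (illustrative values): ("M", 2, 2, "same") builds a 2x2 max-pooling layer
# with stride 2 and "same" padding; "A" in the first slot selects average
# pooling instead.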
def __init__(self, name: str, cnn: CNNEncoder) -> None:
    check_argument_types()
    ModelPart.__init__(
        self, name, save_checkpoint=None, load_checkpoint=None)

    self._cnn = cnn
def word2vec_vocabulary(w2v: Word2Vec) -> Vocabulary:
    """Return the vocabulary from a word2vec object.

    This is a helper method used from configuration.
    """
    check_argument_types()
    return w2v.vocabulary
def plain_convolution(
        prev_layer: tf.Tensor,
        prev_mask: tf.Tensor,
        specification: ConvSpec,
        batch_norm_callback: Callable[[tf.Tensor], tf.Tensor],
        layer_num: int) -> Tuple[tf.Tensor, tf.Tensor, int]:
    try:
        check_argument_types()
    except TypeError as err:
        raise ValueError((
            "Specification of a convolutional layer (number {} in config) "
            'needs to have 5 members: "C", kernel size, stride, '
            "padding, output channels, was {}").format(
                layer_num, specification)) from err
    kernel_size, stride, pad, out_channels = specification[1:]

    if pad not in ["same", "valid"]:
        raise ValueError(
            ("Padding must be 'same' or 'valid', "
             "was '{}' in layer {}.").format(pad, layer_num + 1))

    with tf.variable_scope("layer_{}_convolution".format(layer_num)):
        next_layer = tf.layers.conv2d(
            prev_layer, out_channels, kernel_size,
            strides=stride, activation=None, padding=pad)
        next_layer = batch_norm_callback(next_layer)
        next_layer = tf.nn.relu(next_layer)

        next_mask = tf.layers.max_pooling2d(
            prev_mask, kernel_size, stride, padding=pad)

    return next_layer, next_mask, out_channels
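# For reference, a convolution specification as consumed above is a 5-tuple
# (illustrative values): ("C", 3, 1, "same", 64) builds a 3x3 convolution
# with stride 1, "same" padding and 64 output channels, followed by the
# batch-norm callback and a ReLU.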
def mlp_output(layer_sizes: List[int],
               activation: Callable[[tf.Tensor], tf.Tensor] = tf.tanh,
               dropout_keep_prob: float = 1.0) -> Tuple[OutputProjection,
                                                        int]:
    """Apply a multilayer perceptron.

    Compute RNN deep output using the multilayer perceptron with a
    specified activation function.
    (Pascanu et al., 2013 [https://arxiv.org/pdf/1312.6026v5.pdf])

    Arguments:
        layer_sizes: A list of sizes of the hidden layers of the MLP.
        dropout_keep_prob: The dropout keep probability.
        activation: The activation function to use in each layer.
    """
    check_argument_types()

    def _projection(prev_state, prev_output, ctx_tensors, train_mode):
        mlp_input = tf.concat([prev_state, prev_output] + ctx_tensors, 1)

        return multilayer_projection(mlp_input, layer_sizes,
                                     activation=activation,
                                     dropout_keep_prob=dropout_keep_prob,
                                     train_mode=train_mode,
                                     scope="deep_output_mlp")

    return _projection, layer_sizes[-1]
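# Hypothetical usage sketch (sizes illustrative): a two-layer deep-output
# MLP whose projection is later called with the previous decoder state,
# previous output embedding, and attention context tensors.
#
#     projection, output_size = mlp_output(
#         [600, 300], activation=tf.nn.relu, dropout_keep_prob=0.8)
#     # output_size == 300, the size of the last hidden layer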
def __init__(self,
             wrapper: str,
             name: str = "MultEval",
             encoding: str = "utf-8",
             metric: str = "bleu",
             language: str = "en") -> None:
    """Initialize the wrapper.

    Arguments:
        wrapper: Path to the multeval.sh script.
        name: Name of the evaluator.
        encoding: Encoding of input files.
        language: Language of hypotheses and references.
        metric: Evaluation metric ("bleu", "ter", "meteor").
    """
    check_argument_types()
    super().__init__("{}_{}_{}".format(name, metric, language))
    self.wrapper = wrapper
    self.encoding = encoding
    self.language = language
    self.metric = metric

    if self.metric not in ["bleu", "ter", "meteor"]:
        warn("{} metric is not valid. Using bleu instead."
             .format(self.metric))
        self.metric = "bleu"
def __init__(self,
             num_sessions: int,
             num_threads: int,
             save_n_best: int = 1,
             minimize_metric: bool = False,
             gpu_allow_growth: bool = True,
             per_process_gpu_memory_fraction: float = 1.0,
             enable_tf_debug: bool = False) -> None:
    """Initialize a TensorflowManager.

    At this moment the graph must already exist. This method initializes
    the required number of TensorFlow sessions and initializes them with
    provided variable files if they are provided.

    Args:
        num_sessions: Number of sessions to be initialized.
        num_threads: Number of threads sessions will run in.
        save_n_best: How many best models to keep.
        minimize_metric: Whether the best model is the one with the lowest
            or the highest score.
        gpu_allow_growth: TF to allocate incrementally, not all at once.
        per_process_gpu_memory_fraction: Limit TF memory use.
        enable_tf_debug: Wrap the sessions in the TensorFlow debugger
            (tfdbg) CLI wrapper.
    """
    check_argument_types()

    self.session_cfg = tf.ConfigProto()
    self.session_cfg.inter_op_parallelism_threads = num_threads
    self.session_cfg.intra_op_parallelism_threads = num_threads
    self.session_cfg.allow_soft_placement = True  # needed for more GPUs
    # pylint: disable=no-member
    self.session_cfg.gpu_options.allow_growth = gpu_allow_growth
    self.session_cfg.gpu_options.per_process_gpu_memory_fraction = \
        per_process_gpu_memory_fraction
    # pylint: enable=no-member

    if save_n_best < 1:
        raise ValueError("save_n_best parameter must be greater than zero")
    self.saver_max_to_keep = save_n_best
    self.minimize_metric = minimize_metric

    self.num_sessions = num_sessions
    self.sessions = [tf.Session(config=self.session_cfg)
                     for _ in range(self.num_sessions)]

    if enable_tf_debug:
        self.sessions = [tf_debug.LocalCLIDebugWrapperSession(sess)
                         for sess in self.sessions]

    self.saver = None

    self.best_score_index = None  # type: Optional[int]
    self.best_score_epoch = 0
    self.best_score_batch = 0

    init_score = np.inf if self.minimize_metric else -np.inf
    self.saved_scores = [init_score for _ in range(self.saver_max_to_keep)]
    self.best_score = init_score

    self.variables_files = []  # type: List[str]
    self._best_vars_file = None  # type: Optional[str]
def maxout_output(
        maxout_size: int,
        dropout_keep_prob: float = 1.0) -> Tuple[OutputProjection, int]:
    """Apply maxout.

    Compute RNN output out of the previous state and output, and the
    context tensors returned from attention mechanisms, as described in
    the article.

    This function corresponds to the equations for computing t_tilde in
    the Bahdanau et al. (2015) paper, on page 14, with the maxout
    projection, before the last linear projection.

    Arguments:
        maxout_size: The size of the hidden maxout layer in the deep
            output.

    Returns:
        The maxout projection of the concatenated inputs.
    """
    check_argument_types()

    def _projection(prev_state, prev_output, ctx_tensors, train_mode):
        state_out_ctx = tf.concat(
            [prev_state, prev_output] + ctx_tensors, 1)

        return dropout(
            maxout(state_out_ctx, maxout_size),
            dropout_keep_prob, train_mode)

    return _projection, maxout_size
def from_file_list(prefix: str,
                   shape: List[int],
                   suffix: str = "",
                   default_tensor_name: str = "arr_0") -> Callable:
    """Load a list of numpy arrays from a list of .npz numpy files.

    Args:
        prefix: A common prefix for the files in the list.
        shape: The shape of the numpy arrays stored in the referenced
            files.
        suffix: An optional suffix that will be appended to each path.
        default_tensor_name: Key of the tensors to load from the npz files.

    Returns:
        A generator function that yields the loaded arrays.
    """
    check_argument_types()

    def load(files: List[str]) -> Iterable[np.ndarray]:
        for list_file in files:
            with open(list_file, encoding="utf-8") as f_list:
                for line in f_list:
                    path = os.path.join(prefix, line.rstrip()) + suffix
                    with np.load(path) as npz:
                        arr = npz[default_tensor_name]
                        arr_shape = list(arr.shape)
                        if arr_shape != shape:
                            raise ValueError(
                                "Shapes do not match: expected {}, found {}"
                                .format(shape, arr_shape))
                        yield arr

    return load
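# Hypothetical usage sketch (paths illustrative): each line of "train.list"
# names an .npz file relative to the prefix, and every stored array must
# have the declared shape.
#
#     reader = from_file_list(prefix="data/features", shape=[2048])
#     for array in reader(["train.list"]):
#         ...  # one (2048,) array per line of the list file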
def __init__(self,
             n: int = 4,
             deduplicate: bool = False,
             name: str = None,
             multiple_references_separator: str = None) -> None:
    """Instantiate BLEU evaluator.

    Args:
        n: Longest n-grams considered.
        deduplicate: Flag whether repeated tokens should be treated as one.
        name: Name displayed in the logs and TensorBoard.
        multiple_references_separator: Token that separates multiple
            reference sentences. If ``None``, it assumes the reference is
            one sentence only.
    """
    check_argument_types()

    if name is None:
        name = "BLEU-{}".format(n)
        if deduplicate:
            name += "-dedup"
    super().__init__(name)

    self.n = n
    self.deduplicate = deduplicate
    self.multiple_references_separator = multiple_references_separator
def __init__(self,
             output_series: str,
             encoder: GenericModelPart,
             attribute: str = "output",
             select_session: int = None) -> None:
    """Initialize the representation runner.

    Args:
        output_series: Name of the output series with vectors.
        encoder: The encoder to use. This can be any ``GenericModelPart``
            object.
        attribute: The name of the encoder attribute that contains the
            data.
        select_session: Id of the TensorFlow session used in case of model
            ensembles.
    """
    check_argument_types()

    if attribute not in dir(encoder):
        warn("The encoder '{}' seems not to have the specified "
             "attribute '{}'".format(encoder, attribute))

    TensorRunner.__init__(
        self,
        output_series,
        modelparts=[encoder],
        tensors=[attribute],
        batch_dims=[0],
        tensors_by_name=[],
        batch_dims_by_name=[],
        select_session=select_session,
        single_tensor=True)
def __init__(self,
             name: str,
             dimension: int,
             data_id: str,
             output_shape: int = None,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Instantiate StatefulFiller.

    Arguments:
        name: Name of the model part.
        dimension: Dimensionality of the input.
        data_id: Series containing the numpy objects.
        output_shape: Dimension of optional state projection.
    """
    check_argument_types()
    ModelPart.__init__(
        self, name, reuse, save_checkpoint, load_checkpoint, initializers)

    self.data_id = data_id
    self.dimension = dimension
    self.output_shape = output_shape

    if self.dimension <= 0:
        raise ValueError("Input vector dimension must be positive.")
    if self.output_shape is not None and self.output_shape <= 0:
        raise ValueError("Output vector dimension must be positive.")
def __init__(self,
             name: str,
             input_shape: List[int],
             data_id: str,
             projection_dim: int = None,
             ff_hidden_dim: int = None,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Instantiate SpatialFiller.

    Args:
        name: Name of the model part.
        input_shape: Dimensionality of the input.
        data_id: Name of the data series with numpy objects.
        projection_dim: Optional, dimension of the states projection.
    """
    check_argument_types()
    ModelPart.__init__(
        self, name, reuse, save_checkpoint, load_checkpoint, initializers)

    self.data_id = data_id
    self.input_shape = input_shape
    self.projection_dim = projection_dim
    self.ff_hidden_dim = ff_hidden_dim

    if self.ff_hidden_dim is not None and self.projection_dim is None:
        raise ValueError(
            "projection_dim must be provided when using ff_hidden_dim")

    if len(self.input_shape) != 3:
        raise ValueError("The input shape should have 3 dimensions.")
def linear_encoder_projection(dropout_keep_prob: float) -> EncoderProjection:
    """Return a linear encoder projection.

    Return a projection function which applies dropout on concatenated
    encoder final states and returns a linear projection to a
    rnn_size-sized tensor.

    Arguments:
        dropout_keep_prob: The dropout keep probability.
    """
    check_argument_types()

    def func(train_mode: tf.Tensor,
             rnn_size: int,
             encoders: List[Stateful]) -> tf.Tensor:

        if rnn_size is None:
            raise ValueError(
                "You must supply rnn_size for this type of encoder "
                "projection")

        en_concat = concat_encoder_projection(train_mode, None, encoders)

        return dropout(
            tf.layers.dense(en_concat, rnn_size, name="encoders_projection"),
            dropout_keep_prob, train_mode)

    return cast(EncoderProjection, func)
def from_t2t_vocabulary(path: str,
                        encoding: str = "utf-8") -> "Vocabulary":
    """Load a vocabulary generated during tensor2tensor training.

    Arguments:
        path: The path to the vocabulary file.
        encoding: The encoding of the vocabulary file (defaults to UTF-8).

    Returns:
        The new Vocabulary instance.
    """
    check_argument_types()
    vocabulary = []  # type: List[str]

    with open(path, encoding=encoding) as wordlist:
        for line in wordlist:
            line = line.strip()

            # T2T vocab tends to wrap words in single quotes
            if ((line.startswith("'") and line.endswith("'"))
                    or (line.startswith('"') and line.endswith('"'))):
                line = line[1:-1]

            if line in ["<pad>", "<EOS>"]:
                continue

            vocabulary.append(line)

    log("Vocabulary from wordlist loaded, containing {} words"
        .format(len(vocabulary)))
    log_sample(vocabulary)

    return Vocabulary(vocabulary)
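# Hypothetical usage sketch (path illustrative): load a tensor2tensor
# wordlist into a Vocabulary object.
#
#     vocabulary = from_t2t_vocabulary("model_dir/vocab.ende.32768")
#     # -> Vocabulary built from the wordlist, <pad> and <EOS> excluded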
def __init__(self,
             name: str,
             parent_decoder: AutoregressiveDecoder,
             beam_size: int,
             max_steps: int,
             length_normalization: float) -> None:
    """Construct the beam search decoder graph.

    Arguments:
        name: The name for the model part.
        parent_decoder: An autoregressive decoder from which to sample.
        beam_size: The number of hypotheses in the beam.
        max_steps: The maximum number of time steps to perform.
        length_normalization: The alpha parameter from Eq. 14 in the paper.
    """
    check_argument_types()
    ModelPart.__init__(self, name)

    self.parent_decoder = parent_decoder
    self.beam_size = beam_size
    self.length_normalization = length_normalization
    self.max_steps_int = max_steps

    # Create a placeholder for maximum number of steps that is necessary
    # during ensembling, when the decoder is called repeatedly with the
    # max_steps attribute set to one.
    self.max_steps = tf.placeholder_with_default(self.max_steps_int, [])

    self._initial_loop_state = None  # type: Optional[BeamSearchLoopState]
def __init__(self,
             name: str,
             smooth_method: str = "exp",
             smooth_value: float = 0.0,
             force: bool = False,
             lowercase: bool = False,
             tokenize: str = "none",
             use_effective_order: bool = False) -> None:
    check_argument_types()
    super().__init__(name)

    if tokenize not in TOKENIZERS:
        raise ValueError(
            "Unknown tokenizer '{}'. You must use one of sacrebleu's "
            "tokenizers: {}".format(tokenize, str(TOKENIZERS)))

    if smooth_method not in SMOOTH_VARIANTS:
        raise ValueError(
            "Unknown smoothing '{}'. You must use one of sacrebleu's "
            "smoothing methods: {}".format(smooth_method,
                                           str(SMOOTH_VARIANTS)))

    self.smooth_method = smooth_method
    self.smooth_value = smooth_value
    self.force = force
    self.lowercase = lowercase
    self.tokenize = tokenize
    self.use_effective_order = use_effective_order
def __init__(self,
             name: str,
             parent: TemporalStateful,
             factor: int,
             projection_size: int = None,
             projection_activation: Activation = None) -> None:
    """Initialize SentenceSplitter.

    Args:
        parent: TemporalStateful whose states will be split.
        factor: Factor by which the states will be split - the resulting
            sequence will be longer by this factor.
        projection_size: If not None, specifies dimensionality of a
            projection before state splitting.
        projection_activation: Non-linearity function for the optional
            projection.
    """
    check_argument_types()

    ModelPart.__init__(
        self, name=name, save_checkpoint=None, load_checkpoint=None,
        initializers=None)

    self.parent = parent
    self.factor = factor
    self.projection_size = projection_size
    self.activation = projection_activation

    if projection_size is not None and projection_size % factor != 0:
        raise ValueError((
            "Dimension of projection ({}) must be "
            "divisible by the given factor ({}).").format(
                projection_size, factor))
def __init__(self,
             name: str,
             n_heads: int,
             keys_encoder: Attendable,
             values_encoder: Attendable = None,
             dropout_keep_prob: float = 1.0,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    check_argument_types()
    BaseAttention.__init__(self, name, reuse, save_checkpoint,
                           load_checkpoint, initializers)

    self.n_heads = n_heads
    self.dropout_keep_prob = dropout_keep_prob
    self.keys_encoder = keys_encoder

    if values_encoder is not None:
        self.values_encoder = values_encoder
    else:
        self.values_encoder = self.keys_encoder

    if self.n_heads <= 0:
        raise ValueError("Number of heads must be greater than zero.")

    if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
        raise ValueError("Dropout keep prob must be inside (0,1].")

    self._variable_scope.set_initializer(tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform"))
def single_tensor(files: List[str]) -> np.ndarray:
    """Load a single tensor from a numpy file."""
    check_argument_types()
    if len(files) == 1:
        return np.load(files[0])

    return np.concatenate([np.load(f) for f in files], axis=0)
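# A minimal self-contained check of `single_tensor` (a sketch assuming only
# numpy and a writable temp directory; the file names are illustrative).
if __name__ == "__main__":
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        first = os.path.join(tmp, "first.npy")
        second = os.path.join(tmp, "second.npy")
        np.save(first, np.zeros((2, 3)))
        np.save(second, np.ones((4, 3)))

        # Multiple files are concatenated along the first (batch) axis.
        assert single_tensor([first, second]).shape == (6, 3)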
def __init__(self,
             output_series: str,
             attention: BaseAttention,
             decoder: Decoder) -> None:
    check_argument_types()
    BaseRunner[BaseAttention].__init__(self, output_series, attention)

    self._key = "{}_run".format(decoder.name)
def __init__(self, name: str = None) -> None:
    check_argument_types()
    if name is None:
        name = type(self).__name__
        if name.endswith("Evaluator"):
            name = name[:-9]
    self._name = name
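# Name-derivation demo (a sketch, assuming the base class above is named
# Evaluator): a subclass constructed without an explicit name has the
# "Evaluator" suffix stripped from its class name.
#
#     class BLEUEvaluator(Evaluator):
#         ...
#
#     BLEUEvaluator()  # reports its name as "BLEU"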
def mapply(self, fn, args, kwargs):
    try:
        memo = typeguard._CallMemo(fn, args=args, kwargs=kwargs)
        typeguard.check_argument_types(memo)
    except TypeError as exc:
        print(exc)
        raise XMLRPCInvalidParamTypes(exc)

    return super().mapply(fn, args, kwargs)
def __call__(self, speech_mix: Union[torch.Tensor, np.ndarray],
             fs: int = 8000) -> List[torch.Tensor]:
    """Inference

    Args:
        speech_mix: Input speech data (Batch, Nsamples [, Channels])
        fs: sample rate

    Returns:
        [separated_audio1, separated_audio2, ...]
    """
    assert check_argument_types()

    # Input as audio signal
    if isinstance(speech_mix, np.ndarray):
        speech_mix = torch.as_tensor(speech_mix)

    assert speech_mix.dim() > 1, speech_mix.size()
    batch_size = speech_mix.size(0)
    speech_mix = speech_mix.to(getattr(torch, self.dtype))
    # lengths: (B,)
    lengths = speech_mix.new_full(
        [batch_size], dtype=torch.long, fill_value=speech_mix.size(1))

    # a. To device
    speech_mix = to_device(speech_mix, device=self.device)
    lengths = to_device(lengths, device=self.device)

    if self.segmenting and lengths[0] > self.segment_size * fs:
        # Segment-wise speech enhancement/separation
        overlap_length = int(
            np.round(fs * (self.segment_size - self.hop_size)))
        num_segments = int(
            np.ceil((speech_mix.size(1) - overlap_length)
                    / (self.hop_size * fs)))
        t = T = int(self.segment_size * fs)
        pad_shape = speech_mix[:, :T].shape
        enh_waves = []
        range_ = trange if self.show_progressbar else range
        for i in range_(num_segments):
            st = int(i * self.hop_size * fs)
            en = st + T
            if en >= lengths[0]:
                # en - st < T (last segment)
                en = lengths[0]
                speech_seg = speech_mix.new_zeros(pad_shape)
                t = en - st
                speech_seg[:, :t] = speech_mix[:, st:en]
            else:
                t = T
                speech_seg = speech_mix[:, st:en]  # B x T [x C]

            lengths_seg = speech_mix.new_full(
                [batch_size], dtype=torch.long, fill_value=T)
            # b. Enhancement/Separation Forward
            feats, f_lens = self.enh_model.encoder(speech_seg, lengths_seg)
            feats, _, _ = self.enh_model.separator(feats, f_lens)
            processed_wav = [
                self.enh_model.decoder(f, lengths_seg)[0] for f in feats
            ]
            if speech_seg.dim() > 2:
                # multi-channel speech
                speech_seg_ = speech_seg[:, self.ref_channel]
            else:
                speech_seg_ = speech_seg

            if self.normalize_segment_scale:
                # normalize the scale to match the input mixture scale
                mix_energy = torch.sqrt(
                    torch.mean(speech_seg_[:, :t].pow(2),
                               dim=1, keepdim=True))
                enh_energy = torch.sqrt(
                    torch.mean(sum(processed_wav)[:, :t].pow(2),
                               dim=1, keepdim=True))
                processed_wav = [
                    w * (mix_energy / enh_energy) for w in processed_wav
                ]
            # List[torch.Tensor(num_spk, B, T)]
            enh_waves.append(torch.stack(processed_wav, dim=0))

        # c. Stitch the enhanced segments together
        waves = enh_waves[0]
        for i in range(1, num_segments):
            # permutation between separated streams in last and current
            # segments
            perm = self.cal_permumation(
                waves[:, :, -overlap_length:],
                enh_waves[i][:, :, :overlap_length],
                criterion="si_snr",
            )
            # repermute separated streams in current segment
            for batch in range(batch_size):
                enh_waves[i][:, batch] = enh_waves[i][perm[batch], batch]

            if i == num_segments - 1:
                enh_waves[i][:, :, t:] = 0
                enh_waves_res_i = enh_waves[i][:, :, overlap_length:t]
            else:
                enh_waves_res_i = enh_waves[i][:, :, overlap_length:]

            # overlap-and-add (average over the overlapped part)
            waves[:, :, -overlap_length:] = (
                waves[:, :, -overlap_length:]
                + enh_waves[i][:, :, :overlap_length]) / 2
            # concatenate the residual parts of the later segment
            waves = torch.cat([waves, enh_waves_res_i], dim=2)
        # ensure the stitched length is same as input
        assert waves.size(2) == speech_mix.size(1), (waves.shape,
                                                     speech_mix.shape)
        waves = torch.unbind(waves, dim=0)
    else:
        # b. Enhancement/Separation Forward
        feats, f_lens = self.enh_model.encoder(speech_mix, lengths)
        feats, _, _ = self.enh_model.separator(feats, f_lens)
        waves = [self.enh_model.decoder(f, lengths)[0] for f in feats]

    assert len(waves) == self.num_spk, (len(waves), self.num_spk)
    assert len(waves[0]) == batch_size, (len(waves[0]), batch_size)
    if self.normalize_output_wav:
        waves = [
            (w / abs(w).max(dim=1, keepdim=True)[0] * 0.9).cpu().numpy()
            for w in waves
        ]  # list[(batch, sample)]
    else:
        waves = [w.cpu().numpy() for w in waves]

    return waves
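# Hypothetical usage sketch (values illustrative): separate a two-second,
# single-channel 8 kHz mixture with an already-constructed instance of this
# class (here called `separate_speech`).
#
#     mixture = np.random.randn(1, 16000).astype(np.float32)
#     separated = separate_speech(mixture, fs=8000)
#     # separated: one (batch, samples) waveform per estimated speaker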
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    model_tag: Optional[str],
    allow_variable_data_keys: bool,
    segment_size: Optional[float],
    hop_size: Optional[float],
    normalize_segment_scale: bool,
    show_progressbar: bool,
    ref_channel: Optional[int],
    normalize_output_wav: bool,
    enh_s2t_task: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build separate_speech
    separate_speech_kwargs = dict(
        train_config=train_config,
        model_file=model_file,
        segment_size=segment_size,
        hop_size=hop_size,
        normalize_segment_scale=normalize_segment_scale,
        show_progressbar=show_progressbar,
        ref_channel=ref_channel,
        normalize_output_wav=normalize_output_wav,
        device=device,
        dtype=dtype,
        enh_s2t_task=enh_s2t_task,
    )
    separate_speech = SeparateSpeech.from_pretrained(
        model_tag=model_tag,
        **separate_speech_kwargs,
    )

    # 3. Build data-iterator
    loader = EnhancementTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=EnhancementTask.build_preprocess_fn(
            separate_speech.enh_train_args, False),
        collate_fn=EnhancementTask.build_collate_fn(
            separate_speech.enh_train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    writers = []
    for i in range(separate_speech.num_spk):
        writers.append(
            SoundScpWriter(f"{output_dir}/wavs/{i + 1}",
                           f"{output_dir}/spk{i + 1}.scp"))

    for keys, batch in loader:
        assert isinstance(batch, dict), type(batch)
        assert all(isinstance(s, str) for s in keys), keys
        _bs = len(next(iter(batch.values())))
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"

        batch = {k: v for k, v in batch.items()
                 if not k.endswith("_lengths")}
        waves = separate_speech(**batch)
        for (spk, w) in enumerate(waves):
            for b in range(batch_size):
                writers[spk][keys[b]] = fs, w[b]

    for writer in writers:
        writer.close()
def run(
    cls,
    model: AbsESPnetModel,
    optimizers: Sequence[torch.optim.Optimizer],
    schedulers: Sequence[Optional[AbsScheduler]],
    train_iter_factory: AbsIterFactory,
    valid_iter_factory: AbsIterFactory,
    plot_attention_iter_factory: Optional[AbsIterFactory],
    reporter: Reporter,
    output_dir: Path,
    max_epoch: int,
    seed: int,
    patience: Optional[int],
    keep_nbest_models: int,
    early_stopping_criterion: Sequence[str],
    best_model_criterion: Sequence[Sequence[str]],
    val_scheduler_criterion: Sequence[str],
    trainer_options,
    distributed_option: DistributedOption,
) -> None:
    """Perform training. This method performs the main process of training."""
    assert check_argument_types()
    # NOTE(kamo): Don't check the type more strictly as far as
    # trainer_options is concerned
    assert is_dataclass(trainer_options), type(trainer_options)

    # NOTE(kamo): trainer_options doesn't always have "train_dtype"
    use_apex = getattr(trainer_options, "train_dtype", "") in (
        "O0",
        "O1",
        "O2",
        "O3",
    )
    if use_apex:
        try:
            from apex import amp
        except ImportError:
            logging.error("You need to install apex. "
                          "See https://github.com/NVIDIA/apex#linux")

    start_epoch = reporter.get_epoch() + 1
    if start_epoch == max_epoch + 1:
        logging.warning(
            f"The training has already reached max_epoch: {start_epoch}")

    if distributed_option.distributed:
        # Use torch DDP instead of apex DDP
        # https://github.com/NVIDIA/apex/issues/494
        dp_model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=(
                # Perform multi-Process with multi-GPUs
                [torch.cuda.current_device()]
                if distributed_option.ngpu == 1
                # Perform single-Process with multi-GPUs
                else None),
            output_device=(torch.cuda.current_device()
                           if distributed_option.ngpu == 1 else None),
        )
    elif distributed_option.ngpu > 1:
        # apex.amp supports DataParallel now.
        dp_model = torch.nn.parallel.DataParallel(
            model,
            device_ids=list(range(distributed_option.ngpu)),
        )
    else:
        # NOTE(kamo): DataParallel also should work with ngpu=1,
        # but for debuggability it's better to keep this block.
        dp_model = model

    if not distributed_option.distributed or distributed_option.dist_rank == 0:
        summary_writer = SummaryWriter(str(output_dir / "tensorboard"))
    else:
        summary_writer = None

    start_time = time.perf_counter()
    for iepoch in range(start_epoch, max_epoch + 1):
        if iepoch != start_epoch:
            logging.info(
                "{}/{}epoch started. Estimated time to finish: {}".format(
                    iepoch,
                    max_epoch,
                    humanfriendly.format_timespan(
                        (time.perf_counter() - start_time)
                        / (iepoch - start_epoch)
                        * (max_epoch - iepoch + 1)),
                ))
        else:
            logging.info(f"{iepoch}/{max_epoch}epoch started")
        set_all_random_seed(seed + iepoch)

        reporter.set_epoch(iepoch)
        # 1. Train and validation for one-epoch
        with reporter.observe("train") as sub_reporter:
            all_steps_are_invalid = cls.train_one_epoch(
                model=dp_model,
                optimizers=optimizers,
                schedulers=schedulers,
                iterator=train_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                summary_writer=summary_writer,
                options=trainer_options,
            )

        with reporter.observe("valid") as sub_reporter:
            cls.validate_one_epoch(
                model=dp_model,
                iterator=valid_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                options=trainer_options,
            )

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # att_plot doesn't support distributed
            if plot_attention_iter_factory is not None:
                with reporter.observe("att_plot") as sub_reporter:
                    cls.plot_attention(
                        model=model,
                        output_dir=output_dir / "att_ws",
                        summary_writer=summary_writer,
                        iterator=plot_attention_iter_factory.build_iter(
                            iepoch),
                        reporter=sub_reporter,
                        options=trainer_options,
                    )

        # 2. LR Scheduler step
        for scheduler in schedulers:
            if isinstance(scheduler, AbsValEpochStepScheduler):
                scheduler.step(
                    reporter.get_value(*val_scheduler_criterion))
            elif isinstance(scheduler, AbsEpochStepScheduler):
                scheduler.step()

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # 3. Report the results
            logging.info(reporter.log_message())
            reporter.matplotlib_plot(output_dir / "images")
            reporter.tensorboard_add_scalar(summary_writer)

            # 4. Save/Update the checkpoint
            torch.save(
                {
                    "model": model.state_dict(),
                    "reporter": reporter.state_dict(),
                    "optimizers": [o.state_dict() for o in optimizers],
                    "schedulers": [
                        s.state_dict() if s is not None else None
                        for s in schedulers
                    ],
                    "amp": amp.state_dict() if use_apex else None,
                },
                output_dir / "checkpoint.pth",
            )

            # 5. Save the model and update the link to the best model
            torch.save(model.state_dict(),
                       output_dir / f"{iepoch}epoch.pth")

            # Creates a sym link latest.pth -> {iepoch}epoch.pth
            p = output_dir / "latest.pth"
            if p.is_symlink() or p.exists():
                p.unlink()
            p.symlink_to(f"{iepoch}epoch.pth")

            _improved = []
            for _phase, k, _mode in best_model_criterion:
                # e.g. _phase, k, _mode = "train", "loss", "min"
                if reporter.has(_phase, k):
                    best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                    # Creates sym links if it's the best result
                    if best_epoch == iepoch:
                        p = output_dir / f"{_phase}.{k}.best.pth"
                        if p.is_symlink() or p.exists():
                            p.unlink()
                        p.symlink_to(f"{iepoch}epoch.pth")
                        _improved.append(f"{_phase}.{k}")
            if len(_improved) == 0:
                logging.info("There are no improvements in this epoch")
            else:
                logging.info("The best model has been updated: "
                             + ", ".join(_improved))

            # 6. Remove the model files excluding n-best epoch and latest
            # epoch
            _removed = []
            # Get the union set of the n-best among multiple criterion
            nbests = set().union(*[
                set(reporter.sort_epochs(ph, k, m)[:keep_nbest_models])
                for ph, k, m in best_model_criterion
                if reporter.has(ph, k)
            ])
            for e in range(1, iepoch):
                p = output_dir / f"{e}epoch.pth"
                if p.exists() and e not in nbests:
                    p.unlink()
                    _removed.append(str(p))
            if len(_removed) != 0:
                logging.info("The model files were removed: "
                             + ", ".join(_removed))

        # 7. If no update has happened, stop the training
        if all_steps_are_invalid:
            logging.warning(
                f"The gradients at all steps are invalid in this epoch. "
                f"Something seems wrong. This training was stopped at "
                f"{iepoch}epoch")
            break

        # 8. Check early stopping
        if patience is not None:
            if reporter.check_early_stopping(patience,
                                             *early_stopping_criterion):
                break

    else:
        logging.info(f"The training was finished at {max_epoch} epochs")
def train_one_epoch(
    cls,
    model: torch.nn.Module,
    iterator: Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
    optimizers: Sequence[torch.optim.Optimizer],
    schedulers: Sequence[Optional[AbsScheduler]],
    reporter: SubReporter,
    summary_writer: Optional[SummaryWriter],
    options: TrainerOptions,
) -> bool:
    assert check_argument_types()

    # Note(kamo): assumes one optimizer
    assert cls.num_optimizers == 1, cls.num_optimizers
    assert len(optimizers) == 1, len(optimizers)
    optimizer = optimizers[0]
    scheduler = schedulers[0]

    grad_noise = options.grad_noise
    accum_grad = options.accum_grad
    grad_clip = options.grad_clip
    log_interval = options.log_interval
    no_forward_run = options.no_forward_run
    ngpu = options.ngpu
    distributed = isinstance(model,
                             torch.nn.parallel.DistributedDataParallel)
    use_apex = options.train_dtype in ("O0", "O1", "O2", "O3")

    if log_interval is None:
        try:
            log_interval = max(len(iterator) // 20, 10)
        except TypeError:
            log_interval = 100

    model.train()
    all_steps_are_invalid = True
    # [For distributed] Because iteration counts are not always equal
    # between processes, send stop-flag to the other processes if the
    # iterator is finished
    iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu")

    start_time = time.perf_counter()
    for iiter, (_, batch) in enumerate(
            reporter.measure_iter_time(iterator, "iter_time"), 1):
        assert isinstance(batch, dict), type(batch)

        if distributed:
            torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
            if iterator_stop > 0:
                break

        batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
        if no_forward_run:
            all_steps_are_invalid = False
            continue

        with reporter.measure_time("forward_time"):
            loss, stats, weight = model(**batch)
        stats = {k: v for k, v in stats.items() if v is not None}
        if ngpu > 1 or distributed:
            # Apply weighted averaging for loss and stats
            loss = (loss * weight.type(loss.dtype)).sum()

            # if distributed, this method can also apply all_reduce()
            stats, weight = recursive_average(stats, weight, distributed)

            # Now weight is summation over all workers
            loss /= weight
        if distributed:
            # NOTE(kamo): Multiply world_size because DistributedDataParallel
            # automatically normalizes the gradient by world_size.
            loss *= torch.distributed.get_world_size()

        reporter.register(stats, weight)

        loss /= accum_grad
        with reporter.measure_time("backward_time"):
            if use_apex:
                try:
                    from apex import amp
                except ImportError:
                    logging.error(
                        "You need to install apex. "
                        "See https://github.com/NVIDIA/apex#linux")
                with amp.scale_loss(loss, optimizers) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

        if iiter % accum_grad == 0:
            # gradient noise injection
            if grad_noise:
                add_gradient_noise(
                    model,
                    reporter.get_total_count(),
                    duration=100,
                    eta=1.0,
                    scale_factor=0.55,
                )

            # compute the gradient norm to check if it is normal or not
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), grad_clip)
            # PyTorch<=1.4, clip_grad_norm_ returns float value
            if not isinstance(grad_norm, torch.Tensor):
                grad_norm = torch.tensor(grad_norm)

            if not torch.isfinite(grad_norm):
                logging.warning(
                    f"The grad norm is {grad_norm}. "
                    "Skipping updating the model.")
            else:
                all_steps_are_invalid = False
                with reporter.measure_time("optim_step_time"):
                    optimizer.step()
                if isinstance(scheduler, AbsBatchStepScheduler):
                    scheduler.step()
            optimizer.zero_grad()

            # Register lr and train/load time[sec/step],
            # where step refers to accum_grad * mini-batch
            reporter.register(
                dict(
                    {
                        f"lr_{i}": pg["lr"]
                        for i, pg in enumerate(optimizer.param_groups)
                        if "lr" in pg
                    },
                    train_time=time.perf_counter() - start_time,
                ),
            )
            start_time = time.perf_counter()

        # NOTE(kamo): Call log_message() after next()
        reporter.next()
        if iiter % log_interval == 0:
            logging.info(reporter.log_message(-log_interval))
            if summary_writer is not None:
                reporter.tensorboard_add_scalar(summary_writer,
                                                -log_interval)

    else:
        if distributed:
            iterator_stop.fill_(1)
            torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)

    return all_steps_are_invalid
def plot_attention(
    cls,
    model: torch.nn.Module,
    output_dir: Optional[Path],
    summary_writer: Optional[SummaryWriter],
    iterator: Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
    reporter: SubReporter,
    options: TrainerOptions,
) -> None:
    assert check_argument_types()
    import matplotlib

    ngpu = options.ngpu
    no_forward_run = options.no_forward_run

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator

    model.eval()
    for ids, batch in iterator:
        assert isinstance(batch, dict), type(batch)
        assert len(next(iter(batch.values()))) == len(ids), (
            len(next(iter(batch.values()))),
            len(ids),
        )
        batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
        if no_forward_run:
            continue

        # 1. Forwarding model and gathering all attentions
        #    calculate_all_attentions() uses single gpu only.
        att_dict = calculate_all_attentions(model, batch)

        # 2. Plot attentions: This part is slow due to matplotlib
        for k, att_list in att_dict.items():
            assert len(att_list) == len(ids), (len(att_list), len(ids))
            for id_, att_w in zip(ids, att_list):

                if isinstance(att_w, torch.Tensor):
                    att_w = att_w.detach().cpu().numpy()

                if att_w.ndim == 2:
                    att_w = att_w[None]
                elif att_w.ndim > 3 or att_w.ndim == 1:
                    raise RuntimeError(
                        f"Must be 2 or 3 dimension: {att_w.ndim}")

                w, h = plt.figaspect(1.0 / len(att_w))
                fig = plt.Figure(figsize=(w * 1.3, h * 1.3))
                axes = fig.subplots(1, len(att_w))
                if len(att_w) == 1:
                    axes = [axes]

                for ax, aw in zip(axes, att_w):
                    ax.imshow(aw.astype(np.float32), aspect="auto")
                    ax.set_title(f"{k}_{id_}")
                    ax.set_xlabel("Input")
                    ax.set_ylabel("Output")
                    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
                    ax.yaxis.set_major_locator(MaxNLocator(integer=True))

                if output_dir is not None:
                    p = (output_dir / id_
                         / f"{k}.{reporter.get_epoch()}ep.png")
                    p.parent.mkdir(parents=True, exist_ok=True)
                    fig.savefig(p)

                if summary_writer is not None:
                    summary_writer.add_figure(f"{k}_{id_}", fig,
                                              reporter.get_epoch())
        reporter.next()
def build_options(cls, args: argparse.Namespace) -> TrainerOptions:
    """Build options consumed by train(), eval(), and plot_attention()"""
    assert check_argument_types()
    return build_dataclass(TrainerOptions, args)
def __init__(self,
             # TODO only stateful, attention will need temporal or spat.
             encoders: List[Union[TemporalStatefulWithOutput,
                                  SpatialStatefulWithOutput]],
             vocabulary: Vocabulary,
             data_id: str,
             name: str,
             max_output_len: int,
             dropout_keep_prob: float = 1.0,
             rnn_size: int = None,
             embedding_size: int = None,
             output_projection: OutputProjectionSpec = None,
             encoder_projection: Callable[
                 [tf.Tensor, Optional[int], Optional[List[Any]]],
                 tf.Tensor] = None,
             attentions: List[BaseAttention] = None,
             embeddings_source: EmbeddedSequence = None,
             attention_on_input: bool = True,
             rnn_cell: str = "GRU",
             conditional_gru: bool = False,
             save_checkpoint: str = None,
             load_checkpoint: str = None) -> None:
    """Create a refactored version of monster decoder.

    Arguments:
        encoders: Input encoders of the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.

    Keyword arguments:
        rnn_size: Size of the decoder hidden state, if None set
            according to encoders.
        embedding_size: Size of embedding vectors for target words.
        output_projection: How to generate distribution over vocabulary
            from decoder rnn_outputs.
        encoder_projection: How to construct initial state from encoders.
        attentions: The attention objects to use. Optional.
        embeddings_source: Embedded sequence to take embeddings from.
        rnn_cell: RNN Cell used by the decoder (GRU or LSTM).
        conditional_gru: Flag whether to use the Conditional GRU
            architecture.
        attention_on_input: Flag whether attention from previous decoding
            step should be combined with the input in the next step.
    """
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
    check_argument_types()

    log("Initializing decoder, name: '{}'".format(name))

    self.encoders = encoders
    self.vocabulary = vocabulary
    self.data_id = data_id
    self.max_output_len = max_output_len
    self.dropout_keep_prob = dropout_keep_prob
    self.embedding_size = embedding_size
    self.rnn_size = rnn_size
    self.output_projection_spec = output_projection
    self.encoder_projection = encoder_projection
    self.attentions = attentions
    self.embeddings_source = embeddings_source
    self._conditional_gru = conditional_gru
    self._attention_on_input = attention_on_input
    self._rnn_cell_str = rnn_cell

    if self.attentions is None:
        self.attentions = []

    if self.embedding_size is None and self.embeddings_source is None:
        raise ValueError("You must specify either embedding size or the "
                         "embedded sequence from which to reuse the "
                         "embeddings (e.g. set either 'embedding_size' "
                         "or 'embeddings_source' parameter)")

    if self.embeddings_source is not None:
        if self.embedding_size is not None:
            warn("Overriding the embedding_size parameter with the"
                 " size of the reused embeddings from the encoder.")

        self.embedding_size = (
            self.embeddings_source.embedding_matrix.get_shape()[1].value)

    if self.encoder_projection is None:
        if not self.encoders:
            log("No encoder - language model only.")
            self.encoder_projection = empty_initial_state
        elif rnn_size is None:
            log("No rnn_size or encoder_projection: Using concatenation of"
                " encoded states")
            self.encoder_projection = concat_encoder_projection
            self.rnn_size = sum(e.output.get_shape()[1].value
                                for e in encoders)
        else:
            log("Using linear projection of encoders as the initial state")
            self.encoder_projection = linear_encoder_projection(
                self.dropout_keep_prob)

    assert self.rnn_size is not None

    if self._rnn_cell_str not in RNN_CELL_TYPES:
        raise ValueError("RNN cell must be either 'GRU' or 'LSTM'")

    if self.output_projection_spec is None:
        log("No output projection specified - using tanh projection")
        self.output_projection = nonlinear_output(
            self.rnn_size, tf.tanh)[0]
        self.output_projection_size = self.rnn_size
    elif isinstance(self.output_projection_spec, tuple):
        (self.output_projection,
         self.output_projection_size) = tuple(self.output_projection_spec)
    else:
        self.output_projection = self.output_projection_spec
        self.output_projection_size = self.rnn_size

    if self._attention_on_input:
        self.input_projection = self.input_plus_attention
    else:
        self.input_projection = self.embed_input_symbol

    with self.use_scope():
        with tf.variable_scope("attention_decoder") as self.step_scope:
            pass

    # TODO when it is possible, remove the printing of the cost var
    log("Decoder initialized. Cost var: {}".format(str(self.cost)))
    log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
def build_model(cls, args: argparse.Namespace) -> ESPnetTTSModel:
    assert check_argument_types()
    if isinstance(args.token_list, str):
        with open(args.token_list, encoding="utf-8") as f:
            token_list = [line.rstrip() for line in f]

        # "args" is saved as it is in a yaml file by BaseTask.main().
        # Overwriting token_list to keep it as "portable".
        args.token_list = token_list.copy()
    elif isinstance(args.token_list, (tuple, list)):
        token_list = args.token_list.copy()
    else:
        raise RuntimeError("token_list must be str or list")
    vocab_size = len(token_list)
    logging.info(f"Vocabulary size: {vocab_size}")

    # 1. feats_extract
    if args.odim is None:
        # Extract features in the model
        feats_extract_class = feats_extractor_choices.get_class(
            args.feats_extract)
        feats_extract = feats_extract_class(**args.feats_extract_conf)
        odim = feats_extract.output_size()
    else:
        # Give features from data-loader
        args.feats_extract = None
        args.feats_extract_conf = None
        feats_extract = None
        odim = args.odim

    # 2. Normalization layer
    if args.normalize is not None:
        normalize_class = normalize_choices.get_class(args.normalize)
        normalize = normalize_class(**args.normalize_conf)
    else:
        normalize = None

    # 3. TTS
    tts_class = tts_choices.get_class(args.tts)
    tts = tts_class(idim=vocab_size, odim=odim, **args.tts_conf)

    # 4. Extra components
    pitch_extract = None
    energy_extract = None
    pitch_normalize = None
    energy_normalize = None
    if getattr(args, "pitch_extract", None) is not None:
        pitch_extract_class = pitch_extractor_choices.get_class(
            args.pitch_extract)
        if args.pitch_extract_conf.get("reduction_factor", None) is not None:
            assert args.pitch_extract_conf.get(
                "reduction_factor",
                None) == args.tts_conf.get("reduction_factor", 1)
        else:
            args.pitch_extract_conf["reduction_factor"] = args.tts_conf.get(
                "reduction_factor", 1)
        pitch_extract = pitch_extract_class(**args.pitch_extract_conf)
    if getattr(args, "energy_extract", None) is not None:
        if args.energy_extract_conf.get("reduction_factor",
                                        None) is not None:
            assert args.energy_extract_conf.get(
                "reduction_factor",
                None) == args.tts_conf.get("reduction_factor", 1)
        else:
            args.energy_extract_conf["reduction_factor"] = args.tts_conf.get(
                "reduction_factor", 1)
        energy_extract_class = energy_extractor_choices.get_class(
            args.energy_extract)
        energy_extract = energy_extract_class(**args.energy_extract_conf)
    if getattr(args, "pitch_normalize", None) is not None:
        pitch_normalize_class = pitch_normalize_choices.get_class(
            args.pitch_normalize)
        pitch_normalize = pitch_normalize_class(**args.pitch_normalize_conf)
    if getattr(args, "energy_normalize", None) is not None:
        energy_normalize_class = energy_normalize_choices.get_class(
            args.energy_normalize)
        energy_normalize = energy_normalize_class(
            **args.energy_normalize_conf)

    # 5. Build model
    model = ESPnetTTSModel(
        feats_extract=feats_extract,
        pitch_extract=pitch_extract,
        energy_extract=energy_extract,
        normalize=normalize,
        pitch_normalize=pitch_normalize,
        energy_normalize=energy_normalize,
        tts=tts,
        **args.model_conf,
    )
    assert check_return_type(model)
    return model
def add_task_arguments(cls, parser: argparse.ArgumentParser):
    # NOTE(kamo): Use '_' instead of '-' to avoid confusion
    assert check_argument_types()
    group = parser.add_argument_group(description="Task related")

    # NOTE(kamo): add_arguments(..., required=True) can't be used
    # to provide --print_config mode. Instead, do it as follows:
    required = parser.get_default("required")
    required += ["token_list"]

    group.add_argument(
        "--token_list",
        type=str_or_none,
        default=None,
        help="A text mapping int-id to token",
    )
    group.add_argument(
        "--odim",
        type=int_or_none,
        default=None,
        help="The number of dimension of output feature",
    )
    group.add_argument(
        "--model_conf",
        action=NestedDictAction,
        default=get_default_kwargs(ESPnetTTSModel),
        help="The keyword arguments for model class.",
    )

    group = parser.add_argument_group(description="Preprocess related")
    group.add_argument(
        "--use_preprocessor",
        type=str2bool,
        default=True,
        help="Apply preprocessing to data or not",
    )
    group.add_argument(
        "--token_type",
        type=str,
        default="phn",
        choices=["bpe", "char", "word", "phn"],
        help="The text will be tokenized in the specified level token",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model file of sentencepiece",
    )
    parser.add_argument(
        "--non_linguistic_symbols",
        type=str_or_none,
        help="non_linguistic_symbols file path",
    )
    parser.add_argument(
        "--cleaner",
        type=str_or_none,
        choices=[None, "tacotron", "jaconv", "vietnamese"],
        default=None,
        help="Apply text cleaning",
    )
    parser.add_argument(
        "--g2p",
        type=str_or_none,
        choices=[
            None,
            "g2p_en",
            "g2p_en_no_space",
            "pyopenjtalk",
            "pyopenjtalk_accent",
            "pyopenjtalk_kana",
            "pypinyin_g2p",
            "pypinyin_g2p_phone",
        ],
        default=None,
        help="Specify g2p method if --token_type=phn",
    )

    for class_choices in cls.class_choices_list:
        # Append --<name> and --<name>_conf.
        # e.g. --encoder and --encoder_conf
        class_choices.add_arguments(group)
def __init__(self,
             name: str,
             data_id: str,
             input_size: int,
             rnn_layers: List[RNNSpecTuple],
             max_input_len: Optional[int] = None,
             dropout_keep_prob: float = 1.0,
             save_checkpoint: Optional[str] = None,
             load_checkpoint: Optional[str] = None,
             initializers: InitializerSpecs = None) -> None:
    """Create a new instance of the encoder.

    Arguments:
        data_id: Identifier of the data series fed to this encoder.
        name: A unique identifier for this encoder.
        rnn_layers: A list of tuples specifying the size and, optionally,
            the direction ('forward', 'backward' or 'bidirectional')
            and cell type ('GRU' or 'LSTM') of each RNN layer.

    Keyword arguments:
        dropout_keep_prob: The dropout keep probability (default 1.0).
    """
    check_argument_types()
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint,
                       initializers)

    self.data_id = data_id

    self._rnn_layers = [_make_rnn_spec(*r) for r in rnn_layers]
    self.max_input_len = max_input_len
    self.input_size = input_size
    self.dropout_keep_prob = dropout_keep_prob

    log("Initializing RNN encoder, name: '{}'".format(self.name))

    with self.use_scope():
        self.inputs = tf.placeholder(tf.float32,
                                     [None, None, self.input_size],
                                     "encoder_input")
        self._input_lengths = tf.placeholder(tf.int32, [None],
                                             "encoder_padding_lengths")
        self.states_mask = tf.sequence_mask(self._input_lengths,
                                            dtype=tf.float32)

        states = self.inputs
        states_reversed = False

        def reverse_states():
            nonlocal states, states_reversed
            states = tf.reverse_sequence(states, self._input_lengths,
                                         batch_dim=0, seq_dim=1)
            states_reversed = not states_reversed

        for i, layer in enumerate(self._rnn_layers):
            with tf.variable_scope("rnn_{}_{}".format(i, layer.direction)):
                if layer.direction == "bidirectional":
                    fw_cell = _make_rnn_cell(layer)
                    bw_cell = _make_rnn_cell(layer)
                    outputs_tup, encoded_tup = (
                        tf.nn.bidirectional_dynamic_rnn(
                            fw_cell, bw_cell, states,
                            self._input_lengths, dtype=tf.float32))

                    if states_reversed:
                        # treat forward as backward and vice versa
                        outputs_tup = tuple(reversed(outputs_tup))
                        encoded_tup = tuple(reversed(encoded_tup))
                        states_reversed = False

                    states = tf.concat(outputs_tup, 2)
                    encoded = tf.concat(encoded_tup, 1)
                elif layer.direction in ["forward", "backward"]:
                    should_be_reversed = (layer.direction == "backward")
                    if states_reversed != should_be_reversed:
                        reverse_states()

                    cell = _make_rnn_cell(layer)
                    states, encoded = tf.nn.dynamic_rnn(
                        cell, states,
                        sequence_length=self._input_lengths,
                        dtype=tf.float32)
                else:
                    raise ValueError("Unknown RNN direction {}".format(
                        layer.direction))

            if i < len(self._rnn_layers) - 1:
                states = dropout(states, self.dropout_keep_prob,
                                 self.train_mode)

        if states_reversed:
            reverse_states()

        self.hidden_states = states
        self.encoded = encoded

    log("RNN encoder initialized")
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    blank_symbol: str,
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 3. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 4. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
    )
    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(asr_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 6. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type,
                                        bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    # 7. Start for-loop
    # FIXME(kamo): The output format should be discussed
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                # a. To device
                batch = to_device(batch, device)

                # b. Forward Encoder
                enc, _ = asr_model.encode(**batch)
                assert len(enc) == batch_size, len(enc)

                # c. Pass the encoder result to the beam search
                nbest_hyps = beam_search(
                    x=enc[0], maxlenratio=maxlenratio,
                    minlenratio=minlenratio
                )
                nbest_hyps = nbest_hyps[:nbest]

            # Only supporting batch_size==1
            key = keys[0]
            for n in range(1, nbest + 1):
                hyp = nbest_hyps[n - 1]
                assert isinstance(hyp, Hypothesis), type(hyp)

                # remove sos/eos and get results
                token_int = hyp.yseq[1:-1].tolist()

                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0, token_int))

                # Change integer-ids to tokens
                token = converter.ids2tokens(token_int)

                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each file
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(
                    map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if tokenizer is not None:
                    text = tokenizer.tokens2text(token)
                    ibest_writer["text"][key] = text
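# Hypothetical invocation sketch (all values illustrative; the keyword
# arguments come from the signature above): decode a single-utterance scp
# with a trained model on CPU.
#
#     inference(
#         output_dir="decode_out", maxlenratio=0.0, minlenratio=0.0,
#         batch_size=1, dtype="float32", beam_size=20, ngpu=0, seed=0,
#         ctc_weight=0.5, lm_weight=1.0, penalty=0.0, nbest=1,
#         num_workers=1, log_level="INFO",
#         data_path_and_name_and_type=[
#             ("dump/test/wav.scp", "speech", "sound")],
#         key_file=None, asr_train_config="exp/asr/config.yaml",
#         asr_model_file="exp/asr/valid.acc.best.pth",
#         lm_train_config=None, lm_file=None, word_lm_train_config=None,
#         word_lm_file=None, blank_symbol="<blank>", token_type=None,
#         bpemodel=None, allow_variable_data_keys=False)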
def run_application(component: Component, *, event_loop_policy: str = None, max_threads: int = None, logging: Union[Dict[str, Any], int, None] = INFO): """ Configure logging and start the given root component in the default asyncio event loop. Assuming the root component was started successfully, the event loop will continue running until the process is terminated. Initializes the logging system first based on the value of ``logging``: * If the value is a dictionary, it is passed to :func:`logging.config.dictConfig` as argument. * If the value is an integer, it is passed to :func:`logging.basicConfig` as the logging level. * If the value is ``None``, logging setup is skipped entirely. By default, the logging system is initialized using :func:`~logging.basicConfig` using the ``INFO`` logging level. The default executor in the event loop is replaced with a new :class:`~concurrent.futures.ThreadPoolExecutor` where the maximum number of threads is set to the value of ``max_threads`` or, if omitted, the return value of :func:`os.cpu_count()`. :param component: the root component :param event_loop_policy: entry point name (from the ``asphalt.core.event_loop_policies`` namespace) of an alternate event loop policy (or a module:varname reference to one) :param max_threads: the maximum number of worker threads in the default thread pool executor (the default value depends on the event loop implementation) :param logging: a logging configuration dictionary, :ref:`logging level <python:levels>` or ``None`` """ assert check_argument_types() # Configure the logging system if isinstance(logging, dict): dictConfig(logging) elif isinstance(logging, int): basicConfig(level=logging) # Switch to an alternate event loop policy if one was provided logger = getLogger(__name__) if event_loop_policy: create_policy = policies.resolve(event_loop_policy) policy = create_policy() asyncio.set_event_loop_policy(policy) logger.info('Switched event loop policy to %s', qualified_name(policy)) # Assign a new default executor with the given max worker thread limit if one was provided event_loop = asyncio.get_event_loop() if max_threads is not None: event_loop.set_default_executor(ThreadPoolExecutor(max_threads)) logger.info('Installed a new thread pool executor with max_workers=%d', max_threads) logger.info('Starting application') context = Context() exception = None try: try: event_loop.run_until_complete(component.start(context)) except Exception as e: exception = e logger.exception('Error during application startup') else: # Enable garbage collection of the component tree del component # Finally, run the event loop until the process is terminated or Ctrl+C is pressed event_loop.run_forever() except (KeyboardInterrupt, SystemExit): pass finally: # Cancel all running tasks for task in asyncio.Task.all_tasks(event_loop): task.cancel() # Run all the finish callbacks future = context.finished.dispatch(exception, return_future=True) event_loop.run_until_complete(future) event_loop.close() logger.info('Application stopped') if exception is not None: sys.exit(1)
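A minimal usage sketch, assuming the asphalt distribution that exports Component (with an async start(ctx) hook) and run_application from asphalt.core; the component class and its body are illustrative only:

# Hypothetical root component; run_application blocks until the process is
# terminated (Ctrl+C takes the KeyboardInterrupt path above).
from logging import INFO
from asphalt.core import Component, run_application

class HelloComponent(Component):
    async def start(self, ctx):
        print("application started")

run_application(HelloComponent(), logging=INFO, max_threads=8)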
def __init__( self, asr_train_config: Union[Path, str], asr_model_file: Union[Path, str] = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, batch_size: int = 1, dtype: str = "float32", beam_size: int = 20, ctc_weight: float = 0.5, lm_weight: float = 1.0, penalty: float = 0.0, nbest: int = 1, ): assert check_argument_types() # 1. Build ASR model scorers = {} asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, device) asr_model.eval() decoder = asr_model.decoder ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) token_list = asr_model.token_list scorers.update( decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)), ) # 2. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device) scorers["lm"] = lm.lm # 3. Build BeamSearch object weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=lm_weight, length_bonus=penalty, ) beam_search = BeamSearch( beam_size=beam_size, weights=weights, scorers=scorers, sos=asr_model.sos, eos=asr_model.eos, vocab_size=len(token_list), token_list=token_list, pre_beam_score_key=None if ctc_weight == 1.0 else "full", ) # TODO(karita): make all scorers batchfied if batch_size == 1: non_batch = [ k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] if len(non_batch) == 0: beam_search.__class__ = BatchBeamSearch logging.info("BatchBeamSearch implementation is selected.") else: logging.warning(f"As non-batch scorers {non_batch} are found, " f"fall back to non-batch implementation.") beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") self.asr_model = asr_model self.asr_train_args = asr_train_args self.converter = converter self.tokenizer = tokenizer self.beam_search = beam_search self.maxlenratio = maxlenratio self.minlenratio = minlenratio self.device = device self.dtype = dtype self.nbest = nbest
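This constructor only wires model, beam search, and tokenizer together; decoding happens in __call__, which (as the inference() wrapper further below unpacks it) is assumed to return an n-best list of (text, token, token_int, hyp) tuples. A hedged usage sketch with placeholder paths:

# Paths and the waveform file are placeholders; __call__ is not shown here,
# its return shape is inferred from the inference() wrapper below.
import soundfile

speech2text = Speech2Text(
    asr_train_config="exp/asr_train/config.yaml",
    asr_model_file="exp/asr_train/valid.acc.best.pth",
    beam_size=20,
    ctc_weight=0.5,
    nbest=1,
)
speech, rate = soundfile.read("utt1.wav")  # 1-D float array of samples
text, tokens, token_ints, hyp = speech2text(speech)[0]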
def __call__( self, uid: str, data: Dict[str, Union[str, np.ndarray]]) -> Dict[str, np.ndarray]: assert check_argument_types() if self.speech_name in data: if self.train and self.rirs is not None and self.noises is not None: speech = data[self.speech_name] nsamples = len(speech) # speech: (Nmic, Time) if speech.ndim == 1: speech = speech[None, :] else: speech = speech.T # Calc power on non-silence region power = (speech[detect_non_silence(speech)]**2).mean() # 1. Convolve RIR if self.rirs is not None and self.rir_apply_prob >= np.random.random(): rir_path = np.random.choice(self.rirs) if rir_path is not None: rir, _ = soundfile.read(rir_path, dtype=np.float64, always_2d=True) # rir: (Nmic, Time) rir = rir.T # speech: (Nmic, Time) # Note that this operation doesn't change the signal length speech = scipy.signal.convolve( speech, rir, mode="full")[:, :speech.shape[1]] # Restore the original mean power power2 = (speech[detect_non_silence(speech)]**2).mean() speech = np.sqrt(power / max(power2, 1e-10)) * speech # 2. Add Noise if (self.noises is not None and self.noise_apply_prob >= np.random.random()): noise_path = np.random.choice(self.noises) if noise_path is not None: noise_db = np.random.uniform(self.noise_db_low, self.noise_db_high) with soundfile.SoundFile(noise_path) as f: if f.frames == nsamples: noise = f.read(dtype=np.float64, always_2d=True) elif f.frames < nsamples: offset = np.random.randint( 0, nsamples - f.frames) # noise: (Time, Nmic) noise = f.read(dtype=np.float64, always_2d=True) # Repeat noise noise = np.pad( noise, [(offset, nsamples - f.frames - offset), (0, 0)], mode="wrap", ) else: offset = np.random.randint( 0, f.frames - nsamples) f.seek(offset) # noise: (Time, Nmic) noise = f.read(nsamples, dtype=np.float64, always_2d=True) if len(noise) != nsamples: raise RuntimeError( f"Something went wrong: {noise_path}") # noise: (Nmic, Time) noise = noise.T noise_power = (noise**2).mean() scale = (10**(-noise_db / 20) * np.sqrt(power) / np.sqrt(max(noise_power, 1e-10))) speech = speech + scale * noise speech = speech.T ma = np.max(np.abs(speech)) if ma > 1.0: speech /= ma data[self.speech_name] = speech if self.speech_volume_normalize is not None: speech = data[self.speech_name] ma = np.max(np.abs(speech)) data[self.speech_name] = speech * self.speech_volume_normalize / ma if self.text_name in data and self.tokenizer is not None: text = data[self.text_name] text = self.text_cleaner(text) tokens = self.tokenizer.text2tokens(text) text_ints = self.token_id_converter.tokens2ids(tokens) data[self.text_name] = np.array(text_ints, dtype=np.int64) assert check_return_type(data) return data
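The noise branch above solves for the gain that realizes the sampled SNR: scale = 10^(-noise_db/20) * sqrt(P_speech / P_noise). A standalone numpy check with synthetic signals:

import numpy as np

rng = np.random.default_rng(0)
speech = rng.standard_normal(16000)
noise = 0.3 * rng.standard_normal(16000)
snr_db = 10.0  # stands in for noise_db sampled from [noise_db_low, noise_db_high]

power = (speech ** 2).mean()
noise_power = (noise ** 2).mean()
scale = 10 ** (-snr_db / 20) * np.sqrt(power) / np.sqrt(max(noise_power, 1e-10))
mixed = speech + scale * noise

# The realized SNR equals the target, up to floating point error
realized = 10 * np.log10(power / ((scale * noise) ** 2).mean())
print(round(realized, 6))  # ~10.0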
def inference( output_dir: str, maxlenratio: float, minlenratio: float, batch_size: int, dtype: str, beam_size: int, ngpu: int, seed: int, ctc_weight: float, lm_weight: float, penalty: float, nbest: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], asr_train_config: str, asr_model_file: str, lm_train_config: Optional[str], lm_file: Optional[str], word_lm_train_config: Optional[str], word_lm_file: Optional[str], token_type: Optional[str], bpemodel: Optional[str], allow_variable_data_keys: bool, ): assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: raise NotImplementedError("Word LM is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build speech2text speech2text = Speech2Text( asr_train_config=asr_train_config, asr_model_file=asr_model_file, lm_train_config=lm_train_config, lm_file=lm_file, token_type=token_type, bpemodel=bpemodel, device=device, maxlenratio=maxlenratio, minlenratio=minlenratio, dtype=dtype, beam_size=beam_size, ctc_weight=ctc_weight, lm_weight=lm_weight, penalty=penalty, nbest=nbest, ) # 3. Build data-iterator loader = ASRTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args), allow_variable_data_keys=allow_variable_data_keys, inference=True, ) # 4. Start the for-loop # FIXME(kamo): The output format should be discussed with DatadirWriter(output_dir) as writer: for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" batch = { k: v[0] for k, v in batch.items() if not k.endswith("_lengths") } # N-best list of (text, token, token_int, hyp_object) results = speech2text(**batch) # Only supporting batch_size==1 key = keys[0] for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results): # Create a directory: outdir/{n}best_recog ibest_writer = writer[f"{n}best_recog"] # Write the result to each file ibest_writer["token"][key] = " ".join(token) ibest_writer["token_int"][key] = " ".join(map(str, token_int)) ibest_writer["score"][key] = str(hyp.score) if text is not None: ibest_writer["text"][key] = text
def __init__(self, num_sessions: int, num_threads: int, save_n_best: int = 1, minimize_metric: bool = False, variable_files: Optional[List[str]] = None, gpu_allow_growth: bool = True, per_process_gpu_memory_fraction: float = 1.0, enable_tf_debug: bool = False) -> None: """Initialize a TensorflowManager. At this moment the graph must already exist. This method creates the required number of TensorFlow sessions and restores them from the given variable files, if provided. Args: num_sessions: Number of sessions to be initialized. num_threads: Number of threads sessions will run in. save_n_best: How many best models to keep. minimize_metric: Whether the best model is the one with the lowest or the highest score. variable_files: List of variable files. gpu_allow_growth: TF to allocate incrementally, not all at once. per_process_gpu_memory_fraction: Limit TF memory use. """ check_argument_types() session_cfg = tf.ConfigProto() session_cfg.inter_op_parallelism_threads = num_threads session_cfg.intra_op_parallelism_threads = num_threads session_cfg.allow_soft_placement = True # needed for multiple GPUs # pylint: disable=no-member session_cfg.gpu_options.allow_growth = gpu_allow_growth session_cfg.gpu_options.per_process_gpu_memory_fraction = \ per_process_gpu_memory_fraction if save_n_best < 1: raise Exception("save_n_best parameter must be greater than zero") self.saver_max_to_keep = save_n_best self.minimize_metric = minimize_metric self.sessions = [ tf.Session(config=session_cfg) for _ in range(num_sessions) ] if enable_tf_debug: self.sessions = [ tf_debug.LocalCLIDebugWrapperSession(sess) for sess in self.sessions ] init_op = tf.global_variables_initializer() for sess in self.sessions: sess.run(init_op) self.saver = tf.train.Saver(max_to_keep=self.saver_max_to_keep, var_list=[ g for g in tf.global_variables() if "reward_" not in g.name ]) if variable_files: if len(variable_files) != num_sessions: raise Exception( ("The number of provided variable files ({}) " "is different from the number of sessions ({})").format( len(variable_files), num_sessions)) self.restore(variable_files) self.best_score_index = 0 self.best_score_epoch = 0 self.best_score_batch = 0 init_score = np.inf if self.minimize_metric else -np.inf self.saved_scores = [init_score for _ in range(self.saver_max_to_keep)] self.best_score = init_score self.variables_files = [] # type: List[str] self._best_vars_file = None # type: Optional[str]
def __init__(self, name: str, input_sequence: Sequence, segment_size: int, highway_depth: int, rnn_size: int, filters: List[Tuple[int, int]], dropout_keep_prob: float = 1.0, use_noisy_activations: bool = False, save_checkpoint: Optional[str] = None, load_checkpoint: Optional[str] = None) -> None: """Create a new instance of the sentence encoder. Arguments: name: A unique identifier for this encoder. input_sequence: The input sequence to encode. segment_size: The size of the segments over which we apply max-pooling. highway_depth: Depth of the highway layer. rnn_size: The size of the encoder's hidden state. Note that the actual encoder output state size will be twice as long because it is the result of concatenation of forward and backward hidden states. filters: Specification of CNN filters. It is a list of tuples specifying the filter size and number of channels. Keyword arguments: dropout_keep_prob: The dropout keep probability (default 1.0) """ ModelPart.__init__(self, name, save_checkpoint, load_checkpoint) check_argument_types() self.input_sequence = input_sequence self.segment_size = segment_size self.highway_depth = highway_depth self.rnn_size = rnn_size self.filters = filters self.dropout_keep_prob = dropout_keep_prob self.use_noisy_activations = use_noisy_activations if dropout_keep_prob <= 0. or dropout_keep_prob > 1.: raise ValueError(("Dropout keep probability must be " "in (0; 1], was {}").format(dropout_keep_prob)) if rnn_size <= 0: raise ValueError("RNN size must be a positive integer.") if highway_depth <= 0: raise ValueError("Highway depth must be a positive integer.") if segment_size <= 0: raise ValueError("Segment size must be a positive integer.") if not filters: raise ValueError("You must specify convolutional filters.") for filter_size, num_filters in self.filters: if filter_size <= 0: raise ValueError("Filter size must be a positive integer.") if num_filters <= 0: raise ValueError("Number of filters must be a positive int.")
def inference( output_dir: str, batch_size: int, dtype: str, fs: int, ngpu: int, seed: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], enh_train_config: str, enh_model_file: str, allow_variable_data_keys: bool, normalize_output_wav: bool, ): assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build Enh model enh_model, enh_train_args = EnhancementTask.build_model_from_file( enh_train_config, enh_model_file, device) enh_model.eval() num_spk = enh_model.num_spk # 3. Build data-iterator loader = EnhancementTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=EnhancementTask.build_preprocess_fn( enh_train_args, False), collate_fn=EnhancementTask.build_collate_fn(enh_train_args, False), allow_variable_data_keys=allow_variable_data_keys, inference=True, ) writers = [] for i in range(num_spk): writers.append( SoundScpWriter(f"{output_dir}/wavs/{i + 1}", f"{output_dir}/spk{i + 1}.scp")) for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" with torch.no_grad(): # a. To device batch = to_device(batch, device) # b. Forward Enhancement Frontend feats, f_lens = enh_model.encoder(batch["speech_mix"], batch["speech_mix_lengths"]) feats, _, _ = enh_model.separator(feats, f_lens) waves = [ enh_model.decoder(f, batch["speech_mix_lengths"])[0] for f in feats ] assert len(waves[0]) == batch_size, len(waves[0]) # FIXME(Chenda): will be incorrect when # batch size is not 1 or multi-channel case if normalize_output_wav: waves = [ (w / abs(w).max(dim=1, keepdim=True)[0] * 0.9).T.cpu().numpy() for w in waves ] # list[(sample,batch)] else: waves = [w.T.cpu().numpy() for w in waves] for (i, w) in enumerate(waves): writers[i][keys[0]] = fs, w for writer in writers: writer.close()
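The normalize_output_wav branch rescales each separated waveform to a 0.9 peak before writing; the same operation in isolation:

import torch

wave = 3.0 * torch.randn(1, 16000)  # (batch, samples), possibly clipping
peak = wave.abs().max(dim=1, keepdim=True)[0]
normalized = wave / peak * 0.9
print(round(float(normalized.abs().max()), 6))  # 0.9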
def __init__( self, input_size: int, output_size: int = 256, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, attention_dropout_rate: float = 0.0, input_layer: Optional[str] = "conv2d", normalize_before: bool = True, concat_after: bool = False, positionwise_layer_type: str = "linear", positionwise_conv_kernel_size: int = 3, macaron_style: bool = False, pos_enc_class=StreamPositionalEncoding, selfattention_layer_type: str = "rel_selfattn", activation_type: str = "swish", use_cnn_module: bool = True, cnn_module_kernel: int = 31, padding_idx: int = -1, block_size: int = 40, hop_size: int = 16, look_ahead: int = 16, init_average: bool = True, ctx_pos_enc: bool = True, ): assert check_argument_types() super().__init__() self._output_size = output_size self.pos_enc = pos_enc_class(output_size, positional_dropout_rate) activation = get_activation(activation_type) if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(input_size, output_size), torch.nn.LayerNorm(output_size), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), ) self.subsample = 1 elif input_layer == "conv2d": self.embed = Conv2dSubsamplingWOPosEnc( input_size, output_size, dropout_rate, kernels=[3, 3], strides=[2, 2] ) self.subsample = 4 elif input_layer == "conv2d6": self.embed = Conv2dSubsamplingWOPosEnc( input_size, output_size, dropout_rate, kernels=[3, 5], strides=[2, 3] ) self.subsample = 6 elif input_layer == "conv2d8": self.embed = Conv2dSubsamplingWOPosEnc( input_size, output_size, dropout_rate, kernels=[3, 3, 3], strides=[2, 2, 2], ) self.subsample = 8 elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx), ) self.subsample = 1 elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(output_size, positional_dropout_rate), ) self.subsample = 1 elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(output_size, positional_dropout_rate) ) self.subsample = 1 else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = ( output_size, linear_units, dropout_rate, ) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") convolution_layer = ConvolutionModule convolution_layer_args = (output_size, cnn_module_kernel, activation) self.encoders = repeat( num_blocks, lambda lnum: ContextualBlockEncoderLayer( output_size, MultiHeadedAttention( attention_heads, output_size, attention_dropout_rate ), positionwise_layer(*positionwise_layer_args), positionwise_layer(*positionwise_layer_args) if macaron_style else None, convolution_layer(*convolution_layer_args) if use_cnn_module else None, dropout_rate, num_blocks, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(output_size) # for block processing self.block_size = block_size self.hop_size = hop_size self.look_ahead = look_ahead self.init_average = init_average 
self.ctx_pos_enc = ctx_pos_enc
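A rough sketch of how block_size and hop_size could tile a feature sequence for block processing; the tiling is illustrative of the parameter roles only (in the encoder above, the trailing look_ahead frames of each block would serve as right context), not the exact indexing this class uses:

def block_ranges(num_frames: int, block_size: int = 40, hop_size: int = 16):
    """Yield (start, end) of overlapping blocks; each block advances by
    hop_size and spans block_size frames (the tail blocks are clipped)."""
    start = 0
    while start < num_frames:
        yield start, min(start + block_size, num_frames)
        start += hop_size

print(list(block_ranges(100)))
# [(0, 40), (16, 56), (32, 72), (48, 88), (64, 100), (80, 100), (96, 100)]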
def rl_objective(decoder: Decoder, reward_function: RewardFunction, subtract_baseline: bool = False, normalize: bool = False, temperature: float = 1., ce_smoothing: float = 0., alpha: float = 1., sample_size: int = 1) -> Objective: """Construct RL objective for training with sentence-level feedback. Depending on the options the objective corresponds to: 1) sample_size = 1, normalize = False, ce_smoothing = 0.0 Bandit objective (Eq. 2) described in 'Bandit Structured Prediction for Neural Sequence-to-Sequence Learning' (http://www.aclweb.org/anthology/P17-1138) It's recommended to set subtract_baseline = True. 2) sample_size > 1, normalize = True, ce_smoothing = 0.0 Minimum Risk Training as described in 'Minimum Risk Training for Neural Machine Translation' (http://www.aclweb.org/anthology/P16-1159) (Eq. 12). 3) sample_size > 1, normalize = False, ce_smoothing = 0.0 The Google 'Reinforce' objective as proposed in 'Google’s NMT System: Bridging the Gap between Human and Machine Translation' (https://arxiv.org/pdf/1609.08144.pdf) (Eq. 8). 4) sample_size > 1, normalize = False, ce_smoothing > 0.0 Google's 'Mixed' objective in the above paper (Eq. 9), where ce_smoothing implements alpha. Note that 'alpha' controls the sharpness of the normalized distribution, while 'temperature' controls the sharpness during sampling. :param decoder: a recurrent decoder to sample from :param reward_function: any evaluator object :param subtract_baseline: avg reward is subtracted from obtained reward :param normalize: the probabilities of the samples are re-normalized :param sample_size: number of samples to obtain feedback for :param ce_smoothing: add cross-entropy loss with this coefficient to loss :param alpha: determines the shape of the normalized distribution :param temperature: the softmax temperature for sampling :return: Objective object to be used in generic trainer """ check_argument_types() reference = decoder.train_inputs def _score_with_reward_function(references: np.ndarray, hypotheses: np.ndarray) -> np.ndarray: """Score (time, batch) arrays with sentence-based reward function. Parts of the sentence after generated <pad> or </s> are ignored. BPE-postprocessing is also included. 
:param references: array of indices of references, shape (time, batch) :param hypotheses: array of indices of hypotheses, shape (time, batch) :return: an array of batch length with float rewards """ rewards = [] for refs, hyps in zip(references.transpose(), hypotheses.transpose()): ref_seq = [] hyp_seq = [] for r_token in refs: token = decoder.vocabulary.index_to_word[r_token] if token in (END_TOKEN, PAD_TOKEN): break ref_seq.append(token) for h_token in hyps: token = decoder.vocabulary.index_to_word[h_token] if token in (END_TOKEN, PAD_TOKEN): break hyp_seq.append(token) # join BPEs, split on " " to prepare list for evaluator refs_tokens = " ".join(ref_seq).replace("@@ ", "").split(" ") hyps_tokens = " ".join(hyp_seq).replace("@@ ", "").split(" ") reward = float(reward_function([hyps_tokens], [refs_tokens])) rewards.append(reward) return np.array(rewards, dtype=np.float32) samples_rewards = [] samples_logprobs = [] for _ in range(sample_size): # sample from logits # decoded, shape (time, batch) sample_loop_result = decoder.decoding_loop(train_mode=False, sample=True, temperature=temperature) sample_logits = sample_loop_result[0] sample_decoded = sample_loop_result[3] # rewards, shape (batch) # simulate from reference sample_reward = tf.py_func(_score_with_reward_function, [reference, sample_decoded], tf.float32) # pylint: disable=invalid-unary-operand-type word_logprobs = -tf.nn.sparse_softmax_cross_entropy_with_logits( labels=sample_decoded, logits=sample_logits) # sum word log prob to sentence log prob # no masking here, since otherwise shorter sentences are preferred sent_logprobs = tf.reduce_sum(word_logprobs, axis=0) samples_rewards.append(sample_reward) # sample_size x batch samples_logprobs.append(sent_logprobs) # sample_size x batch # stack samples, sample_size x batch samples_rewards_stacked = tf.stack(samples_rewards) samples_logprobs_stacked = tf.stack(samples_logprobs) if subtract_baseline: # if specified, compute the average reward baseline reward_counter = tf.Variable(0.0, trainable=False, name="reward_counter") reward_sum = tf.Variable(0.0, trainable=False, name="reward_sum") # increment the cumulative reward reward_counter = tf.assign_add( reward_counter, tf.to_float(decoder.batch_size * sample_size)) # sum over batch and samples reward_sum = tf.assign_add(reward_sum, tf.reduce_sum(samples_rewards_stacked)) # compute baseline: avg of previous rewards baseline = tf.div(reward_sum, tf.maximum(reward_counter, 1.0)) samples_rewards_stacked -= baseline tf.summary.scalar("train_{}/rl_reward_baseline".format( decoder.data_id), tf.reduce_mean(baseline), collections=["summary_train"]) if normalize: # normalize over sample space samples_logprobs_stacked = tf.nn.softmax(samples_logprobs_stacked * alpha, dim=0) scored_probs = tf.stop_gradient( tf.negative(samples_rewards_stacked)) * samples_logprobs_stacked # sum over samples total_loss = tf.reduce_sum(scored_probs, axis=0) # average over batch batch_loss = tf.reduce_mean(total_loss) if ce_smoothing > 0.0: batch_loss += tf.multiply(ce_smoothing, decoder.cost) tf.summary.scalar("train_{}/self_rl_cost".format(decoder.data_id), batch_loss, collections=["summary_train"]) return Objective(name="{}_rl".format(decoder.name), decoder=decoder, loss=batch_loss, gradients=None, weight=None)
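A numpy sketch of how the sampled rewards and sentence log-probabilities combine into the loss; shapes follow the code (sample_size x batch), and normalize/alpha mirror options 2-4 of the docstring:

import numpy as np

def rl_loss(rewards, logprobs, normalize=False, alpha=1.0):
    """rewards, logprobs: arrays of shape (sample_size, batch)."""
    if normalize:  # renormalize over the sample dimension (MRT, Eq. 12)
        e = np.exp(alpha * logprobs - (alpha * logprobs).max(axis=0))
        probs = e / e.sum(axis=0)
    else:          # plain REINFORCE-style weighting
        probs = logprobs
    scored = -rewards * probs         # rewards get stop_gradient in the TF code
    return scored.sum(axis=0).mean()  # sum over samples, average over batch

rng = np.random.default_rng(0)
rewards = rng.uniform(0, 1, size=(5, 3))
logprobs = -rng.uniform(0, 10, size=(5, 3))
print(rl_loss(rewards, logprobs, normalize=True, alpha=1.0))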
def __init__( self, g2p_type: Union[None, str], non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, space_symbol: str = "<space>", remove_non_linguistic_symbols: bool = False, ): assert check_argument_types() if g2p_type is None: self.g2p = split_by_space elif g2p_type == "g2p_en": self.g2p = G2p_en(no_space=False) elif g2p_type == "g2p_en_no_space": self.g2p = G2p_en(no_space=True) elif g2p_type == "pyopenjtalk": self.g2p = pyopenjtalk_g2p elif g2p_type == "pyopenjtalk_kana": self.g2p = pyopenjtalk_g2p_kana elif g2p_type == "pyopenjtalk_accent": self.g2p = pyopenjtalk_g2p_accent elif g2p_type == "pyopenjtalk_accent_with_pause": self.g2p = pyopenjtalk_g2p_accent_with_pause elif g2p_type == "pyopenjtalk_prosody": self.g2p = pyopenjtalk_g2p_prosody elif g2p_type == "pypinyin_g2p": self.g2p = pypinyin_g2p elif g2p_type == "pypinyin_g2p_phone": self.g2p = pypinyin_g2p_phone elif g2p_type == "espeak_ng_arabic": self.g2p = Phonemizer( language="ar", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_german": self.g2p = Phonemizer( language="de", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_french": self.g2p = Phonemizer( language="fr-fr", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_spanish": self.g2p = Phonemizer( language="es", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_russian": self.g2p = Phonemizer( language="ru", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_greek": self.g2p = Phonemizer( language="el", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_finnish": self.g2p = Phonemizer( language="fi", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_hungarian": self.g2p = Phonemizer( language="hu", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_dutch": self.g2p = Phonemizer( language="nl", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_hindi": self.g2p = Phonemizer( language="hi", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "g2pk": self.g2p = G2pk(no_space=False) elif g2p_type == "g2pk_no_space": self.g2p = G2pk(no_space=True) elif g2p_type == "espeak_ng_english_us_vits": # VITS official implementation-like processing # Reference: https://github.com/jaywalnut310/vits self.g2p = Phonemizer( language="en-us", backend="espeak", with_stress=True, preserve_punctuation=True, strip=True, word_separator=" ", phone_separator="", split_by_single_token=True, ) else: raise NotImplementedError(f"Not supported: g2p_type={g2p_type}") self.g2p_type = g2p_type self.space_symbol = space_symbol if non_linguistic_symbols is None: self.non_linguistic_symbols = set() elif isinstance(non_linguistic_symbols, (Path, str)): non_linguistic_symbols = Path(non_linguistic_symbols) try: with non_linguistic_symbols.open("r", encoding="utf-8") as f: self.non_linguistic_symbols = set(line.rstrip() for line in f) except FileNotFoundError: warnings.warn(f"{non_linguistic_symbols} doesn't exist.") self.non_linguistic_symbols = set() else: self.non_linguistic_symbols = set(non_linguistic_symbols) self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
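Assuming this __init__ belongs to a tokenizer class exposing text2tokens(), as the preprocessing __call__ above uses (the class name PhonemeTokenizer is taken here as an assumption), a usage sketch:

# Hypothetical usage; the g2p_en package must be installed for this backend.
tokenizer = PhonemeTokenizer(g2p_type="g2p_en")
print(tokenizer.text2tokens("hello"))  # e.g. ['HH', 'AH0', 'L', 'OW1']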
def scoring( output_dir: str, dtype: str, log_level: Union[int, str], key_file: str, ref_scp: List[str], inf_scp: List[str], ref_channel: int, flexible_numspk: bool, ): assert check_argument_types() logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) assert len(ref_scp) == len(inf_scp), ref_scp num_spk = len(ref_scp) keys = [ line.rstrip().split(maxsplit=1)[0] for line in open(key_file, encoding="utf-8") ] ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp] inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp] # get sample rate sample_rate, _ = ref_readers[0][keys[0]] # check keys if not flexible_numspk: for inf_reader, ref_reader in zip(inf_readers, ref_readers): assert inf_reader.keys() == ref_reader.keys() with DatadirWriter(output_dir) as writer: for key in keys: if not flexible_numspk: ref_audios = [ref_reader[key][1] for ref_reader in ref_readers] inf_audios = [inf_reader[key][1] for inf_reader in inf_readers] else: ref_audios = [ ref_reader[key][1] for ref_reader in ref_readers if key in ref_reader.keys() ] inf_audios = [ inf_reader[key][1] for inf_reader in inf_readers if key in inf_reader.keys() ] ref = np.array(ref_audios) inf = np.array(inf_audios) if ref.ndim > inf.ndim: # multi-channel reference and single-channel output ref = ref[..., ref_channel] elif ref.ndim < inf.ndim: # single-channel reference and multi-channel output inf = inf[..., ref_channel] elif ref.ndim == inf.ndim == 3: # multi-channel reference and output ref = ref[..., ref_channel] inf = inf[..., ref_channel] if not flexible_numspk: assert ref.shape == inf.shape, (ref.shape, inf.shape) else: # epsilon value to avoid divergence # caused by zero-value, e.g., log(0) eps = 0.000001 # if num_spk of ref > num_spk of inf if ref.shape[0] > inf.shape[0]: p = np.full((ref.shape[0] - inf.shape[0], inf.shape[1]), eps) inf = np.concatenate([inf, p]) num_spk = ref.shape[0] # if num_spk of ref < num_spk of inf elif ref.shape[0] < inf.shape[0]: p = np.full((inf.shape[0] - ref.shape[0], ref.shape[1]), eps) ref = np.concatenate([ref, p]) num_spk = inf.shape[0] else: num_spk = ref.shape[0] sdr, sir, sar, perm = bss_eval_sources(ref, inf, compute_permutation=True) for i in range(num_spk): stoi_score = stoi(ref[i], inf[int(perm[i])], fs_sig=sample_rate) estoi_score = stoi( ref[i], inf[int(perm[i])], fs_sig=sample_rate, extended=True ) si_snr_score = -float( si_snr_loss( torch.from_numpy(ref[i][None, ...]), torch.from_numpy(inf[int(perm[i])][None, ...]), ) ) writer[f"STOI_spk{i + 1}"][key] = str(stoi_score * 100) # in percentage writer[f"ESTOI_spk{i + 1}"][key] = str(estoi_score * 100) writer[f"SI_SNR_spk{i + 1}"][key] = str(si_snr_score) writer[f"SDR_spk{i + 1}"][key] = str(sdr[i]) writer[f"SAR_spk{i + 1}"][key] = str(sar[i]) writer[f"SIR_spk{i + 1}"][key] = str(sir[i]) # save permutation assigned script file if not flexible_numspk: writer[f"wav_spk{i + 1}"][key] = inf_readers[perm[i]].data[key]
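The script reports SI-SNR by negating si_snr_loss; for reference, a standalone numpy version of the standard scale-invariant SNR that loss is assumed to implement:

import numpy as np

def si_snr(ref: np.ndarray, est: np.ndarray, eps: float = 1e-8) -> float:
    """Scale-invariant SNR in dB (zero-mean, standard definition)."""
    ref = ref - ref.mean()
    est = est - est.mean()
    s_target = (est @ ref) / (ref @ ref + eps) * ref  # projection onto ref
    e_noise = est - s_target
    return float(10 * np.log10((s_target @ s_target) / (e_noise @ e_noise + eps)))

rng = np.random.default_rng(0)
clean = rng.standard_normal(8000)
print(si_snr(clean, 2.0 * clean))  # very large: rescaling does not hurt the score
print(si_snr(clean, clean + 0.1 * rng.standard_normal(8000)))  # roughly 20 dB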
def train_one_epoch( cls, model: torch.nn.Module, iterator: Iterable[Tuple[List[str], Dict[str, torch.Tensor]]], optimizers: Sequence[torch.optim.Optimizer], schedulers: Sequence[Optional[AbsScheduler]], scaler: Optional[GradScaler], reporter: SubReporter, summary_writer, options: TrainerOptions, distributed_option: DistributedOption, ) -> bool: assert check_argument_types() grad_noise = options.grad_noise accum_grad = options.accum_grad grad_clip = options.grad_clip grad_clip_type = options.grad_clip_type log_interval = options.log_interval no_forward_run = options.no_forward_run ngpu = options.ngpu use_wandb = options.use_wandb distributed = distributed_option.distributed if log_interval is None: try: log_interval = max(len(iterator) // 20, 10) except TypeError: log_interval = 100 model.train() all_steps_are_invalid = True # [For distributed] Because iteration counts are not always equals between # processes, send stop-flag to the other processes if iterator is finished iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu") start_time = time.perf_counter() for iiter, (_, batch) in enumerate( reporter.measure_iter_time(iterator, "iter_time"), 1 ): assert isinstance(batch, dict), type(batch) if distributed: torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) if iterator_stop > 0: break batch = to_device(batch, "cuda" if ngpu > 0 else "cpu") if no_forward_run: all_steps_are_invalid = False continue with autocast(scaler is not None): with reporter.measure_time("forward_time"): retval = model(**batch) # Note(kamo): # Supporting two patterns for the returned value from the model # a. dict type if isinstance(retval, dict): loss = retval["loss"] stats = retval["stats"] weight = retval["weight"] optim_idx = retval.get("optim_idx") if optim_idx is not None and not isinstance(optim_idx, int): if not isinstance(optim_idx, torch.Tensor): raise RuntimeError( "optim_idx must be int or 1dim torch.Tensor, " f"but got {type(optim_idx)}" ) if optim_idx.dim() >= 2: raise RuntimeError( "optim_idx must be int or 1dim torch.Tensor, " f"but got {optim_idx.dim()}dim tensor" ) if optim_idx.dim() == 1: for v in optim_idx: if v != optim_idx[0]: raise RuntimeError( "optim_idx must be 1dim tensor " "having same values for all entries" ) optim_idx = optim_idx[0].item() else: optim_idx = optim_idx.item() # b. tuple or list type else: loss, stats, weight = retval optim_idx = None stats = {k: v for k, v in stats.items() if v is not None} if ngpu > 1 or distributed: # Apply weighted averaging for loss and stats loss = (loss * weight.type(loss.dtype)).sum() # if distributed, this method can also apply all_reduce() stats, weight = recursive_average(stats, weight, distributed) # Now weight is summation over all workers loss /= weight if distributed: # NOTE(kamo): Multiply world_size because DistributedDataParallel # automatically normalizes the gradient by world_size. loss *= torch.distributed.get_world_size() loss /= accum_grad reporter.register(stats, weight) with reporter.measure_time("backward_time"): if scaler is not None: # Scales loss. Calls backward() on scaled loss # to create scaled gradients. # Backward passes under autocast are not recommended. # Backward ops run in the same dtype autocast chose # for corresponding forward ops. 
scaler.scale(loss).backward() else: loss.backward() if iiter % accum_grad == 0: if scaler is not None: # Unscales the gradients of optimizer's assigned params in-place for iopt, optimizer in enumerate(optimizers): if optim_idx is not None and iopt != optim_idx: continue scaler.unscale_(optimizer) # gradient noise injection if grad_noise: add_gradient_noise( model, reporter.get_total_count(), duration=100, eta=1.0, scale_factor=0.55, ) # compute the gradient norm to check if it is normal or not grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), max_norm=grad_clip, norm_type=grad_clip_type, ) # PyTorch<=1.4, clip_grad_norm_ returns float value if not isinstance(grad_norm, torch.Tensor): grad_norm = torch.tensor(grad_norm) if not torch.isfinite(grad_norm): logging.warning( f"The grad norm is {grad_norm}. Skipping updating the model." ) # Must invoke scaler.update() if unscale_() is used in the iteration # to avoid the following error: # RuntimeError: unscale_() has already been called # on this optimizer since the last update(). # Note that if the gradient has inf/nan values, # scaler.step skips optimizer.step(). if scaler is not None: for iopt, optimizer in enumerate(optimizers): if optim_idx is not None and iopt != optim_idx: continue scaler.step(optimizer) scaler.update() else: all_steps_are_invalid = False with reporter.measure_time("optim_step_time"): for iopt, (optimizer, scheduler) in enumerate( zip(optimizers, schedulers) ): if optim_idx is not None and iopt != optim_idx: continue if scaler is not None: # scaler.step() first unscales the gradients of # the optimizer's assigned params. scaler.step(optimizer) # Updates the scale for next iteration. scaler.update() else: optimizer.step() if isinstance(scheduler, AbsBatchStepScheduler): scheduler.step() for iopt, optimizer in enumerate(optimizers): if optim_idx is not None and iopt != optim_idx: continue optimizer.zero_grad() # Register lr and train/load time[sec/step], # where step refers to accum_grad * mini-batch reporter.register( dict( { f"optim{i}_lr{j}": pg["lr"] for i, optimizer in enumerate(optimizers) for j, pg in enumerate(optimizer.param_groups) if "lr" in pg }, train_time=time.perf_counter() - start_time, ), ) start_time = time.perf_counter() # NOTE(kamo): Call log_message() after next() reporter.next() if iiter % log_interval == 0: logging.info(reporter.log_message(-log_interval)) if summary_writer is not None: reporter.tensorboard_add_scalar(summary_writer, -log_interval) if use_wandb: reporter.wandb_log() else: if distributed: iterator_stop.fill_(1) torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM) return all_steps_are_invalid
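The accum_grad handling above scales each loss by 1/accum_grad and steps the optimizer every accum_grad iterations; the same pattern as a minimal standalone torch loop:

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_grad = 4

for iiter, batch in enumerate(torch.randn(16, 8, 4).unbind(0), 1):
    loss = model(batch).pow(2).mean() / accum_grad  # scale before backward
    loss.backward()                                  # gradients accumulate
    if iiter % accum_grad == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        optimizer.zero_grad()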
def __init__( self, st_train_config: Union[Path, str] = None, st_model_file: Union[Path, str] = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, ngram_scorer: str = "full", ngram_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, batch_size: int = 1, dtype: str = "float32", beam_size: int = 20, lm_weight: float = 1.0, ngram_weight: float = 0.9, penalty: float = 0.0, nbest: int = 1, enh_s2t_task: bool = False, ): assert check_argument_types() task = STTask if not enh_s2t_task else EnhS2TTask # 1. Build ST model scorers = {} st_model, st_train_args = task.build_model_from_file( st_train_config, st_model_file, device ) if enh_s2t_task: st_model.inherite_attributes( inherite_s2t_attrs=[ "ctc", "decoder", "eos", "joint_network", "sos", "token_list", "use_transducer_decoder", ] ) st_model.to(dtype=getattr(torch, dtype)).eval() decoder = st_model.decoder token_list = st_model.token_list scorers.update( decoder=decoder, length_bonus=LengthBonus(len(token_list)), ) # 2. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device ) scorers["lm"] = lm.lm # 3. Build ngram model if ngram_file is not None: if ngram_scorer == "full": from espnet.nets.scorers.ngram import NgramFullScorer ngram = NgramFullScorer(ngram_file, token_list) else: from espnet.nets.scorers.ngram import NgramPartScorer ngram = NgramPartScorer(ngram_file, token_list) else: ngram = None scorers["ngram"] = ngram # 4. Build BeamSearch object weights = dict( decoder=1.0, lm=lm_weight, ngram=ngram_weight, length_bonus=penalty, ) beam_search = BeamSearch( beam_size=beam_size, weights=weights, scorers=scorers, sos=st_model.sos, eos=st_model.eos, vocab_size=len(token_list), token_list=token_list, pre_beam_score_key="full", ) # TODO(karita): make all scorers batchfied if batch_size == 1: non_batch = [ k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] if len(non_batch) == 0: beam_search.__class__ = BatchBeamSearch logging.info("BatchBeamSearch implementation is selected.") else: logging.warning( f"As non-batch scorers {non_batch} are found, " f"fall back to non-batch implementation." ) beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = st_train_args.token_type if bpemodel is None: bpemodel = st_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") self.st_model = st_model self.st_train_args = st_train_args self.converter = converter self.tokenizer = tokenizer self.beam_search = beam_search self.maxlenratio = maxlenratio self.minlenratio = minlenratio self.device = device self.dtype = dtype self.nbest = nbest
def __init__(self, encoders: List[Any], vocabulary: Vocabulary, data_id: str, name: str, max_output_len: int, dropout_keep_prob: float=1.0, rnn_size: Optional[int]=None, embedding_size: Optional[int]=None, output_projection: Optional[Callable[ [tf.Tensor, tf.Tensor, List[tf.Tensor]], tf.Tensor]]=None, encoder_projection: Optional[Callable[ [tf.Tensor, Optional[int], Optional[List[Any]]], tf.Tensor]]=None, use_attention: bool=False, embeddings_encoder: Optional[Any]=None, attention_on_input: bool=True, rnn_cell: str='GRU', conditional_gru: bool=False, save_checkpoint: Optional[str]=None, load_checkpoint: Optional[str]=None) -> None: """Create a refactored version of the monster decoder. Arguments: encoders: Input encoders of the decoder vocabulary: Target vocabulary data_id: Target data series name: Name of the decoder. Should be unique across all Neural Monkey objects max_output_len: Maximum length of an output sequence dropout_keep_prob: Probability of keeping a value during dropout Keyword arguments: rnn_size: Size of the decoder hidden state, if None set according to encoders. embedding_size: Size of embedding vectors for target words output_projection: How to generate distribution over vocabulary from decoder rnn_outputs encoder_projection: How to construct initial state from encoders use_attention: Flag whether to look at attention vectors of the encoders embeddings_encoder: Encoder to take embeddings from rnn_cell: RNN Cell used by the decoder (GRU or LSTM) conditional_gru: Flag whether to use the Conditional GRU architecture attention_on_input: Flag whether attention from previous decoding step should be combined with the input in the next step. """ ModelPart.__init__(self, name, save_checkpoint, load_checkpoint) log("Initializing decoder, name: '{}'".format(name)) assert check_argument_types() self.encoders = encoders self.vocabulary = vocabulary self.data_id = data_id self.max_output_len = max_output_len self.dropout_keep_prob = dropout_keep_prob self.embedding_size = embedding_size self.rnn_size = rnn_size self.output_projection = output_projection self.encoder_projection = encoder_projection self.use_attention = use_attention self.embeddings_encoder = embeddings_encoder self._rnn_cell = rnn_cell if self.embedding_size is None and self.embeddings_encoder is None: raise ValueError("You must specify either embedding size or the " "encoder from which to reuse the embeddings (" "e.g. 
set either 'embedding_size' or " "'embeddings_encoder' parameter)") if self.embeddings_encoder is not None: if self.embedding_size is not None: warn("Overriding the embedding_size parameter with the" " size of the reused embeddings from the encoder.") self.embedding_size = ( self.embeddings_encoder.embedding_matrix.get_shape()[1].value) if self.encoder_projection is None: if len(self.encoders) == 0: log("No encoder - language model only.") self.encoder_projection = empty_initial_state elif rnn_size is None: log("No rnn_size or encoder_projection: Using concatenation of" " encoded states") self.encoder_projection = concat_encoder_projection self.rnn_size = sum(e.encoded.get_shape()[1].value for e in encoders) else: log("Using linear projection of encoders as the initial state") self.encoder_projection = linear_encoder_projection( self.dropout_keep_prob) assert self.rnn_size is not None if self.output_projection is None: log("No output projection specified - using simple concatenation") self.output_projection = no_deep_output with tf.variable_scope(name): self._create_input_placeholders() self._create_training_placeholders() self._create_initial_state() self._create_embedding_matrix() with tf.name_scope("output_projection"): self.decoding_w = tf.get_variable( "state_to_word_W", [self.rnn_size, len(self.vocabulary)], initializer=tf.random_uniform_initializer(-0.5, 0.5)) self.decoding_b = tf.get_variable( "state_to_word_b", [len(self.vocabulary)], initializer=tf.constant_initializer( - math.log(len(self.vocabulary)))) # The last train input is not used in the decoding function # (it serves only as the target) embedded_train_inputs = self._embed_and_dropout( self.train_inputs[:-1]) # NOTE: no dropout is applied here embedded_go_symbols = tf.nn.embedding_lookup(self.embedding_matrix, self.go_symbols) # fetch train attention objects self._train_attention_objects = {} # type: Dict[Attentive, tf.Tensor] if self.use_attention: with tf.name_scope("attention_object"): self._train_attention_objects = { e: e.create_attention_object() for e in self.encoders if isinstance(e, Attentive)} train_rnn_outputs, _ = self._attention_decoder( embedded_go_symbols, attention_on_input=attention_on_input, conditional_gru=conditional_gru, train_inputs=embedded_train_inputs, train_mode=True) assert not tf.get_variable_scope().reuse tf.get_variable_scope().reuse_variables() # fetch runtime attention objects self._runtime_attention_objects = {} # type: Dict[Attentive, tf.Tensor] if self.use_attention: self._runtime_attention_objects = { e: e.create_attention_object() for e in self.encoders if isinstance(e, Attentive)} (self.runtime_rnn_outputs, self.runtime_rnn_states) = self._attention_decoder( embedded_go_symbols, attention_on_input=attention_on_input, conditional_gru=conditional_gru, train_mode=False) self.hidden_states = self.runtime_rnn_outputs def decode(rnn_outputs: List[tf.Tensor]) -> Tuple[ List[tf.Tensor], List[tf.Tensor]]: with tf.name_scope("output_projection"): logits = [] decoded = [] for out in rnn_outputs: out_activation = self._logit_function(out) logits.append(out_activation) decoded.append(tf.argmax(out_activation[:, 1:], 1) + 1) return decoded, logits _, self.train_logits = decode(train_rnn_outputs) train_targets = tf.transpose(self.train_inputs) self.train_xents = tf.contrib.seq2seq.sequence_loss( tf.stack(self.train_logits, 1), train_targets, tf.transpose(self.train_padding), average_across_batch=False) self.train_loss = tf.reduce_mean(self.train_xents) self.cost = self.train_loss self.train_logprobs = 
[tf.nn.log_softmax(l) for l in self.train_logits] self.decoded, self.runtime_logits = decode( self.runtime_rnn_outputs) self.runtime_loss = tf.contrib.seq2seq.sequence_loss( tf.stack(self.runtime_logits, 1), train_targets, tf.transpose(self.train_padding)) self.runtime_logprobs = [tf.nn.log_softmax(l) for l in self.runtime_logits] self._visualize_attention() log("Decoder initialized.")
def run( cls, model: AbsESPnetModel, optimizers: Sequence[torch.optim.Optimizer], schedulers: Sequence[Optional[AbsScheduler]], train_iter_factory: AbsIterFactory, valid_iter_factory: AbsIterFactory, plot_attention_iter_factory: Optional[AbsIterFactory], trainer_options, distributed_option: DistributedOption, ) -> None: """Perform training. This method performs the main process of training.""" assert check_argument_types() # NOTE(kamo): Don't check the type of trainer_options more strictly assert is_dataclass(trainer_options), type(trainer_options) assert len(optimizers) == len(schedulers), (len(optimizers), len(schedulers)) if isinstance(trainer_options.keep_nbest_models, int): keep_nbest_models = [trainer_options.keep_nbest_models] else: if len(trainer_options.keep_nbest_models) == 0: logging.warning("No keep_nbest_models given. Falling back to [1]") trainer_options.keep_nbest_models = [1] keep_nbest_models = trainer_options.keep_nbest_models output_dir = Path(trainer_options.output_dir) reporter = Reporter() if trainer_options.use_amp: if LooseVersion(torch.__version__) < LooseVersion("1.6.0"): raise RuntimeError( "torch>=1.6.0 is required for Automatic Mixed Precision" ) if trainer_options.sharded_ddp: if fairscale is None: raise RuntimeError( "fairscale is required. Run 'pip install fairscale'" ) scaler = fairscale.optim.grad_scaler.ShardedGradScaler() else: scaler = GradScaler() else: scaler = None if trainer_options.resume and (output_dir / "checkpoint.pth").exists(): cls.resume( checkpoint=output_dir / "checkpoint.pth", model=model, optimizers=optimizers, schedulers=schedulers, reporter=reporter, scaler=scaler, ngpu=trainer_options.ngpu, ) start_epoch = reporter.get_epoch() + 1 if start_epoch == trainer_options.max_epoch + 1: logging.warning( f"The training has already reached max_epoch: {start_epoch}" ) if distributed_option.distributed: if trainer_options.sharded_ddp: dp_model = fairscale.nn.data_parallel.ShardedDataParallel( module=model, sharded_optimizer=optimizers, ) else: dp_model = torch.nn.parallel.DistributedDataParallel( model, device_ids=( # Perform multi-Process with multi-GPUs [torch.cuda.current_device()] if distributed_option.ngpu == 1 # Perform single-Process with multi-GPUs else None ), output_device=( torch.cuda.current_device() if distributed_option.ngpu == 1 else None ), find_unused_parameters=trainer_options.unused_parameters, ) elif distributed_option.ngpu > 1: dp_model = torch.nn.parallel.DataParallel( model, device_ids=list(range(distributed_option.ngpu)), ) else: # NOTE(kamo): DataParallel should also work with ngpu=1, # but for debuggability it's better to keep this block. dp_model = model if trainer_options.use_tensorboard and ( not distributed_option.distributed or distributed_option.dist_rank == 0 ): from torch.utils.tensorboard import SummaryWriter train_summary_writer = SummaryWriter( str(output_dir / "tensorboard" / "train") ) valid_summary_writer = SummaryWriter( str(output_dir / "tensorboard" / "valid") ) else: train_summary_writer = None start_time = time.perf_counter() for iepoch in range(start_epoch, trainer_options.max_epoch + 1): if iepoch != start_epoch: logging.info( "{}/{}epoch started. 
Estimated time to finish: {}".format( iepoch, trainer_options.max_epoch, humanfriendly.format_timespan( (time.perf_counter() - start_time) / (iepoch - start_epoch) * (trainer_options.max_epoch - iepoch + 1) ), ) ) else: logging.info(f"{iepoch}/{trainer_options.max_epoch}epoch started") set_all_random_seed(trainer_options.seed + iepoch) reporter.set_epoch(iepoch) # 1. Train and validation for one-epoch with reporter.observe("train") as sub_reporter: all_steps_are_invalid = cls.train_one_epoch( model=dp_model, optimizers=optimizers, schedulers=schedulers, iterator=train_iter_factory.build_iter(iepoch), reporter=sub_reporter, scaler=scaler, summary_writer=train_summary_writer, options=trainer_options, distributed_option=distributed_option, ) with reporter.observe("valid") as sub_reporter: cls.validate_one_epoch( model=dp_model, iterator=valid_iter_factory.build_iter(iepoch), reporter=sub_reporter, options=trainer_options, distributed_option=distributed_option, ) if not distributed_option.distributed or distributed_option.dist_rank == 0: # att_plot doesn't support distributed if plot_attention_iter_factory is not None: with reporter.observe("att_plot") as sub_reporter: cls.plot_attention( model=model, output_dir=output_dir / "att_ws", summary_writer=train_summary_writer, iterator=plot_attention_iter_factory.build_iter(iepoch), reporter=sub_reporter, options=trainer_options, ) # 2. LR Scheduler step for scheduler in schedulers: if isinstance(scheduler, AbsValEpochStepScheduler): scheduler.step( reporter.get_value(*trainer_options.val_scheduler_criterion) ) elif isinstance(scheduler, AbsEpochStepScheduler): scheduler.step() if trainer_options.sharded_ddp: for optimizer in optimizers: if isinstance(optimizer, fairscale.optim.oss.OSS): optimizer.consolidate_state_dict() if not distributed_option.distributed or distributed_option.dist_rank == 0: # 3. Report the results logging.info(reporter.log_message()) if trainer_options.use_matplotlib: reporter.matplotlib_plot(output_dir / "images") if train_summary_writer is not None: reporter.tensorboard_add_scalar(train_summary_writer, key1="train") reporter.tensorboard_add_scalar(valid_summary_writer, key1="valid") if trainer_options.use_wandb: reporter.wandb_log() # 4. Save/Update the checkpoint torch.save( { "model": model.state_dict(), "reporter": reporter.state_dict(), "optimizers": [o.state_dict() for o in optimizers], "schedulers": [ s.state_dict() if s is not None else None for s in schedulers ], "scaler": scaler.state_dict() if scaler is not None else None, }, output_dir / "checkpoint.pth", ) # 5. Save and log the model and update the link to the best model torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth") # Creates a sym link latest.pth -> {iepoch}epoch.pth p = output_dir / "latest.pth" if p.is_symlink() or p.exists(): p.unlink() p.symlink_to(f"{iepoch}epoch.pth") _improved = [] for _phase, k, _mode in trainer_options.best_model_criterion: # e.g. 
_phase, k, _mode = "train", "loss", "min" if reporter.has(_phase, k): best_epoch = reporter.get_best_epoch(_phase, k, _mode) # Creates sym links if it's the best result if best_epoch == iepoch: p = output_dir / f"{_phase}.{k}.best.pth" if p.is_symlink() or p.exists(): p.unlink() p.symlink_to(f"{iepoch}epoch.pth") _improved.append(f"{_phase}.{k}") if len(_improved) == 0: logging.info("There are no improvements in this epoch") else: logging.info( "The best model has been updated: " + ", ".join(_improved) ) log_model = ( trainer_options.wandb_model_log_interval > 0 and iepoch % trainer_options.wandb_model_log_interval == 0 ) if log_model and trainer_options.use_wandb: import wandb logging.info("Logging the model at this epoch") artifact = wandb.Artifact( name=f"model_{wandb.run.id}", type="model", metadata={"improved": _improved}, ) artifact.add_file(str(output_dir / f"{iepoch}epoch.pth")) aliases = [ f"epoch-{iepoch}", "best" if best_epoch == iepoch else "", ] wandb.log_artifact(artifact, aliases=aliases) # 6. Remove the model files excluding n-best epoch and latest epoch _removed = [] # Get the union set of the n-best among multiple criteria nbests = set().union( *[ set(reporter.sort_epochs(ph, k, m)[: max(keep_nbest_models)]) for ph, k, m in trainer_options.best_model_criterion if reporter.has(ph, k) ] ) # Generate the n-best averaged model if ( trainer_options.nbest_averaging_interval > 0 and iepoch % trainer_options.nbest_averaging_interval == 0 ): average_nbest_models( reporter=reporter, output_dir=output_dir, best_model_criterion=trainer_options.best_model_criterion, nbest=keep_nbest_models, ) for e in range(1, iepoch): p = output_dir / f"{e}epoch.pth" if p.exists() and e not in nbests: p.unlink() _removed.append(str(p)) if len(_removed) != 0: logging.info("The model files were removed: " + ", ".join(_removed)) # 7. If no update has happened, stop the training if all_steps_are_invalid: logging.warning( f"The gradients at all steps are invalid in this epoch. " f"Something seems wrong. The training was stopped at epoch {iepoch}" ) break # 8. Check early stopping if trainer_options.patience is not None: if reporter.check_early_stopping( trainer_options.patience, *trainer_options.early_stopping_criterion ): break else: logging.info( f"The training finished at {trainer_options.max_epoch} epochs" ) # Generate the n-best averaged model if not distributed_option.distributed or distributed_option.dist_rank == 0: average_nbest_models( reporter=reporter, output_dir=output_dir, best_model_criterion=trainer_options.best_model_criterion, nbest=keep_nbest_models, )
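Step 6 keeps the union of the n-best epochs across all criteria before deleting checkpoints. The set arithmetic in isolation, with toy sorted epoch lists standing in for reporter.sort_epochs():

# Toy stand-in for the reporter: epoch lists already sorted best-first
# per criterion.
sorted_by_criterion = {
    ("valid", "acc", "max"): [7, 5, 6, 4, 3, 2, 1],
    ("valid", "loss", "min"): [6, 7, 5, 4, 3, 2, 1],
}
keep_nbest_models = [2]

nbests = set().union(
    *[set(epochs[: max(keep_nbest_models)]) for epochs in sorted_by_criterion.values()]
)
print(sorted(nbests))  # [5, 6, 7] -- a union, so up to n epochs per criterion survive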
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: Optional[str] = "conv2d",
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 1,
    padding_idx: int = -1,
):
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(input_size, output_size, dropout_rate)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError(
            "Support only linear, conv1d, or conv1d-linear."
        )
    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            output_size,
            MultiHeadedAttention(
                attention_heads, output_size, attention_dropout_rate
            ),
            positionwise_layer(*positionwise_layer_args),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)
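# A hedged usage sketch for the constructor above, assuming it belongs to
# ESPnet's TransformerEncoder and that forward() accepts padded features
# `xs_pad` of shape (batch, time, input_size) together with per-utterance
# lengths `ilens`; both the class name and the forward signature are
# assumptions based on context, not shown in this file.
import torch

encoder = TransformerEncoder(
    input_size=80,         # e.g. 80-dimensional log-mel features
    output_size=256,
    attention_heads=4,
    num_blocks=6,
    input_layer="conv2d",  # Conv2dSubsampling reduces time roughly 4x
)
xs_pad = torch.randn(2, 100, 80)
ilens = torch.tensor([100, 73])
xs_out, olens, _ = encoder(xs_pad, ilens)
# xs_out keeps about a quarter of the input frames after subsampling;
# olens holds the correspondingly reduced lengths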
def __init__(self,
             name: str,
             encoders: List[Attendable],
             vocabulary: Vocabulary,
             data_id: str,
             # TODO infer the default for these three from the encoder
             ff_hidden_size: int,
             n_heads_self: int,
             n_heads_enc: Union[List[int], int],
             depth: int,
             max_output_len: int,
             attention_combination_strategy: str = "serial",
             n_heads_hier: int = None,
             dropout_keep_prob: float = 1.0,
             embedding_size: int = None,
             embeddings_source: EmbeddedSequence = None,
             tie_embeddings: bool = True,
             label_smoothing: float = None,
             self_attention_dropout_keep_prob: float = 1.0,
             attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
             use_att_transform_bias: bool = False,
             supress_unk: bool = False,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Create a decoder of the Transformer model.

    Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

    Arguments:
        encoders: Input encoders for the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.
        embedding_size: Size of embedding vectors for target words.
        embeddings_source: Embedded sequence to take embeddings from.
        tie_embeddings: Use decoder.embedding_matrix also in place of the
            output decoding matrix.
        ff_hidden_size: Size of the feedforward sublayers.
        n_heads_self: Number of the self-attention heads.
        n_heads_enc: Number of the attention heads over each encoder.
            Either a list whose length must match ``encoders``, or a
            single integer. In the latter case, the number of heads is
            equal for all encoders.
        attention_combination_strategy: One of ``serial``, ``parallel``,
            ``flat``, ``hierarchical``. Controls the attention combination
            strategy for enc-dec attention.
        n_heads_hier: Number of the attention heads for the second
            attention in the ``hierarchical`` attention combination.
        depth: Number of sublayers.
        label_smoothing: A label smoothing parameter for cross entropy
            loss computation.
        self_attention_dropout_keep_prob: Probability of keeping a value
            during dropout on the self-attention output.
        attention_dropout_keep_prob: Probability of keeping a value
            during dropout on the attention output.
        use_att_transform_bias: If true, use bias in the attention layer
            projections.
        supress_unk: If true, the decoder will not produce symbols for
            unknown tokens.
        reuse: Reuse the variables from the given model part.
    """
    check_argument_types()
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        embedding_size=embedding_size,
        embeddings_source=embeddings_source,
        tie_embeddings=tie_embeddings,
        label_smoothing=label_smoothing,
        supress_unk=supress_unk,
        reuse=reuse,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint)

    self.encoders = encoders
    self.ff_hidden_size = ff_hidden_size
    self.n_heads_self = n_heads_self

    if isinstance(n_heads_enc, int):
        if attention_combination_strategy == "flat":
            self.n_heads_enc = [n_heads_enc]
        else:
            self.n_heads_enc = [n_heads_enc for _ in self.encoders]
    else:
        self.n_heads_enc = n_heads_enc

    self.depth = depth

    if isinstance(attention_dropout_keep_prob, float):
        self.attention_dropout_keep_prob = [
            attention_dropout_keep_prob for _ in encoders]
    else:
        self.attention_dropout_keep_prob = attention_dropout_keep_prob

    self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob
    self.use_att_transform_bias = use_att_transform_bias
    self.attention_combination_strategy = attention_combination_strategy
    self.n_heads_hier = n_heads_hier

    self.encoder_states = lambda: [get_attention_states(e)
                                   for e in self.encoders]
    self.encoder_masks = lambda: [get_attention_mask(e)
                                  for e in self.encoders]

    if self.attention_combination_strategy not in STRATEGIES:
        raise ValueError(
            "Unknown attention combination strategy '{}'. "
            "Allowed: {}.".format(self.attention_combination_strategy,
                                  ", ".join(STRATEGIES)))

    if (self.attention_combination_strategy == "hierarchical"
            and self.n_heads_hier is None):
        raise ValueError(
            "You must provide n_heads_hier when using the hierarchical "
            "attention combination strategy.")

    if (self.attention_combination_strategy != "hierarchical"
            and self.n_heads_hier is not None):
        warn("Ignoring the n_heads_hier parameter; it is only used with "
             "the hierarchical attention combination strategy.")

    if (self.attention_combination_strategy == "flat"
            and len(self.n_heads_enc) != 1):
        raise ValueError(
            "For the flat attention combination strategy, only a single "
            "value is permitted in n_heads_enc.")

    self._variable_scope.set_initializer(tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform"))
def add_resource_factory(self, factory_callback: factory_callback_type,
                         types: Union[type, Sequence[Type]],
                         name: str = 'default',
                         context_attr: str = None) -> None:
    """
    Add a resource factory to this context.

    This will cause a ``resource_added`` event to be dispatched.

    A resource factory is a callable that generates a "contextual" resource
    when it is requested by either using any of the methods
    :meth:`get_resource`, :meth:`require_resource` or
    :meth:`request_resource` or its context attribute is accessed.

    When a new resource is created in this manner, it is always bound to the
    context through which it was requested, regardless of where in the chain
    the factory itself was added.

    :param factory_callback: a (non-coroutine) callable that takes a context
        instance as argument and returns the created resource object
    :param types: one or more types to register the generated resource as
        on the target context
    :param name: name of the resource that will be created in the target
        context
    :param context_attr: name of the context attribute the created resource
        will be accessible as
    :raises asphalt.core.context.ResourceConflict: if there is an existing
        resource factory for the given type/name combinations or the given
        context attribute

    """
    assert check_argument_types()
    self._check_closed()
    if not resource_name_re.fullmatch(name):
        raise ValueError(
            '"name" must be a nonempty string consisting only of '
            'alphanumeric characters and underscores')
    if iscoroutinefunction(factory_callback):
        raise TypeError('"factory_callback" must not be a coroutine function')
    if not types:
        raise ValueError('"types" must not be empty')

    if isinstance(types, type):
        resource_types = (types,)  # type: Tuple[type, ...]
    else:
        resource_types = tuple(types)

    # Check for a conflicting context attribute
    if context_attr in self._resource_factories_by_context_attr:
        raise ResourceConflict(
            'this context already contains a resource factory for the '
            'context attribute {!r}'.format(context_attr))

    # Check for conflicts with existing resource factories
    for type_ in resource_types:
        if (type_, name) in self._resource_factories:
            raise ResourceConflict(
                'this context already contains a resource factory for the '
                'type {}'.format(qualified_name(type_)))

    # Add the resource factory to the appropriate lookup tables
    resource = ResourceContainer(factory_callback, resource_types, name,
                                 context_attr, True)
    for type_ in resource_types:
        self._resource_factories[(type_, name)] = resource
    if context_attr:
        self._resource_factories_by_context_attr[context_attr] = resource

    # Notify listeners that a new resource factory has been made available
    self.resource_added.dispatch(resource_types, name, True)
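# A hedged usage sketch for add_resource_factory(), based only on the
# docstring above. The import path is an assumption about asphalt's public
# API, and `Connection` is a hypothetical resource type; the factory runs
# lazily, on the first request for the resource.
from asphalt.core import Context


class Connection:
    def __init__(self, context):
        self.context = context


ctx = Context()
ctx.add_resource_factory(
    lambda context: Connection(context),  # non-coroutine, takes the context
    types=Connection,
    context_attr="connection",
)
conn = ctx.require_resource(Connection)  # the factory is invoked here
assert conn is ctx.connection            # same contextual resource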
def build_model(cls, args: argparse.Namespace) -> ESPnetASRModel:
    assert check_argument_types()
    if isinstance(args.token_list, str):
        with open(args.token_list, encoding="utf-8") as f:
            token_list = [line.rstrip() for line in f]

        # Overwrite token_list to keep it portable
        args.token_list = list(token_list)
    elif isinstance(args.token_list, (tuple, list)):
        token_list = list(args.token_list)
    else:
        raise RuntimeError("token_list must be str or list")
    vocab_size = len(token_list)
    logging.info(f"Vocabulary size: {vocab_size}")

    # 1. frontend
    if args.input_size is None:
        # Extract features in the model
        frontend_class = frontend_choices.get_class(args.frontend)
        frontend = frontend_class(**args.frontend_conf)
        input_size = frontend.output_size()
    else:
        # Take features directly from the data loader
        args.frontend = None
        args.frontend_conf = {}
        frontend = None
        input_size = args.input_size

    # 2. Data augmentation for spectrogram
    if args.specaug is not None:
        specaug_class = specaug_choices.get_class(args.specaug)
        specaug = specaug_class(**args.specaug_conf)
    else:
        specaug = None

    # 3. Normalization layer
    if args.normalize is not None:
        normalize_class = normalize_choices.get_class(args.normalize)
        normalize = normalize_class(**args.normalize_conf)
    else:
        normalize = None

    # 4. Pre-encoder input block
    # NOTE(kan-bayashi): Use getattr to keep the compatibility
    if getattr(args, "preencoder", None) is not None:
        preencoder_class = preencoder_choices.get_class(args.preencoder)
        preencoder = preencoder_class(**args.preencoder_conf)
        input_size = preencoder.output_size()
    else:
        preencoder = None

    # 5. Encoder
    encoder_class = encoder_choices.get_class(args.encoder)
    encoder = encoder_class(input_size=input_size, **args.encoder_conf)

    # 6. Decoder
    decoder_class = decoder_choices.get_class(args.decoder)
    decoder = decoder_class(
        vocab_size=vocab_size,
        encoder_output_size=encoder.output_size(),
        **args.decoder_conf,
    )

    # 7. CTC
    ctc = CTC(
        odim=vocab_size,
        encoder_output_size=encoder.output_size(),
        **args.ctc_conf,
    )

    # 8. RNN-T decoder (not implemented)
    rnnt_decoder = None

    # 9. Build model
    model = ESPnetASRModel(
        vocab_size=vocab_size,
        frontend=frontend,
        specaug=specaug,
        normalize=normalize,
        preencoder=preencoder,
        encoder=encoder,
        decoder=decoder,
        ctc=ctc,
        rnnt_decoder=rnnt_decoder,
        token_list=token_list,
        **args.model_conf,
    )

    # FIXME(kamo): Should this be done in the model?
    # 10. Initialize
    if args.init is not None:
        initialize(model, args.init)

    assert check_return_type(model)
    return model
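# A hedged sketch of the kind of namespace build_model() consumes, with
# hypothetical values. The choice names ("default", "transformer") must
# exist in the surrounding ClassChoices registries (frontend_choices,
# encoder_choices, ...), which is an assumption here, as is invoking the
# classmethod through an ESPnet task class such as ASRTask.
import argparse

args = argparse.Namespace(
    token_list=["<blank>", "<unk>", "a", "b", "<sos/eos>"],
    input_size=None,  # None: extract features in the model via the frontend
    frontend="default", frontend_conf={},
    specaug=None, specaug_conf={},
    normalize=None, normalize_conf={},
    preencoder=None, preencoder_conf={},
    encoder="transformer", encoder_conf={},
    decoder="transformer", decoder_conf={},
    ctc_conf={}, model_conf={},
    init=None,
)
# model = ASRTask.build_model(args)  # hypothetical call site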