def custom_on_train_batch_end(self, step, logs=None):
    if not hasattr(self.model.optimizer, 'lr'):
        raise ValueError('Optimizer must have a "lr" attribute.')
    lr = self._lr_schedule(step)
    if not (compat.is_tf_tensor(lr)
            or isinstance(lr, (float, numpy.float32, numpy.float64))):
        raise ValueError('The output of the "schedule" function '
                         'should be a float.')
    if compat.is_tf_tensor(lr) and not lr.dtype.is_floating:
        raise ValueError('The dtype of the Tensor should be float.')
    K.set_value(self.model.optimizer.lr, K.get_value(lr))
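# A hedged sketch of a "schedule" function that satisfies the checks in
# `custom_on_train_batch_end` above: it takes the global step and returns a
# plain Python float. The inverse-square-root decay with warmup shown here is
# a common Transformer schedule; the name and constants are illustrative and
# not taken from the original code.
def example_noam_schedule(step, dmodel=512, warmup_steps=4000):
    step = float(max(step, 1))
    return (dmodel ** -0.5) * min(step ** -0.5, step * (warmup_steps ** -1.5))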
def __call__(self, model_inp, model_out):
    """ Calculates the label-smoothed negative log-likelihood.

    Args:
        model_inp: A dict containing the model inputs.
        model_out: The logits tensor or a dict containing the logits tensor.
            The logits tensor has shape [batch, max_len, vocab_size].

    Returns:
        A tuple (nll_sum, num_of_samples, num_of_tokens) with shapes:
            nll_sum: [batch_size, ]
            num_of_samples: [1, ]
            num_of_tokens: [batch_size, ]
    """
    logits = model_out
    if isinstance(model_out, dict):
        logits = model_out["logits"]
    elif not is_tf_tensor(model_out):
        raise ValueError("Unsupported type of model_out: {}".format(
            type(model_out)))
    logits = tf.cast(logits, tf.float32)
    labels = model_inp["trg"]
    with tf.name_scope("loss"):
        vocab_size = logits.get_shape()[-1]
        confidence = 1.0 - self._label_smoothing
        low_confidence = self._label_smoothing / tf.cast(
            vocab_size - 1, tf.float32)
        soft_target = tf.one_hot(tf.cast(labels, tf.int32),
                                 depth=vocab_size,
                                 on_value=confidence,
                                 off_value=low_confidence)
        # This may produce NaN when it meets a bad sample.
        xentropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=soft_target)
        # xentropy = - tf.reduce_sum(soft_target * tf.nn.log_softmax(logits), axis=-1)
        # Calculate the best (lowest) possible value of cross entropy, and
        # subtract it from the cross entropy loss.
        if self._label_smoothing:
            normalizing_constant = -(
                confidence * tf.math.log(confidence)
                + tf.cast(vocab_size - 1, tf.float32) * low_confidence
                * tf.math.log(low_confidence + 1e-20))
            xentropy -= normalizing_constant
        # else:  # TODO(ZhaoChengqi) https://github.com/tensorflow/tensorflow/issues/32578
        #     xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        #         logits=logits, labels=labels)
        if "trg_padding" in model_inp:
            weights = tf.cast(1 - model_inp["trg_padding"], tf.float32)
        else:
            weights = input_length_to_nonpadding(model_inp["trg_length"],
                                                 tf.shape(labels)[1],
                                                 tf.float32)
        nll_sum = tf.reduce_sum(xentropy * weights, axis=1)
        n_samples = tf.cast(tf.expand_dims(tf.shape(labels)[0], axis=0),
                            dtype=tf.float32)
        n_tokens = tf.reduce_sum(weights, axis=1)
    return nll_sum, n_samples, n_tokens
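# A standalone, hedged sketch of the label-smoothed cross entropy computed in
# `__call__` above (toy shapes and values; the enclosing class and its inputs
# are not reproduced here). It shows the soft-target construction and the
# subtraction of the minimal achievable cross entropy, so that a perfect
# prediction scores close to zero.
def example_label_smoothed_xentropy():
    import tensorflow as tf
    logits = tf.random.normal([2, 5, 8])     # [batch, max_len, vocab_size]
    labels = tf.constant([[1, 2, 3, 0, 0],
                          [4, 5, 0, 0, 0]])  # [batch, max_len]
    label_smoothing = 0.1
    vocab_size = logits.shape[-1]
    confidence = 1.0 - label_smoothing
    low_confidence = label_smoothing / float(vocab_size - 1)
    soft_target = tf.one_hot(labels, depth=vocab_size,
                             on_value=confidence, off_value=low_confidence)
    xentropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=soft_target)
    # The lowest cross entropy attainable under the smoothed target.
    normalizing_constant = -(
        confidence * tf.math.log(confidence)
        + (vocab_size - 1) * low_confidence
        * tf.math.log(low_confidence + 1e-20))
    return xentropy - normalizing_constant   # [batch, max_len]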
def _process_and_truncate_text(text):
    if data_status["transcript"] == compat.DataStatus.RAW:
        if compat.is_tf_tensor(text):
            text = text.numpy()
        text = self._trg_data_pipeline.process(text, is_processed=False)
    else:
        assert data_status["transcript"] == compat.DataStatus.PROJECTED
    if mode == compat.ModeKeys.TRAIN and trunc_trg and max_trg_len:
        if compat.is_tf_tensor(text):
            text = tf.cond(
                tf.less_equal(tf.size(text), max_trg_len),
                lambda: text,
                lambda: tf.concat([text[:(max_trg_len - 1)], text[-1:]],
                                  axis=0))
        else:
            if len(text) > max_trg_len:
                text = text[:(max_trg_len - 1)] + text[-1:]
    return text
def dynamic_tensorshape_except_last_dim(tensor):
    """ Returns a tf.TensorShape where only the last dim keeps its static shape. """
    shape_list = static_shape_list(tensor)
    # Mark all leading dims as dynamic; only the last dim may stay static.
    for i in range(len(shape_list) - 1):
        shape_list[i] = None
    # If even the last dim is dynamic (a Tensor), mark it as unknown too.
    if compat.is_tf_tensor(shape_list[-1]):
        shape_list[-1] = None
    return tf.TensorShape(shape_list)
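# A hedged usage sketch for `dynamic_tensorshape_except_last_dim` (toy tensor,
# assuming TensorFlow 2.x): every leading dimension becomes dynamic while the
# last one stays static, which is handy e.g. for `tf.TensorArray` element
# shapes whose time/batch dims vary.
def example_dynamic_tensorshape():
    import tensorflow as tf
    x = tf.zeros([4, 7, 16])
    shape = dynamic_tensorshape_except_last_dim(x)
    # shape == TensorShape([None, None, 16])
    return shape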
def _process_and_truncate(text, dp, trunc, max_len):
    if data_status != compat.DataStatus.PROJECTED:
        text = dp.process(text, is_processed=(
            data_status == compat.DataStatus.PROCESSED))
    if mode == compat.ModeKeys.TRAIN and trunc and max_len:
        if compat.is_tf_tensor(text):
            text = tf.cond(
                tf.less_equal(tf.size(text), max_len),
                lambda: text,
                lambda: tf.concat([text[:(max_len - 1)], text[-1:]], axis=0))
        elif len(text) > max_len:
            text = text[:(max_len - 1)] + text[-1:]
    return text
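# A hedged illustration of the truncation rule above (toy token ids, not from
# the original code): when a sequence exceeds `max_len`, the first
# `max_len - 1` tokens are kept and the final token (typically EOS) is
# preserved, so the truncated sequence still terminates properly.
def example_truncate():
    text = [7, 8, 9, 10, 11, 2]  # assume 2 is the EOS id
    max_len = 4
    if len(text) > max_len:
        text = text[:(max_len - 1)] + text[-1:]
    # text == [7, 8, 9, 2]
    return text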
def data_proc(data, with_label):
    feature = _process_audio(data["audio"])
    ret = {
        "audio": feature,
        "audio_length": tf.cast(
            (tf.shape(feature)[0] if compat.is_tf_tensor(feature)
             else feature.shape[0])
            // self._audio_feature_dim // self._audio_feature_channels,
            dtype=tf.int64)
    }
    if with_label:
        ret["transcript"] = tf.convert_to_tensor(
            _process_and_truncate_text(data["transcript"]), tf.int64)
    return ret
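# A hedged arithmetic note on the `audio_length` computed above: the incoming
# feature is flattened, so a vector of 8000 float values with
# `_audio_feature_dim == 80` and `_audio_feature_channels == 1` corresponds to
# 8000 // 80 // 1 = 100 frames (toy numbers for illustration).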
def __call__(self, spectrogram, true_length=None):
    """ Applies SpecAugment.

    Args:
        spectrogram: A numpy.ndarray of shape [nframes, nfeatures], or a
            tf.Tensor of shape [nframes, nfeatures] or
            [batch, nframes, nfeatures].
        true_length: A tf.Tensor of shape [batch, ] if `spectrogram` is a
            tensor of shape [batch, nframes, nfeatures], else None.

    Returns:
        A tuple (augmented spectrogram, new true_length) if ndims of
        `spectrogram` is 3, else the augmented spectrogram.
    """
    if is_tf_tensor(spectrogram):
        return self._call_tf(spectrogram)
    return self._call_numpy(spectrogram)
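# A hedged usage sketch for the SpecAugment-style `__call__` above;
# `augmenter` stands for an already-constructed instance of the enclosing
# class (its constructor arguments are not shown in this excerpt):
#
#     import numpy as np
#     spec = np.random.rand(100, 80).astype("float32")  # [nframes, nfeatures]
#     augmented = augmenter(spec)  # numpy in, numpy out; a tf.Tensor also works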
def stack_beam_size(x, beam_size):
    """ Tiles a given tensor `beam_size` times along the batch axis.

    Args:
        x: A tensor with shape [batch_size, ...].
        beam_size: An int scalar.

    Returns:
        The tiled tensor with shape [batch_size * beam_size, ...].

    Raises:
        AssertionError: if `x` is not a tf.Tensor.
    """
    assert compat.is_tf_tensor(x)
    original_shape = tf.shape(x)
    x = tf.expand_dims(x, axis=1)
    tile_dims = [1] * x.shape.ndims
    tile_dims[1] = beam_size
    tiled_x = tf.tile(x, tile_dims)
    tiled_shape = tf.concat([[-1], original_shape[1:]], axis=0)
    return tf.reshape(tiled_x, tiled_shape)
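# A hedged usage sketch for `stack_beam_size` (toy values, assuming
# TensorFlow 2.x). Each batch entry is repeated `beam_size` times along the
# leading axis, the usual preparation of encoder states for beam search:
def example_stack_beam_size():
    import tensorflow as tf
    x = tf.constant([[1, 2], [3, 4]])    # shape [2, 2]
    y = stack_beam_size(x, beam_size=3)  # shape [6, 2]
    # y == [[1, 2], [1, 2], [1, 2],
    #       [3, 4], [3, 4], [3, 4]]
    return y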
def run(self):
    """ Trains a neural model.

    Step 1: Create the training model.
    Step 2: Restore the checkpoint/pretrained model/global_step if they exist.
    Step 3: Fetch the training data.
    Step 4: Build the training callbacks.
    Step 5: TRAIN!!!
    """
    if self._hvd_backend == "horovod":
        import horovod.tensorflow.keras as hvd
    elif self._hvd_backend == "byteps":
        import byteps.tensorflow.keras as hvd

    tfds = training_utils.build_datasets(compat.ModeKeys.TRAIN, self.strategy,
                                         self.custom_dataset, self.task)
    if isinstance(self.custom_dataset, MultipleDataset):
        _tfds = None
        for _, ds in tfds.items():
            if _tfds is None:
                _tfds = ds
            else:
                _tfds = _tfds.concatenate(ds)
        tfds = _tfds
    tfds = tfds.prefetch(tf.data.experimental.AUTOTUNE)
    # Step 1: create a model
    with training_utils.get_strategy_scope(self.strategy):
        inps = self.task.create_inputs(compat.ModeKeys.TRAIN)
        formatted_inps = self.task.example_to_input(inps,
                                                    compat.ModeKeys.TRAIN)
        model_out = self.model(formatted_inps, is_training=True)
        for metric_layer in self.task.build_metric_layer():
            model_out = metric_layer([formatted_inps, model_out])
        if (LooseVersion(tf.__version__) < LooseVersion("2.3")
                or LooseVersion(tf.__version__) >= LooseVersion("2.5")):
            logging.info(
                f"Warning: Need further check on AccumgradKerasModel when TF version={tf.__version__}. "
                f"Here we ignore update_cycle={self._update_cycle}, "
                f"clip_value={self._clip_value}, clip_norm={self._clip_norm}.")
            keras_model = tf.keras.Model(inps, model_out)
        elif compat.IS_PREV_TF_2_4_0:
            from neurst.training.gradaccum_keras_model import TF23GradAccumKerasModel
            keras_model = TF23GradAccumKerasModel(
                inps, model_out,
                update_cycle=self._update_cycle,
                clip_value=self._clip_value,
                clip_norm=self._clip_norm,
                freeze_variables=self._freeze_variables)
        else:
            keras_model = GradAccumKerasModel(
                inps, model_out,
                update_cycle=self._update_cycle,
                clip_value=self._clip_value,
                clip_norm=self._clip_norm,
                freeze_variables=self._freeze_variables)

        loss = self._criterion.reduce_loss(formatted_inps, model_out)
        if compat.is_tf_tensor(loss) or isinstance(loss, (list, tuple)):
            keras_model.add_loss(loss)
        elif isinstance(loss, dict):
            for _name, _loss in loss.items():
                keras_model.add_loss(_loss)
                keras_model.add_metric(_loss, name=_name + "_mean",
                                       aggregation="mean")
        else:
            raise ValueError("criterion.reduce_loss returns an unsupported "
                             "value of type: {}".format(type(loss)))
        self._restore_ckpt_or_pretrain()
        self._lr_schedule = build_lr_schedule(self._lr_schedule_args)
        if self._pruning_schedule is not None:
            self._optimizer = create_pruning_optimizer(
                self._optimizer, self.model, self._pruning_schedule,
                pruning_variable_pattern=self._pruning_variable_pattern,
                nopruning_variable_pattern=self._nopruning_variable_pattern,
                keep_prune_property=True)
        self._optimizer = training_utils.handle_fp16_and_distributed_optimizer(
            self._optimizer, self._lr_schedule, self._hvd_backend)
        if self._hvd_backend is None:
            keras_model.compile(self._optimizer)
        else:
            # NOTE: we already add the Horovod DistributedOptimizer in
            # `_handle_fp16_and_distributed_optimizer`.
            # Horovod: Specify `experimental_run_tf_function=False` to ensure
            # TensorFlow uses hvd.DistributedOptimizer() to compute gradients.
            keras_model.compile(self._optimizer,
                                experimental_run_tf_function=False)
    keras_model.summary()
    summary_model_variables(self.model, self._freeze_variables)
    # Initialize the checkpoint manager.
    _ = compat.get_saver_or_default(
        self.model, self.model_dir,
        max_to_keep=self._checkpoints_max_to_keep)
    # Build the training callbacks.
    if not self._tb_log_dir:
        self._tb_log_dir = os.path.join(self.model_dir, "train")
    training_callbacks = [
        MetricReductionCallback(self.strategy, self._summary_steps,
                                self._tb_log_dir, device="GPU:0",
                                lr_schedule=self._lr_schedule)
    ]
    if self._hvd_backend is None or hvd.rank() == 0:
        training_callbacks.append(
            CustomCheckpointCallback(
                self.task.model_configs(self.model),
                save_checkpoint_steps=self._save_checkpoint_steps))
        if self._validator is not None:
            training_callbacks.append(
                self._validator.build(self.strategy, self.task, self.model))
    if self._hvd_backend is not None:
        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        # NOTE!!! Here we already integrate the metric-averaging behaviour into
        # the MetricReductionCallback.
        # training_callbacks.insert(0, hvd.callbacks.MetricAverageCallback(device="GPU:0"))

        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or restored
        # from a checkpoint.
        training_callbacks.insert(
            0, hvd.callbacks.BroadcastGlobalVariablesCallback(0,
                                                              device="GPU:0"))
        if self._lr_schedule is not None:
            training_callbacks.append(
                LearningRateScheduler(self._lr_schedule))

    if self._experimental_count_batch_num:
        logging.info("Scanning the dataset......")
        iterator = iter(
            training_utils.maybe_distribution_dataset(self.strategy, tfds))
        cnt = 0
        for _ in iterator:
            cnt += 1
        logging.info(f"Total {cnt} batches per EPOCH.")

    history = keras_model.fit(
        map_data_for_keras(tfds.repeat()),
        initial_epoch=0,
        epochs=1,
        steps_per_epoch=self._train_steps,  # * args["update_cycle"],
        verbose=2,
        callbacks=training_callbacks)
    logging.info(history.history)
def _convert_to_tensor(gradient):
    if is_tf_tensor(gradient):
        return gradient
    return tf.convert_to_tensor(gradient)