def _train_step(
    self,
    inputs,
    targets,
    batch: int,
    batch_size: int,
    optimizer: tf.keras.optimizers,
    loss_fn: tf.keras.losses,
):
    """Run one optimisation step on a training batch.

    The forward pass, the prediction decoding and the loss are evaluated
    under a gradient tape. The gradients of the loss are then applied to the
    model's trainable variables, the loss is fed to the ``"train"`` entry of
    ``self.recorded_losses`` and, when ``base.Metrics.ABSOLUTE_ACC`` is among
    ``self.metrics``, ``self._record_abs_acc`` is called for this batch.

    Parameters
    ----------
    inputs
        Batch passed to ``self.model`` with ``training=True``.
    targets
        Ground truth handed to ``loss_fn`` together with the model outputs.
    batch
        Index of the current batch; used in the log message and forwarded
        to ``self._record_abs_acc``.
    batch_size
        Forwarded to ``self._record_abs_acc``.
    optimizer
        Object whose ``apply_gradients`` performs the parameter update.
    loss_fn
        Callable invoked as ``loss_fn(targets, outputs)``.

    Returns
    -------
    Whatever ``self.model.predictions`` returns for this batch.
    """
    model = self.model

    with tf.GradientTape() as tape:
        outputs = model(inputs, training=True)
        # Decode the training prediction tokens
        predictions = model.predictions(
            outputs, self.train_dataloader.encoder_target
        )
        loss = loss_fn(targets, outputs)

    # Back-propagate the loss and update the parameters
    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    # Track the running training loss
    train_loss_metric = self.recorded_losses["train"]
    train_loss_metric(loss)

    if base.Metrics.ABSOLUTE_ACC in self.metrics:
        self._record_abs_acc(outputs, targets, batch, batch_size, "train")

    logger.info(f"Batch #{batch} : training loss {train_loss_metric.result()}")
    return predictions
def train_step(
    model: tf.keras.Model,
    x_batch: tf.Tensor,
    y_batch: tf.Tensor,
    optimizer: tf.keras.optimizers,
) -> tf.keras.metrics.Mean:
    """Perform a single gradient update of ``model`` on one batch.

    The loss is computed by the module-level ``loss_fn(model, x, y)`` under a
    gradient tape, its gradients are applied through ``optimizer`` and the
    loss is then passed to the module-level ``train_loss`` callable.

    Parameters
    ----------
    model
        Model whose ``trainable_variables`` are updated.
    x_batch
        Batch of inputs forwarded to ``loss_fn``.
    y_batch
        Batch of targets forwarded to ``loss_fn``.
    optimizer
        Object whose ``apply_gradients`` performs the update.

    Returns
    -------
    The result of calling ``train_loss`` with the batch loss.
    """
    with tf.GradientTape() as tape:
        batch_loss = loss_fn(model, x_batch, y_batch)

    weights = model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return train_loss(batch_loss)
def trainer(
    j_hat: JHat,
    datasets: Tuple[tf.keras.utils.Sequence, tf.keras.utils.Sequence],
    optimizer: tf.keras.optimizers = tf.keras.optimizers.Adam,
    learning_rate: float = 1e-3,
    preprocess_fn: Callable = None,
    epochs: int = 20,
    reg_loss_fn: Callable = (lambda kernel: 0),
    verbose: int = 1,
) -> None:
    """
    Train the kernel to maximise an estimate of test power using minibatch gradient descent.

    Parameters
    ----------
    j_hat
        Callable returning the estimate for a (reference, current) pair of
        batches; exposes ``kernel`` and ``trainable_weights``.
    datasets
        Pair ``(reference, current)`` of sequences, iterated in lockstep.
    optimizer
        Optimizer class (not instance); it is instantiated with
        ``learning_rate``.
    learning_rate
        Learning rate passed to the optimizer constructor.
    preprocess_fn
        Optional callable applied to both batches before the forward pass.
    epochs
        Number of passes over the paired datasets.
    reg_loss_fn
        Regularisation term computed as ``reg_loss_fn(j_hat.kernel)`` and
        added to the loss.
    verbose
        A truthy value creates a progress bar; it is advanced only when the
        value equals 1.
    """
    reference_data, current_data = datasets
    opt = optimizer(learning_rate)
    steps_per_epoch = min(len(reference_data), len(current_data))
    apply_preprocessing = callable(preprocess_fn)

    loss_ma = 0.
    for _ in range(epochs):
        if verbose:
            pbar = tf.keras.utils.Progbar(steps_per_epoch, 1)

        paired_batches = zip(reference_data, current_data)
        for step, (x_ref, x_cur) in enumerate(paired_batches):
            if apply_preprocessing:
                x_ref = preprocess_fn(x_ref)
                x_cur = preprocess_fn(x_cur)

            with tf.GradientTape() as tape:
                # minimising the negated estimate is gradient ascent on it
                loss = -j_hat(x_ref, x_cur) + reg_loss_fn(j_hat.kernel)

            weights = j_hat.trainable_weights
            opt.apply_gradients(zip(tape.gradient(loss, weights), weights))

            if verbose == 1:
                # incremental mean of the loss over the steps of this epoch
                loss_ma = loss_ma + (loss - loss_ma) / (step + 1)
                pbar.add(1, values=[('loss', loss_ma)])
def explainer_train_step(
    index: tf.Tensor,
    X: tf.Tensor,
    y: tf.Tensor,
    sample_weights: tf.Tensor,
    explainer: tf.keras.Model,
    optimizer: tf.keras.optimizers,
):
    """Apply one weighted-MSE gradient update to ``explainer``.

    Parameters
    ----------
    index
        First positional argument of the explainer call.
    X
        Second positional argument of the explainer call.
    y
        Targets passed to ``explainer_mse_loss``.
    sample_weights
        Weights passed to ``explainer_mse_loss``.
    explainer
        Model whose ``trainable_variables`` are updated.
    optimizer
        Object whose ``apply_gradients`` performs the update.
    """
    with tf.GradientTape() as tape:
        predictions = explainer(index, X)
        loss = explainer_mse_loss(predictions, y, sample_weights)

    weights = explainer.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
def train_iteration(initial_state: tf.Tensor, gamma: float, max_iter_step: tf.int32,
                    actor_model: tf.keras.Model, critic_model: tf.keras.Model,
                    optimizer: tf.keras.optimizers) -> List[tf.Tensor]:
    """Run ``max_iter_step`` actor-critic update steps starting from ``initial_state``.

    Each step samples an action from the softmax of the actor output, advances
    the environment through ``tf_env_step``, updates a running average of the
    reward and applies one gradient update to the actor and one to the critic
    using the same ``optimizer``.

    The block relies on names defined outside of it: ``action_shape``,
    ``tf_env_step`` and ``env``.

    Parameters
    ----------
    initial_state
        State fed to both models on the first step; its ``shape`` is also
        imposed on every state returned by ``tf_env_step``.
    gamma
        Factor multiplying the critic value of the next state in the TD error.
    max_iter_step
        Number of steps; iterated with ``tf.range``.
    actor_model
        Model producing the action scores.
    critic_model
        Model producing the state value.
    optimizer
        Object whose ``apply_gradients`` is called for both models.

    Returns
    -------
    ``[avg_reward, len(env.elite_list)]``.
    NOTE(review): the second element is the result of ``len`` rather than a
    tensor, which does not match the ``List[tf.Tensor]`` annotation - confirm.
    """
    state_curr = initial_state
    state_shape = initial_state.shape
    step_reward = 0
    avg_reward = 0.
    # state_curr = tf.expand_dims(state_curr, 0)
    for curr_step in tf.range(max_iter_step):
        # persistent tape: two separate gradients (actor, critic) are taken from it
        with tf.GradientTape(persistent=True) as tape:
            action_net_out = actor_model(state_curr)
            value_curr_out = critic_model(state_curr)
            action_net_out = tf.reshape(action_net_out, action_shape)
            action_prob_softmax = tf.nn.softmax(action_net_out)
            # draw one sample per row from the categorical distribution
            action = tf.random.categorical(tf.math.log(action_prob_softmax), 1)
            action = tf.squeeze(action)
            state_next, step_reward = tf_env_step(action)
            state_next.set_shape(state_shape)
            # add a leading axis before feeding the state back to the models
            state_next = tf.expand_dims(state_next, 0)
            value_next_out = critic_model(state_next)
            # exponential moving average of the reward
            avg_reward = 0.99 * avg_reward + \
                0.01 * tf.cast(step_reward, tf.float32)
            # TD error relative to the running average reward
            TD_err = tf.cast(step_reward, dtype=tf.float32) - avg_reward + \
                gamma * value_next_out - value_curr_out
            # actor loss: -TD error times the summed log-probability of the sampled actions
            loss_actor = -TD_err * tf.math.reduce_sum([
                tf.math.log(action_prob_softmax[i, j])
                for i, j in enumerate(action)
            ])
            loss_critic = tf.square(TD_err)
        grads_actor = tape.gradient(loss_actor, actor_model.trainable_variables)
        grads_critic = tape.gradient(loss_critic, critic_model.trainable_variables)
        optimizer.apply_gradients(
            zip(grads_actor, actor_model.trainable_variables))
        optimizer.apply_gradients(
            zip(grads_critic, critic_model.trainable_variables))
        state_curr = state_next
        # periodic progress report
        if curr_step % 100 == 1:
            print(f'Step: {curr_step}, average rewards = {avg_reward}'
                  f' elite num = {len(env.elite_list)}\n')
    return [avg_reward, len(env.elite_list)]
def exponentialDecay(self, optimizer: tf.keras.optimizers, k: float, epoch: int):
    """Scale the optimizer's learning rate by ``exp(k * epoch)``.

    The current learning rate is mirrored on ``self.learning_rate``. The
    scaling is applied only while that rate is above ``1e-5``; otherwise the
    optimizer is left untouched.

    Parameters
    ----------
    optimizer
        Object exposing a readable and writable ``learning_rate`` attribute.
    k
        Rate constant of the exponential factor.
    epoch
        Epoch index multiplied with ``k`` in the exponent.

    Returns
    -------
    The same ``optimizer`` object that was passed in.
    """
    current_lr = optimizer.learning_rate
    self.learning_rate = current_lr

    if current_lr > 1e-5:
        decay_factor = np.exp([k * epoch])[0]
        optimizer.learning_rate = current_lr * decay_factor
        # re-read so the mirrored value is whatever the optimizer now reports
        self.learning_rate = optimizer.learning_rate

    return optimizer
def trainer(
        model: tf.keras.Model,
        loss_fn: tf.keras.losses,
        X_train: np.ndarray,
        y_train: np.ndarray = None,
        optimizer: tf.keras.optimizers = None,
        loss_fn_kwargs: dict = None,
        preprocess_fn: Callable = None,
        epochs: int = 20,
        batch_size: int = 64,
        buffer_size: int = 1024,
        verbose: bool = True,
        log_metric: Tuple[str, "tf.keras.metrics"] = None,
        callbacks: tf.keras.callbacks = None
) -> None:  # TODO: incorporate callbacks + LR schedulers
    """
    Train TensorFlow model.

    Parameters
    ----------
    model
        Model to train.
    loss_fn
        Loss function used for training.
    X_train
        Training batch.
    y_train
        Training labels.
    optimizer
        Optimizer used for training. When omitted, a new
        ``Adam(learning_rate=1e-3)`` is created for this call, so optimizer
        state is never shared between separate calls.
    loss_fn_kwargs
        Kwargs for loss function.
    preprocess_fn
        Preprocessing function applied to each training batch.
    epochs
        Number of training epochs.
    batch_size
        Batch size used for training.
    buffer_size
        Maximum number of elements that will be buffered when prefetching.
    verbose
        Whether to print training progress.
    log_metric
        Additional metrics whose progress will be displayed if verbose equals True.
    callbacks
        Callbacks used during training.
    """
    # A default instance in the signature would be created once at definition
    # time and reused (with its slot variables) by every call.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss_kwargs = loss_fn_kwargs if loss_fn_kwargs else {}
    supervised = y_train is not None
    apply_preprocessing = callable(preprocess_fn)

    # create dataset; without labels the inputs double as ground truth
    train_data = (X_train, y_train) if supervised else X_train
    train_data = tf.data.Dataset.from_tensor_slices(train_data)
    train_data = train_data.shuffle(buffer_size=buffer_size).batch(batch_size)
    n_minibatch = int(np.ceil(X_train.shape[0] / batch_size))

    # iterate over epochs
    for epoch in range(epochs):
        if verbose:
            pbar = tf.keras.utils.Progbar(n_minibatch, 1)

        # iterate over the batches of the dataset
        for step, train_batch in enumerate(train_data):
            if supervised:
                X_train_batch, y_train_batch = train_batch
            else:
                X_train_batch = train_batch

            if apply_preprocessing:
                X_train_batch = preprocess_fn(X_train_batch)

            with tf.GradientTape() as tape:
                preds = model(X_train_batch)
                ground_truth = y_train_batch if supervised else X_train_batch

                # compute loss; multi-output models pass every output on
                if tf.is_tensor(preds):
                    args = [ground_truth, preds]
                else:
                    args = [ground_truth] + list(preds)
                loss = loss_fn(*args, **loss_kwargs)

                if model.losses:  # additional model losses
                    loss += sum(model.losses)

            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))

            if verbose:
                loss_val = loss.numpy()
                if loss_val.shape and loss_val.shape[0] != batch_size:
                    # Pad a short (last) batch of per-sample losses with its
                    # mean so every value given to the progress bar has the
                    # same shape. Works for any rank, not just 1-D and 2-D.
                    pad_shape = (batch_size - loss_val.shape[0],) + tuple(loss_val.shape[1:])
                    add_mean = np.ones(pad_shape) * loss_val.mean()
                    loss_val = np.r_[loss_val, add_mean]
                pbar_values = [('loss', loss_val)]
                if log_metric is not None:
                    log_metric[1](ground_truth, preds)
                    pbar_values.append(
                        (log_metric[0], log_metric[1].result().numpy()))
                pbar.add(1, values=pbar_values)
def trainer(model: tf.keras.Model,
            loss_fn: tf.keras.losses,
            X_train: np.ndarray,
            y_train: np.ndarray = None,
            optimizer: tf.keras.optimizers = None,
            loss_fn_kwargs: dict = None,
            epochs: int = 100,
            batch_size: int = 1,
            buffer_size: int = 1024,
            shuffle: bool = False,
            verbose: bool = True) -> None:
    """
    Train TensorFlow model.

    Parameters
    ----------
    model
        Model to train.
    loss_fn
        Loss function used for training.
    X_train
        Training batch.
    y_train
        Training labels.
    optimizer
        Optimizer used for training. When omitted, a new
        ``Adam(learning_rate=1e-3)`` is created for this call, so optimizer
        state is never shared between separate calls.
    loss_fn_kwargs
        Kwargs for loss function.
    epochs
        Number of training epochs.
    batch_size
        Batch size used for training.
    buffer_size
        Maximum number of elements that will be buffered when prefetching.
    shuffle
        Whether to shuffle training data.
    verbose
        Whether to print training progress.
    """
    # A default instance in the signature would be created once at definition
    # time and reused (with its slot variables) by every call.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss_kwargs = loss_fn_kwargs if loss_fn_kwargs else {}
    supervised = y_train is not None

    # create dataset; without labels the inputs double as ground truth
    train_data = (X_train, y_train) if supervised else X_train
    train_data = tf.data.Dataset.from_tensor_slices(train_data)
    if shuffle:
        train_data = train_data.shuffle(buffer_size=buffer_size)
    # Batch regardless of `shuffle`: previously batching was chained onto the
    # shuffle call, so unshuffled training fed unbatched single examples to
    # the model and ignored `batch_size`.
    train_data = train_data.batch(batch_size)
    n_minibatch = int(np.ceil(X_train.shape[0] / batch_size))

    # iterate over epochs
    for epoch in range(epochs):
        if verbose:
            pbar = tf.keras.utils.Progbar(n_minibatch, 1)

        # iterate over the batches of the dataset
        for step, train_batch in enumerate(train_data):
            if supervised:
                X_train_batch, y_train_batch = train_batch
            else:
                X_train_batch = train_batch

            with tf.GradientTape() as tape:
                preds = model(X_train_batch)
                ground_truth = y_train_batch if supervised else X_train_batch

                # compute loss; multi-output models pass every output on
                if tf.is_tensor(preds):
                    args = [ground_truth, preds]
                else:
                    args = [ground_truth] + list(preds)
                loss = loss_fn(*args, **loss_kwargs)

                if model.losses:  # additional model losses
                    loss += sum(model.losses)

            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))

            if verbose:
                loss_val = loss.numpy().mean()
                pbar_values = [('loss', loss_val)]
                pbar.add(1, values=pbar_values)
def update_weights(optimizer: tf.keras.optimizers, network: BaseNetwork, batch):
    """Perform one optimisation step of ``network`` on ``batch``.

    A closure computing the total loss is built and handed to
    ``optimizer.minimize`` together with ``network.cb_get_variables()``;
    afterwards ``network.training_steps`` is incremented by one.

    The loss is the sum of an initial-step term (``network.initial_model`` on
    the images) and one term per unrolled step (``network.recurrent_model`` on
    the previous representation concatenated with the one-hot action). It
    relies on ``loss_value`` and ``MSE``, which are defined outside this block.

    Parameters
    ----------
    optimizer
        Object whose ``minimize`` is called with the loss closure.
    network
        Provides ``initial_model``, ``recurrent_model``, ``action_size``,
        ``value_support_size``, ``cb_get_variables`` and ``training_steps``.
    batch
        Six-element sequence unpacked as (images, initial targets, per-step
        targets, per-step actions, per-step masks, per-step dynamic masks).
    """

    def scale_gradient(tensor, scale: float):
        """Trick function to scale the gradient in tensorflow"""
        # forward value equals `tensor`; only the `scale * tensor` term carries gradient
        return (1. - scale) * tf.stop_gradient(tensor) + scale * tensor

    def loss():
        loss = 0
        image_batch, targets_init_batch, targets_time_batch, actions_time_batch, \
            mask_time_batch, dynamic_mask_time_batch = batch

        # make initial step from the real observation: representation + prediction networks
        representation_batch, value_batch, policy_batch = network.initial_model(np.array(image_batch))

        # Only update the element with a policy target
        target_value_batch, _, target_policy_batch = zip(*targets_init_batch)
        mask_policy = list(map(lambda l: bool(l), target_policy_batch))
        target_policy_batch = list(filter(lambda l: bool(l), target_policy_batch))
        policy_batch = tf.boolean_mask(policy_batch, mask_policy)

        # Compute the loss of the first pass
        # NOTE(review): the support size is taken from the prediction here but from
        # network.value_support_size in the recurrent steps - confirm they agree
        value_support_size = len(value_batch[0])
        loss += tf.math.reduce_mean(loss_value(target_value_batch, value_batch, value_support_size))
        loss += tf.math.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=policy_batch, labels=target_policy_batch))

        # Recurrent steps, from action and previous hidden state.
        for actions_batch, targets_batch, mask, dynamic_mask in zip(actions_time_batch, targets_time_batch,
                                                                    mask_time_batch, dynamic_mask_time_batch):
            target_value_batch, target_reward_batch, target_policy_batch = zip(*targets_batch)

            # Only execute BPTT for elements with an action
            representation_batch = tf.boolean_mask(representation_batch, dynamic_mask)
            target_value_batch = tf.boolean_mask(target_value_batch, mask)
            target_reward_batch = tf.boolean_mask(target_reward_batch, mask)

            # Creating conditioned_representation: concatenate representations with actions batch
            actions_batch = tf.one_hot(actions_batch, network.action_size)

            # Recurrent step from conditioned representation: recurrent + prediction networks
            conditioned_representation_batch = tf.concat((representation_batch, actions_batch), axis=1)
            representation_batch, reward_batch, value_batch, policy_batch = network.recurrent_model(
                conditioned_representation_batch)

            # Only execute BPTT for elements with a policy target
            target_policy_batch = [policy for policy, b in zip(target_policy_batch, mask) if b]
            mask_policy = list(map(lambda l: bool(l), target_policy_batch))
            target_policy_batch = tf.convert_to_tensor([policy for policy in target_policy_batch if policy])
            policy_batch = tf.boolean_mask(policy_batch, mask_policy)

            # Compute the partial loss
            l = (tf.math.reduce_mean(loss_value(target_value_batch, value_batch, network.value_support_size)) +
                 MSE(target_reward_batch, tf.squeeze(reward_batch)) +
                 tf.math.reduce_mean(
                     tf.nn.softmax_cross_entropy_with_logits(logits=policy_batch, labels=target_policy_batch)))

            # Scale the gradient of the loss by the average number of actions unrolled
            gradient_scale = 1. / len(actions_time_batch)
            loss += scale_gradient(l, gradient_scale)

            # Half the gradient of the representation
            representation_batch = scale_gradient(representation_batch, 0.5)

        return loss

    # `loss` is passed as a callable so the optimizer evaluates it itself
    optimizer.minimize(loss=loss, var_list=network.cb_get_variables())
    network.training_steps += 1
def trainer(model: tf.keras.Model,
            loss_fn: tf.keras.losses,
            x_train: np.ndarray,
            y_train: np.ndarray = None,
            dataset: tf.keras.utils.Sequence = None,
            optimizer: tf.keras.optimizers = None,
            loss_fn_kwargs: dict = None,
            preprocess_fn: Callable = None,
            epochs: int = 20,
            reg_loss_fn: Callable = (lambda model: 0),
            batch_size: int = 64,
            buffer_size: int = 1024,
            verbose: bool = True,
            log_metric: Tuple[str, "tf.keras.metrics"] = None,
            callbacks: tf.keras.callbacks = None) -> None:
    """
    Train TensorFlow model.

    Parameters
    ----------
    model
        Model to train.
    loss_fn
        Loss function used for training.
    x_train
        Training data.
    y_train
        Training labels.
    dataset
        Training dataset which returns (x, y).
    optimizer
        Optimizer used for training. When omitted, a new
        ``Adam(learning_rate=1e-3)`` is created for this call, so optimizer
        state is never shared between separate calls.
    loss_fn_kwargs
        Kwargs for loss function.
    preprocess_fn
        Preprocessing function applied to each training batch.
    epochs
        Number of training epochs.
    reg_loss_fn
        Allows an additional regularisation term to be defined as reg_loss_fn(model)
    batch_size
        Batch size used for training.
    buffer_size
        Maximum number of elements that will be buffered when prefetching.
    verbose
        Whether to print training progress.
    log_metric
        Additional metrics whose progress will be displayed if verbose equals True.
    callbacks
        Callbacks used during training.
    """
    # A default instance in the signature would be created once at definition
    # time and reused (with its slot variables) by every call.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    is_sequence = isinstance(dataset, tf.keras.utils.Sequence)
    # a user-supplied Sequence always yields (x, y); arrays only do so with labels
    return_xy = is_sequence or y_train is not None
    if not is_sequence:  # create dataset
        train_data = x_train if y_train is None else (x_train, y_train)
        dataset = tf.data.Dataset.from_tensor_slices(train_data)
        dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size)
    n_minibatch = len(dataset)

    if loss_fn_kwargs:
        loss_fn = partial(loss_fn, **loss_fn_kwargs)
    has_loss_fn = callable(loss_fn)
    apply_preprocessing = callable(preprocess_fn)

    # iterate over epochs
    for epoch in range(epochs):
        if verbose:
            pbar = tf.keras.utils.Progbar(n_minibatch, 1)
        if hasattr(dataset, 'on_epoch_end'):
            dataset.on_epoch_end()
        loss_val_ma = 0.
        for step, data in enumerate(dataset):
            x, y = data if return_xy else (data, None)
            if apply_preprocessing:
                x = preprocess_fn(x)
            with tf.GradientTape() as tape:
                y_hat = model(x)
                # without labels the (preprocessed) input is the target
                y = x if y is None else y
                if has_loss_fn:
                    args = [y, y_hat] if tf.is_tensor(y_hat) else [y] + list(y_hat)
                    loss = loss_fn(*args)
                else:
                    loss = 0.
                if model.losses:  # additional model losses
                    loss += sum(model.losses)
                loss += reg_loss_fn(model)  # alternative way they might be specified
            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            if verbose:
                loss_val = loss.numpy()
                if loss_val.shape and loss_val.shape[0] != batch_size:
                    # Pad a short (last) batch of per-sample losses with its
                    # mean so it can be combined with the running average.
                    # Works for any rank, not just 1-D and 2-D.
                    pad_shape = (batch_size - loss_val.shape[0],) + tuple(loss_val.shape[1:])
                    add_mean = np.ones(pad_shape) * loss_val.mean()
                    loss_val = np.r_[loss_val, add_mean]
                # incremental mean of the loss over the steps of this epoch
                loss_val_ma = loss_val_ma + (loss_val - loss_val_ma) / (step + 1)
                pbar_values = [('loss_ma', loss_val_ma)]
                if log_metric is not None:
                    log_metric[1](y, y_hat)
                    pbar_values.append(
                        (log_metric[0], log_metric[1].result().numpy()))
                pbar.add(1, values=pbar_values)
def run(
    self,
    loss_fn: tf.keras.losses,
    optimizer: tf.keras.optimizers,
    batch_size: int,
    num_epoch: int,
    checkpoint=None,
):
    """Pretraining session.

    Builds and preprocesses the training and validation datasets, creates a
    timestamped results directory, optionally restores the model from
    ``checkpoint`` and then runs epochs ``checkpoint + 1 .. num_epoch``.
    After every epoch the train and validation losses are recorded in
    ``self.history``, the model and the history are saved and both loss
    trackers are reset.

    Parameters
    ----------
    loss_fn
        Loss forwarded to ``self._step``.
    optimizer
        Object whose ``apply_gradients`` performs the parameter updates.
    batch_size
        Batch size used by ``padded_batch`` for both datasets.
    num_epoch
        Last epoch number (inclusive).
    checkpoint
        Epoch number to resume from; the model is loaded from it when given.
        Treated as 0 when omitted.
    """
    logger.info("Creating datasets...")
    raw_train = self.train_dataloader.create_dataset()
    raw_valid = self.valid_dataloader.create_dataset()
    train_dataset = self.model.preprocessing(raw_train)
    valid_dataset = self.model.preprocessing(raw_valid)

    logger.info("Creating results directory...")
    session_root = "results/" + self.model.title + "_mono"
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    directory = os.path.join(session_root, timestamp)
    os.makedirs(directory, exist_ok=True)

    if checkpoint is None:
        checkpoint = 0
    else:
        logger.info(f"Loading model {checkpoint}")
        self.model.load(str(checkpoint))

    logger.info("Beginning pretraining session...")
    first_epoch = checkpoint + 1
    for epoch in range(first_epoch, num_epoch + 1):
        logger.debug(f"Epoch {epoch}...")

        # Training pass: one gradient update per padded minibatch
        train_batches = train_dataset.padded_batch(
            batch_size, padded_shapes=self.model.padded_shapes
        )
        for i, minibatch in enumerate(train_batches):
            logger.debug(minibatch)
            with tf.GradientTape() as tape:
                loss = self._step(minibatch, i, loss_fn, "train", training=True)
            variables = self.model.trainable_variables
            optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))

        logger.debug("Saving training loss")
        self.history.record("train_loss", self.losses["train"].result())

        # Validation pass: forward only, no parameter update
        valid_batches = valid_dataset.padded_batch(
            batch_size, padded_shapes=self.model.padded_shapes
        )
        for i, minibatch in enumerate(valid_batches):
            self._step(minibatch, i, loss_fn, "valid", training=False)

        logger.debug("Saving validation loss")
        self.history.record("valid_loss", self.losses["valid"].result())

        self.model.save(epoch)
        for split in ("train", "valid"):
            self.losses[split].reset_states()
        self.history.save(directory + f"/history-{epoch}")
def _calculate_and_apply_gradients(model: tf.keras.Sequential,
                                   optimizer: tf.keras.optimizers,
                                   gradient_tape: tf.GradientTape,
                                   loss: [float]):
    """Differentiate ``loss`` and apply the result to ``model``.

    Parameters
    ----------
    model
        Model whose ``trainable_variables`` are differentiated against and
        updated.
    optimizer
        Object whose ``apply_gradients`` performs the update.
    gradient_tape
        Tape whose ``gradient`` method is queried for the loss.
    loss
        Loss value handed to ``gradient_tape.gradient``.
    """
    variables = model.trainable_variables
    grads = gradient_tape.gradient(loss, variables)
    grads_and_vars = zip(grads, variables)
    optimizer.apply_gradients(grads_and_vars)
def apply_gradients(optimizer: tf.keras.optimizers, gradients: tf.Tensor, variables):
    """Pair each gradient with its variable and hand the pairs to ``optimizer``.

    Parameters
    ----------
    optimizer
        Object whose ``apply_gradients`` performs the update.
    gradients
        Iterable of gradients, in the same order as ``variables``.
    variables
        Iterable of variables to update.
    """
    grads_and_vars = zip(gradients, variables)
    optimizer.apply_gradients(grads_and_vars)