def update_weights(optimizer: tf.train.Optimizer, network: Network, batch,
                   weight_decay: float):
  loss = 0
  for image, actions, targets in batch:
    # Initial step, from the real observation.
    value, reward, policy_logits, hidden_state = network.initial_inference(
        image)
    predictions = [(1.0, value, reward, policy_logits)]

    # Recurrent steps, from action and previous hidden state.
    for action in actions:
      value, reward, policy_logits, hidden_state = network.recurrent_inference(
          hidden_state, action)
      predictions.append((1.0 / len(actions), value, reward, policy_logits))

      hidden_state = tf.scale_gradient(hidden_state, 0.5)

    for prediction, target in zip(predictions, targets):
      gradient_scale, value, reward, policy_logits = prediction
      target_value, target_reward, target_policy = target

      l = (
          scalar_loss(value, target_value) +
          scalar_loss(reward, target_reward) +
          tf.nn.softmax_cross_entropy_with_logits(
              logits=policy_logits, labels=target_policy))

      loss += tf.scale_gradient(l, gradient_scale)

  for weights in network.get_weights():
    loss += weight_decay * tf.nn.l2_loss(weights)

  optimizer.minimize(loss)
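# `tf.scale_gradient` and `scalar_loss` above are not part of the TensorFlow
# API; they are stand-ins in this pseudocode. A minimal sketch of a
# gradient-scaling helper along the presumably intended lines (the forward
# value is unchanged, only the backward pass is scaled):
def scale_gradient(tensor, scale):
  """Scales the gradient flowing through `tensor` by `scale`."""
  return tensor * scale + tf.stop_gradient(tensor) * (1 - scale)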
def update_weights(optimizer: tf.train.Optimizer, network: Network, batch,
                   weight_decay: float):
  loss = 0
  for image, (target_value, target_policy) in batch:
    value, policy_logits = network.inference(image)
    loss += (
        tf.losses.mean_squared_error(value, target_value) +
        tf.nn.softmax_cross_entropy_with_logits(
            logits=policy_logits, labels=target_policy))

  for weights in network.get_weights():
    loss += weight_decay * tf.nn.l2_loss(weights)

  optimizer.minimize(loss)
def train_step(model: tf.keras.Model,
               optimizer: tf.train.Optimizer,
               loss: loss,
               x: tf.Tensor,
               y: tf.Tensor):
    """Training operation. That is, we minimize the loss function here.

    Arguments:
        model {tf.keras.Model} -- Instance of tf.keras.Model
        optimizer {tf.train.Optimizer} -- Optimizer to be used.
        loss {loss} -- Loss function.
        x {tf.Tensor} -- Input features.
        y {tf.Tensor} -- Output labels.
    """
    optimizer.minimize(loss=lambda: loss(model, x, y),
                       global_step=tf.train.get_or_create_global_step())
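# A minimal usage sketch for the `train_step` above. `build_model`,
# `softmax_loss`, and `dataset` are hypothetical stand-ins that are not
# defined in this snippet; passing a callable loss to `optimizer.minimize`
# requires eager execution in TF 1.x.
tf.enable_eager_execution()
model = build_model()  # hypothetical tf.keras.Model
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
for x, y in dataset:  # assumed to yield (features, labels) batches
    train_step(model, optimizer, softmax_loss, x, y)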
def __init__(
        self,
        environment: ControlBenchmark,
        experience_buffer: BaseExperienceBuffer,
        tensorflow_session: tf.Session,
        gamma: float,
        layer_sizes: List[int],
        layer_activations: List[str],
        shared_layers: int,
        tau: float,
        optimizer: tf.train.Optimizer,
        batch_size: int,
) -> None:
    super().__init__(environment=environment,
                     experience_buffer=experience_buffer)
    self.shared_layers = shared_layers
    self.tensorflow_session = tensorflow_session
    self.batch_size = batch_size

    self.Q = NAFNetwork(layer_sizes=layer_sizes,
                        layer_activations=layer_activations,
                        shared_layers=shared_layers,
                        state_shape=environment.state_shape,
                        action_shape=environment.action_shape)
    self.Q_lowpass = NAFNetwork(layer_sizes=layer_sizes,
                                layer_activations=layer_activations,
                                shared_layers=shared_layers,
                                state_shape=environment.state_shape,
                                action_shape=environment.action_shape)
    self.Q_lowpass.model.set_weights(self.Q.model.get_weights())

    self.observation_input = tf.keras.Input(
        shape=self.environment.state_shape, name='state')
    self.next_observation_input = tf.keras.Input(
        shape=self.environment.state_shape, name='next_state')
    self.action_input = tf.keras.Input(shape=self.environment.action_shape,
                                       name='action_placeholder')
    self.reward_input = tf.keras.Input(shape=(), name='reward')
    self.terminal_input = tf.keras.Input(shape=(), name='terminal')
    self.p_continue = gamma * (1 - self.terminal_input)

    self.frozen_parameter_update_op = periodic_target_update(
        target_variables=self.Q_lowpass.model.variables,
        source_variables=self.Q.model.variables,
        update_period=1,
        tau=tau)

    self.q_values_policy, self.mu_policy, _ = self.Q(
        state_action=[self.observation_input, self.action_input])
    _, _, self.vt_lowpass = self.Q_lowpass(
        state_action=[self.next_observation_input, self.action_input])
    # action is not actually used here to calculate the value

    self.target = self.reward_input + self.p_continue * self.vt_lowpass
    rl_loss = tf.reduce_mean(0.5 * (self.q_values_policy - self.target)**2)
    self.train_op = optimizer.minimize(rl_loss)

    self._initialize_tf_variables()
def __init__(
    self,
    obs_spec: specs.Array,
    action_spec: specs.DiscreteArray,
    network: snt.AbstractModule,
    optimizer: tf.train.Optimizer,
    sequence_length: int,
    td_lambda: float,
    agent_discount: float,
    seed: int,
):
  """A simple actor-critic agent."""
  del action_spec  # unused
  tf.set_random_seed(seed)
  self._sequence_length = sequence_length
  self._count = 0

  # Create the policy ops.
  obs = tf.placeholder(shape=obs_spec.shape, dtype=obs_spec.dtype)
  online_logits, _ = network(tf.expand_dims(obs, 0))
  action = tf.squeeze(
      tf.multinomial(online_logits, 1, output_dtype=tf.int32))

  # Create placeholders and numpy arrays for learning from trajectories.
  shapes = [obs_spec.shape, (), (), ()]
  dtypes = [obs_spec.dtype, np.int32, np.float32, np.float32]
  placeholders = [
      tf.placeholder(shape=(self._sequence_length, 1) + shape, dtype=dtype)
      for shape, dtype in zip(shapes, dtypes)
  ]
  observations, actions, rewards, discounts = placeholders
  self.arrays = [
      np.zeros(shape=(self._sequence_length, 1) + shape, dtype=dtype)
      for shape, dtype in zip(shapes, dtypes)
  ]

  # Build actor and critic losses.
  logits, values = snt.BatchApply(network)(observations)
  _, bootstrap_value = network(tf.expand_dims(obs, 0))
  critic_loss, (advantages, _) = td_lambda_loss(
      state_values=values,
      rewards=rewards,
      pcontinues=agent_discount * discounts,
      bootstrap_value=bootstrap_value,
      lambda_=td_lambda)
  actor_loss = discrete_policy_gradient_loss(logits, actions, advantages)
  train_op = optimizer.minimize(actor_loss + critic_loss)

  # Create TF session and callables.
  session = tf.Session()
  self._policy_fn = session.make_callable(action, [obs])
  self._update_fn = session.make_callable(train_op, placeholders + [obs])
  session.run(tf.global_variables_initializer())
def train(self,
          x_data: tf.Tensor,
          y_data: tf.Tensor,
          example_number: int,
          epochs: int,
          batch_size: int,
          activation_function: ActivationFunction,
          cost_function: CostFunction,
          optimizer_function: tf.train.Optimizer,
          ) -> None:
    assert x_data.shape[0] == y_data.shape[0] == example_number

    x_dataset = tf.data.Dataset.from_tensor_slices(x_data)
    y_dataset = tf.data.Dataset.from_tensor_slices(y_data)
    assert x_dataset.output_shapes == self.x_size
    assert y_dataset.output_shapes == self.y_size
    assert self.is_initialized

    self.activation_function = activation_function

    # Build the training graph: batched inputs, predictions, cost and the
    # optimizer's update op.
    batched_x, batched_y = (i.batch(batch_size)
                            for i in (x_dataset, y_dataset))
    x_batch, y_batch = (
        tf.placeholder('float', shape=(batch_size, self.x_size)),
        tf.placeholder('float', shape=(batch_size, self.y_size)))
    pred_y_batch = tf.map_fn(self.model, x_batch)
    cost = tf.reduce_mean(cost_function(pred_y_batch, y_batch))
    optimizer = optimizer_function.minimize(cost)

    print('Training...')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            epoch_loss = 0
            x_it, y_it = (b.make_one_shot_iterator()
                          for b in (batched_x, batched_y))
            curr_x_init = x_it.get_next()
            curr_y_init = y_it.get_next()
            # Run one optimization step per batch until the iterators are
            # exhausted.
            while True:
                try:
                    curr_x = sess.run(curr_x_init)
                    curr_y = sess.run(curr_y_init)
                    _, c = sess.run([optimizer, cost],
                                    feed_dict={x_batch: curr_x,
                                               y_batch: curr_y})
                    epoch_loss += c
                except tf.errors.OutOfRangeError:
                    break
            print('Epoch', epoch + 1, 'out of', epochs, 'completed.',
                  'Current loss:', epoch_loss)
        self.test_accuracy(sess, x_data, y_data, example_number)
def train_model(optimizer: tf.train.Optimizer, loss: tf.Tensor):
    """Minimize the loss with respect to the model variables.

    Args:
        optimizer (tf.train.Optimizer): Optimizer performing the update.
        loss (tf.Tensor): Loss value as defined by a loss function.

    Returns:
        An Operation that updates the variables in `var_list` and also
        increments `global_step`.
    """
    return optimizer.minimize(loss=loss,
                              global_step=tf.train.get_or_create_global_step())
def adversarial_train_op_func(
        generator_loss: tf.Tensor,
        discriminator_loss: tf.Tensor,
        generator_weights: List[tf.Variable],
        discriminator_weights: List[tf.Variable],
        n_gen_steps: int = 1,
        n_disc_steps: int = 5,
        optimizer: tf.train.Optimizer = tf.train.RMSPropOptimizer(0.0005)
) -> tf.Operation:
    """
    Build the adversarial train operation (n_disc_steps discriminator
    optimization steps followed by n_gen_steps generator optimization steps).

    Arguments:
        generator_loss -- generator loss.
        discriminator_loss -- discriminator loss.
        generator_weights -- list of generator trainable weights.
        discriminator_weights -- list of discriminator trainable weights.
        n_gen_steps -- number of generator update steps per single train
            operation, optional (default = 1).
        n_disc_steps -- number of discriminator update steps per single train
            operation, optional (default = 5).
        optimizer -- optimizer to use, optional
            (default = tf.train.RMSPropOptimizer(0.0005)).
    """
    disc_train_op = _op_repeat_n(
        lambda: optimizer.minimize(discriminator_loss,
                                   var_list=discriminator_weights),
        n_disc_steps)
    with tf.control_dependencies([disc_train_op]):
        gen_train_op = _op_repeat_n(
            lambda: optimizer.minimize(generator_loss,
                                       var_list=generator_weights),
            n_gen_steps)
    return gen_train_op
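# A usage sketch for `adversarial_train_op_func`, assuming `gen_loss` and
# `disc_loss` tensors already exist and that the generator/discriminator were
# built under variable scopes named 'generator' and 'discriminator'; all of
# these names are placeholders. One run of the returned op performs
# n_disc_steps discriminator updates followed by n_gen_steps generator updates.
train_op = adversarial_train_op_func(
    generator_loss=gen_loss,
    discriminator_loss=disc_loss,
    generator_weights=tf.trainable_variables('generator'),
    discriminator_weights=tf.trainable_variables('discriminator'))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)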
def feature_eval_setup(
        sess: Session,
        X: Tensor,
        Z: Tensor,
        data_train: DataSet,
        data_test: DataSet,
        eval_fn: Callable[[Tensor, Tensor], Tensor],
        eval_loss_fn: Callable[[Tensor, Tensor], Tensor],
        supervise_net: Optional[Callable[[Tensor], Tensor]] = None,
        optimizer: tf.train.Optimizer = (
            tf.train.RMSPropOptimizer(learning_rate=1e-4)),
        mb_size: Optional[int] = 128,
        max_iter: int = 5000,
        restart_training: bool = True
) -> Callable[[Session], Tuple[Number, Number]]:
    with tf.variable_scope('feature_eval'):
        if supervise_net is not None:
            y_logits = supervise_net(Z)
        else:
            y_logits = dense_net(Z, [256, data_train.dim_y])
        y_hat = tf.sigmoid(y_logits)
        y = tf.placeholder(tf.float32, [None] + data_train.dim_Y)

        eval_loss = tf.reduce_mean(eval_loss_fn(y_logits, y))
        eval_result = eval_fn(y_hat, y)

        vars_fteval = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope='feature_eval')
        train = optimizer.minimize(eval_loss, var_list=vars_fteval)

    eval_vars_initializer = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          scope='feature_eval'))
    sess.run(eval_vars_initializer)

    def feature_eval(_sess: Session) -> Tuple[Number, Number]:
        if restart_training:
            _sess.run(eval_vars_initializer)
        for _ in tqdm(range(max_iter)):
            if mb_size is not None:
                _mb = data_train.sample(mb_size)
            else:
                _mb = data_train
            data_feed = {X: _mb.x, y: _mb.y}
            _sess.run(train, feed_dict=data_feed)

        data_feed = {X: data_test.x, y: data_test.y}
        val_eval_loss = _sess.run(eval_loss, feed_dict=data_feed)
        val_eval = _sess.run(eval_result, feed_dict=data_feed)
        return val_eval_loss, val_eval

    return feature_eval
def train_step(model: tf.keras.Model,
               optimizer: tf.train.Optimizer,
               loss_func: loss,
               inputs: tf.Tensor,
               labels: tf.Tensor,
               **kwargs):
    """Kicks off training for a given model.

    Args:
        model (tf.keras.Model): Model to be trained.
        optimizer (tf.train.Optimizer): Optimizer performing the update.
        loss_func (loss): Loss function.
        inputs (tf.Tensor): Dataset's input features.
        labels (tf.Tensor): Dataset's true labels.

    Keyword Args:
        sparse (bool): False if labels are not one-hot encoded.

    Returns:
        An Operation that updates the variables in `var_list`. If
        `global_step` was not `None`, that operation also increments
        `global_step`.
    """
    return optimizer.minimize(
        loss=lambda: loss_func(model, inputs, labels, **kwargs),
        global_step=tf.train.get_or_create_global_step())
def __init__(
        self,
        obs_dim: int,
        latent_dim: int,
        encoder: Callable[[tf.Tensor, int], tf.Tensor],
        decoder: Callable[[tf.Tensor], tf.Tensor],
        decoder_loss: Callable[[tf.Tensor, tf.Tensor], tf.Tensor],
        optimiser: tf.train.Optimizer,
        seed: int,
) -> None:
    super().__init__(obs_dim, latent_dim)
    tf.set_random_seed(seed)

    obs_pl = tf.placeholder(tf.float32, [None, obs_dim])
    latent_pl = tf.placeholder(tf.float32, [None, latent_dim])

    latent_dist_params = encoder(obs_pl, latent_dim)
    latent = self._build_sampled_latent(latent_dist_params)
    generated_dist_params = decoder(latent)

    _fixed_prior = tf.random_normal([1, latent_dim])
    _generated_dist_params = decoder(latent_pl)

    loss = self._kl_divergence(latent_dist_params)
    loss += decoder_loss(obs_pl, generated_dist_params)
    loss *= np.log2(np.e)  # convert the loss from nats to bits
    train_op = optimiser.minimize(loss)

    session = tf.Session()
    self._prior_fn = session.make_callable(_fixed_prior)
    self._posterior_fn = session.make_callable(latent_dist_params, [obs_pl])
    self._generator_fn = session.make_callable(_generated_dist_params,
                                               [latent_pl])
    self._loss_fn = session.make_callable(loss, [obs_pl])
    self._train_fn = session.make_callable(train_op, [obs_pl])
    session.run(tf.global_variables_initializer())

    self.session = session
    self.saver = tf.train.Saver()
def get_gradient_op(tensors: MDPTensors,
                    objective_initial_scales: SRLObjectives,
                    optimizer: tf.train.Optimizer,
                    gradient_clip: Optional[float],
                    **kwargs):
    objectives: SRLObjectives = SRLObjectives(
        value_function=ValueFunction(
            tensors, objective_initial_scales.value_function, **kwargs),
        reward_prediction=RewardPrediction(
            tensors, objective_initial_scales.reward_prediction, **kwargs),
        auto_encoding=AutoEncodingPrediction(
            tensors, objective_initial_scales.auto_encoding, **kwargs),
        forward_dynamics=ForwardDynamicsPrediction(
            tensors, objective_initial_scales.forward_dynamics, **kwargs),
        inverse_dynamics=InverseDynamicsPrediction(
            tensors, objective_initial_scales.inverse_dynamics, **kwargs),
        slowness=SlownessLoss(
            tensors, objective_initial_scales.slowness, **kwargs),
        diversity=DiversityLoss(
            tensors, objective_initial_scales.diversity, **kwargs),
    )

    active_objectives = [
        o for o in objectives
        if o is not None and backend.get_value(o.scale) > 0
    ]
    total_loss = backend.mean(
        backend.stack([o.loss for o in active_objectives]))

    if gradient_clip is not None:
        gradients = optimizer.compute_gradients(total_loss)
        for i, (grad, var) in enumerate(gradients):
            if grad is not None:
                gradients[i] = (tf.clip_by_norm(grad, gradient_clip), var)
        return optimizer.apply_gradients(gradients)
    else:
        return optimizer.minimize(total_loss)
def __init__(
    self,
    obs_spec: dm_env.specs.Array,
    action_spec: dm_env.specs.BoundedArray,
    ensemble: Sequence[snt.AbstractModule],
    target_ensemble: Sequence[snt.AbstractModule],
    batch_size: int,
    agent_discount: float,
    replay_capacity: int,
    min_replay_size: int,
    sgd_period: int,
    target_update_period: int,
    optimizer: tf.train.Optimizer,
    mask_prob: float,
    noise_scale: float,
    epsilon_fn: Callable[[int], float] = lambda _: 0.,
    seed: int = None,
):
  """Bootstrapped DQN with additive prior functions."""
  # Dqn configurations.
  self._ensemble = ensemble
  self._target_ensemble = target_ensemble
  self._num_actions = action_spec.maximum - action_spec.minimum + 1
  self._batch_size = batch_size
  self._sgd_period = sgd_period
  self._target_update_period = target_update_period
  self._min_replay_size = min_replay_size
  self._epsilon_fn = epsilon_fn
  self._replay = replay.Replay(capacity=replay_capacity)
  self._mask_prob = mask_prob
  self._noise_scale = noise_scale
  self._rng = np.random.RandomState(seed)
  tf.set_random_seed(seed)

  self._total_steps = 0
  self._total_episodes = 0
  self._active_head = 0
  self._num_ensemble = len(ensemble)
  assert len(ensemble) == len(target_ensemble)

  # Making the tensorflow graph
  session = tf.Session()

  # Placeholders = (obs, action, reward, discount, next_obs, mask, noise)
  o_tm1 = tf.placeholder(shape=(None,) + obs_spec.shape, dtype=obs_spec.dtype)
  a_tm1 = tf.placeholder(shape=(None,), dtype=action_spec.dtype)
  r_t = tf.placeholder(shape=(None,), dtype=tf.float32)
  d_t = tf.placeholder(shape=(None,), dtype=tf.float32)
  o_t = tf.placeholder(shape=(None,) + obs_spec.shape, dtype=obs_spec.dtype)
  m_t = tf.placeholder(shape=(None, self._num_ensemble), dtype=tf.float32)
  z_t = tf.placeholder(shape=(None, self._num_ensemble), dtype=tf.float32)

  losses = []
  value_fns = []
  target_updates = []
  for k in range(self._num_ensemble):
    model = self._ensemble[k]
    target_model = self._target_ensemble[k]
    q_values = model(o_tm1)
    train_value = batched_index(q_values, a_tm1)
    target_value = tf.stop_gradient(
        tf.reduce_max(target_model(o_t), axis=-1))
    target_y = r_t + z_t[:, k] + agent_discount * d_t * target_value
    loss = tf.square(train_value - target_y) * m_t[:, k]

    value_fn = session.make_callable(q_values, [o_tm1])
    target_update = update_target_variables(
        target_variables=target_model.get_all_variables(),
        source_variables=model.get_all_variables(),
    )

    losses.append(loss)
    value_fns.append(value_fn)
    target_updates.append(target_update)

  sgd_op = optimizer.minimize(tf.stack(losses))
  self._value_fns = value_fns
  self._sgd_step = session.make_callable(
      sgd_op, [o_tm1, a_tm1, r_t, d_t, o_t, m_t, z_t])
  self._update_target_nets = session.make_callable(target_updates)
  session.run(tf.global_variables_initializer())