Example #1
def update_weights(optimizer: tf.train.Optimizer, network: Network, batch,
                   weight_decay: float):
  loss = 0
  for image, actions, targets in batch:
    # Initial step, from the real observation.
    value, reward, policy_logits, hidden_state = network.initial_inference(
        image)
    predictions = [(1.0, value, reward, policy_logits)]

    # Recurrent steps, from action and previous hidden state.
    for action in actions:
      value, reward, policy_logits, hidden_state = network.recurrent_inference(
          hidden_state, action)
      predictions.append((1.0 / len(actions), value, reward, policy_logits))

      hidden_state = tf.scale_gradient(hidden_state, 0.5)

    for prediction, target in zip(predictions, targets):
      gradient_scale, value, reward, policy_logits = prediction
      target_value, target_reward, target_policy = target

      l = (
          scalar_loss(value, target_value) +
          scalar_loss(reward, target_reward) +
          tf.nn.softmax_cross_entropy_with_logits(
              logits=policy_logits, labels=target_policy))

      loss += tf.scale_gradient(l, gradient_scale)

  for weights in network.get_weights():
    loss += weight_decay * tf.nn.l2_loss(weights)

  optimizer.minimize(loss)
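The example above relies on two helpers that are not part of the TensorFlow API: scale_gradient and scalar_loss. Plausible definitions are sketched below (assumptions matching the MuZero-style pseudocode this example resembles, TF 1.x graph mode):

def scale_gradient(tensor, scale: float):
  # Forward value is unchanged; only the gradient flowing back through
  # `tensor` is multiplied by `scale`.
  return tensor * scale + tf.stop_gradient(tensor) * (1 - scale)

def scalar_loss(prediction, target):
  # Simplest (board-game) form: squared error. Categorical value/reward
  # targets would use a cross-entropy variant instead.
  return tf.reduce_sum(tf.squared_difference(prediction, target))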
Example #2
def update_weights(optimizer: tf.train.Optimizer, network: Network, batch,
                   weight_decay: float):
    loss = 0
    for image, (target_value, target_policy) in batch:
        value, policy_logits = network.inference(image)
        loss += (tf.losses.mean_squared_error(value, target_value) +
                 tf.nn.softmax_cross_entropy_with_logits(logits=policy_logits,
                                                         labels=target_policy))

    for weights in network.get_weights():
        loss += weight_decay * tf.nn.l2_loss(weights)

    optimizer.minimize(loss)
Example #3
def train_step(model: tf.keras.Model, optimizer: tf.train.Optimizer,
               loss: loss, x: tf.Tensor, y: tf.Tensor):
    """Training operation. That is, we minimize the loss function here.

    Arguments:
        model {tf.keras.Model} -- Instance of tf.keras.Model
        optimizer {tf.train.Optimizer} -- Optimizer to be used.
        loss {loss} -- Loss function.
        x {tf.Tensor} -- Input features.
        y {tf.Tensor} -- Output labels.
    """
    optimizer.minimize(loss=lambda: loss(model, x, y),
                       global_step=tf.train.get_or_create_global_step())
Example #4
def dns_grad_op(loss, optimizer: tf.train.Optimizer, variables=None, global_step=None):
    """ Create an operation that updates the weights by gradient descent.

    In DNS, the weights are updated according to their derivative with respect to the masked
    values, but the update is applied to the non-masked values, so that zeroed-out weights may
    still change and in particular be spliced back in if necessary.

    Parameters
    ----------
    loss: A `tf.Tensor` representing the loss.
    optimizer: The optimizer to use.
    variables: The variables for which to create the gradient operation.
    global_step: An optional global step to increment.

    Returns
    -------
    train_op: A TensorFlow op that, when run, updates the variables according to the gradient.
    """
    if variables is None:
        variables = tf.trainable_variables()

    replaced = {}

    wrt_variables = []

    num_replaced = 0

    for v in variables:
        # look for variables having shadow values.
        mvs = tf.get_collection(MASKED_WEIGHT_COLLECTION, v.op.name)

        if len(mvs) == 0:
            wrt_variables.append(v)
        elif len(mvs) == 1:
            num_replaced += 1
            wrt_variables.append(mvs[0])
            replaced[mvs[0]] = v
        else:
            raise ValueError('More than one masked weight for a given variable.')

    tf.logging.info('Replaced {0} variables for Dynamic Network Surgery'.format(num_replaced))

    grads_and_vars = optimizer.compute_gradients(loss, wrt_variables)
    grads_and_vars = [(g, replaced.get(v, v)) for g, v in grads_and_vars]

    train_op = optimizer.apply_gradients(grads_and_vars, global_step, 'dns_grad_op')

    return train_op
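The registration side of DNS is not shown above. A minimal sketch, assuming MASKED_WEIGHT_COLLECTION is the module-level collection name used by dns_grad_op and that each masked tensor is named under its source variable's scope so the v.op.name lookup finds it (both assumptions):

MASKED_WEIGHT_COLLECTION = 'masked_weights'  # same constant referenced by dns_grad_op (value assumed)

x = tf.placeholder(tf.float32, [None, 784])
w = tf.get_variable('fc1/kernel', shape=[784, 256])
mask = tf.get_variable('fc1/kernel_mask', shape=[784, 256], trainable=False,
                       initializer=tf.ones_initializer())
# The forward pass (and hence the gradient) uses the masked value, but
# dns_grad_op applies the update to the dense weight `w`, so pruned entries
# can keep moving and be spliced back in later.
masked_w = tf.multiply(w, mask, name='fc1/kernel_masked')
tf.add_to_collection(MASKED_WEIGHT_COLLECTION, masked_w)

loss = tf.reduce_mean(tf.square(tf.matmul(x, masked_w)))
train_op = dns_grad_op(loss, tf.train.MomentumOptimizer(0.01, 0.9))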
Example #5
    def __init__(
        self,
        environment: ControlBenchmark,
        experience_buffer: BaseExperienceBuffer,
        tensorflow_session: tf.Session,
        gamma: float,
        layer_sizes: List[int],
        layer_activations: List[str],
        shared_layers: int,
        tau: float,
        optimizer: tf.train.Optimizer,
        batch_size: int,
    ) -> None:
        super().__init__(environment=environment,
                         experience_buffer=experience_buffer)
        self.shared_layers = shared_layers
        self.tensorflow_session = tensorflow_session
        self.batch_size = batch_size

        self.Q = NAFNetwork(layer_sizes=layer_sizes,
                            layer_activations=layer_activations,
                            shared_layers=shared_layers,
                            state_shape=environment.state_shape,
                            action_shape=environment.action_shape)
        self.Q_lowpass = NAFNetwork(layer_sizes=layer_sizes,
                                    layer_activations=layer_activations,
                                    shared_layers=shared_layers,
                                    state_shape=environment.state_shape,
                                    action_shape=environment.action_shape)

        self.Q_lowpass.model.set_weights(self.Q.model.get_weights())

        self.observation_input = tf.keras.Input(
            shape=self.environment.state_shape, name='state')
        self.next_observation_input = tf.keras.Input(
            shape=self.environment.state_shape, name='next_state')
        self.action_input = tf.keras.Input(shape=self.environment.action_shape,
                                           name='action_placeholder')
        self.reward_input = tf.keras.Input(shape=(), name='reward')
        self.terminal_input = tf.keras.Input(shape=(), name='terminal')

        self.p_continue = gamma * (1 - self.terminal_input)

        self.frozen_parameter_update_op = periodic_target_update(
            target_variables=self.Q_lowpass.model.variables,
            source_variables=self.Q.model.variables,
            update_period=1,
            tau=tau)

        self.q_values_policy, self.mu_policy, _ = self.Q(
            state_action=[self.observation_input, self.action_input])
        _, _, self.vt_lowpass = self.Q_lowpass(
            state_action=[self.next_observation_input, self.action_input])
        # action is not actually used here to calculate the value

        self.target = self.reward_input + self.p_continue * self.vt_lowpass
        rl_loss = tf.reduce_mean(0.5 * (self.q_values_policy - self.target)**2)
        self.train_op = optimizer.minimize(rl_loss)

        self._initialize_tf_variables()
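periodic_target_update above comes from an external utility (trfl exposes one with this signature); with update_period=1 it amounts to a soft update of the low-pass network toward the online Q network. A minimal sketch of that soft update, assuming the two variable lists are aligned by position:

def soft_target_update(target_vars, source_vars, tau):
    # target <- (1 - tau) * target + tau * source, per variable
    return tf.group(*[
        t.assign((1.0 - tau) * t + tau * s)
        for t, s in zip(target_vars, source_vars)
    ])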
Example #6
    def train(self,
              x_data: tf.Tensor,
              y_data: tf.Tensor,
              example_number: int,
              epochs: int,
              batch_size: int,
              activation_function: ActivationFunction,
              cost_function: CostFunction,
              optimizer_function: tf.train.Optimizer,
              ) -> None:

        assert x_data.shape[0] == y_data.shape[0] == example_number

        x_dataset = tf.data.Dataset.from_tensor_slices(x_data)
        y_dataset = tf.data.Dataset.from_tensor_slices(y_data)

        assert x_dataset.output_shapes == self.x_size
        assert y_dataset.output_shapes == self.y_size
        assert self.is_initialized

        self.activation_function = activation_function

        batched_x, batched_y = (i.batch(batch_size)
                                for i in (x_dataset, y_dataset))

        x_batch, y_batch = (
            tf.placeholder('float', shape=(batch_size, self.x_size)),
            tf.placeholder('float', shape=(batch_size, self.y_size)))

        pred_y_batch = tf.map_fn(self.model, x_batch)
        cost = tf.reduce_mean(cost_function(pred_y_batch, y_batch))
        optimizer = optimizer_function.minimize(cost)

        print('Training...')
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(epochs):
                epoch_loss = 0
                x_it, y_it = (b.make_one_shot_iterator()
                              for b in (batched_x, batched_y))

                curr_x_init = x_it.get_next()
                curr_y_init = y_it.get_next()

                while True:
                    try:
                        curr_x = sess.run(curr_x_init)
                        curr_y = sess.run(curr_y_init)
                        _, c = sess.run([optimizer, cost], feed_dict={
                            x_batch: curr_x, y_batch: curr_y
                        })
                        epoch_loss += c
                    except tf.errors.OutOfRangeError:
                        break
                print('Epoch', epoch + 1, 'out of', epochs, 'completed.',
                      'Current loss:', epoch_loss)
            self.test_accuracy(sess, x_data, y_data, example_number)
Example #7
    def __init__(
        self,
        obs_spec: specs.Array,
        action_spec: specs.DiscreteArray,
        network: snt.AbstractModule,
        optimizer: tf.train.Optimizer,
        sequence_length: int,
        td_lambda: float,
        agent_discount: float,
        seed: int,
    ):
        """A simple actor-critic agent."""
        del action_spec  # unused
        tf.set_random_seed(seed)
        self._sequence_length = sequence_length
        self._count = 0

        # Create the policy ops.
        obs = tf.placeholder(shape=obs_spec.shape, dtype=obs_spec.dtype)
        online_logits, _ = network(tf.expand_dims(obs, 0))
        action = tf.squeeze(
            tf.multinomial(online_logits, 1, output_dtype=tf.int32))

        # Create placeholders and numpy arrays for learning from trajectories.
        shapes = [obs_spec.shape, (), (), ()]
        dtypes = [obs_spec.dtype, np.int32, np.float32, np.float32]

        placeholders = [
            tf.placeholder(shape=(self._sequence_length, 1) + shape,
                           dtype=dtype)
            for shape, dtype in zip(shapes, dtypes)
        ]
        observations, actions, rewards, discounts = placeholders

        self.arrays = [
            np.zeros(shape=(self._sequence_length, 1) + shape, dtype=dtype)
            for shape, dtype in zip(shapes, dtypes)
        ]

        # Build actor and critic losses.
        logits, values = snt.BatchApply(network)(observations)
        _, bootstrap_value = network(tf.expand_dims(obs, 0))

        critic_loss, (advantages, _) = td_lambda_loss(
            state_values=values,
            rewards=rewards,
            pcontinues=agent_discount * discounts,
            bootstrap_value=bootstrap_value,
            lambda_=td_lambda)
        actor_loss = discrete_policy_gradient_loss(logits, actions, advantages)
        train_op = optimizer.minimize(actor_loss + critic_loss)

        # Create TF session and callables.
        session = tf.Session()
        self._policy_fn = session.make_callable(action, [obs])
        self._update_fn = session.make_callable(train_op, placeholders + [obs])
        session.run(tf.global_variables_initializer())
Example #8
def ClippingOptimizer(opt: tf.train.Optimizer, low, high):
    original = opt.apply_gradients

    def apply_gradients(grads_and_vars, *a, **kw):
        app = original(grads_and_vars, *a, **kw)
        asg = [
            v.assign_add(tf.maximum(high - v, 0) + tf.minimum(low - v, 0))
            for g, v in grads_and_vars
        ]
        return tf.group(app, *asg)  # note that clipping is asynchronous here

    opt.apply_gradients = apply_gradients
    return opt
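Usage is just wrapping an existing optimizer: because minimize() delegates to apply_gradients(), the patched clipping runs on every training step. A small sketch (the loss tensor is assumed):

clipping_opt = ClippingOptimizer(tf.train.GradientDescentOptimizer(0.1),
                                 low=-1.0, high=1.0)
train_op = clipping_opt.minimize(loss)  # weights are pushed back into [low, high] after each update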
Example #9
def train_model(optimizer: tf.train.Optimizer, loss: tf.Tensor):
    """Minimize the loss with respect to the model variables.

    Args:
        optimizer (tf.train.Optimizer):
        loss (tf.Tensor): Loss value as defined by a loss function.

    Returns:
        An Operation that updates the variables in `var_list`
        & also increments `global_step`.
    """
    return optimizer.minimize(loss=loss,
                              global_step=tf.train.get_or_create_global_step())
Example #10
 def __distribute_training(self, iterator: tf.data.Iterator,
                           optimizer: tf.train.Optimizer) -> DistributedOps:
     gpus_to_use = self.__get_gpu_to_use()
     gradients, loss_operations = [], []
     for gpu_id in gpus_to_use:
         multi_gpu_operations = self.__place_operations(
             target_gpu_id=gpu_id, iterator=iterator, optimizer=optimizer)
         gradients.append(multi_gpu_operations.gradient)
         loss_operations.append(multi_gpu_operations.loss_operation)
     gradients = average_gradients(gradients)
     loss_operation = average_loss(loss_operations)
     training_step = optimizer.apply_gradients(gradients)
     return loss_operation, training_step
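average_gradients and average_loss are external helpers. The usual multi-GPU formulation of the gradient averaging, sketched under the assumption that every tower returns its (gradient, variable) pairs for the same variables in the same order:

def average_gradients(tower_grads_and_vars):
    averaged = []
    for per_variable in zip(*tower_grads_and_vars):
        grads = [g for g, _ in per_variable if g is not None]
        variable = per_variable[0][1]
        averaged.append((tf.reduce_mean(tf.stack(grads), axis=0), variable))
    return averaged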
Example #11
def adversarial_train_op_func(
        generator_loss: tf.Tensor,
        discriminator_loss: tf.Tensor,
        generator_weights: List[tf.Variable],
        discriminator_weights: List[tf.Variable],
        n_gen_steps: int = 1,
        n_disc_steps: int = 5,
        optimizer: tf.train.Optimizer = tf.train.RMSPropOptimizer(0.0005)
    ) -> tf.Operation:
    """
    Build the adversarial train operation (n_disc_steps discriminator optimization steps
    followed by n_gen_steps generator optimization steps).

    Arguments:

    generator_loss -- generator loss.
    discriminator_loss -- discriminator loss.
    generator_weights -- list of generator trainable weights.
    discriminator_weights -- list of discriminator trainable weights.
    n_gen_steps -- number of generator update steps per single train operation,
        optional (default = 1).
    n_disc_steps -- number of discriminator update steps per single train
        operation, optional (default = 5).
    optimizer -- optimizer to use, optional (default = tf.train.RMSPropOptimizer(0.0005))
    """
    disc_train_op = _op_repeat_n(
            lambda: optimizer.minimize(discriminator_loss, var_list=discriminator_weights),
            n_disc_steps
        )
    
    with tf.control_dependencies([disc_train_op]):
        gen_train_op = _op_repeat_n(
                lambda: optimizer.minimize(generator_loss, var_list=generator_weights),
                n_gen_steps
            )
    
    return gen_train_op
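_op_repeat_n is not shown in this example; a plausible sketch (an assumption about the original helper) that builds n copies of the op and sequences them with control dependencies so each repetition runs after the previous one:

def _op_repeat_n(make_op, n: int) -> tf.Operation:
    op = make_op()
    for _ in range(n - 1):
        with tf.control_dependencies([op]):
            op = make_op()
    return op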
Example #12
def get_gradient_op(tensors: MDPTensors,
                    objective_initial_scales: SRLObjectives,
                    optimizer: tf.train.Optimizer,
                    gradient_clip: Optional[float], **kwargs):
    objectives: SRLObjectives = SRLObjectives(
        value_function=ValueFunction(tensors,
                                     objective_initial_scales.value_function,
                                     **kwargs),
        reward_prediction=RewardPrediction(
            tensors, objective_initial_scales.reward_prediction, **kwargs),
        auto_encoding=AutoEncodingPrediction(
            tensors, objective_initial_scales.auto_encoding, **kwargs),
        forward_dynamics=ForwardDynamicsPrediction(
            tensors, objective_initial_scales.forward_dynamics, **kwargs),
        inverse_dynamics=InverseDynamicsPrediction(
            tensors, objective_initial_scales.inverse_dynamics, **kwargs),
        slowness=SlownessLoss(tensors, objective_initial_scales.slowness,
                              **kwargs),
        diversity=DiversityLoss(tensors, objective_initial_scales.diversity,
                                **kwargs),
    )

    active_objectives = [
        o for o in objectives
        if o is not None and backend.get_value(o.scale) > 0
    ]
    total_loss = backend.mean(
        backend.stack([o.loss for o in active_objectives]))

    if gradient_clip is not None:
        gradients = optimizer.compute_gradients(total_loss)
        for i, (grad, var) in enumerate(gradients):
            if grad is not None:
                gradients[i] = (tf.clip_by_norm(grad, gradient_clip), var)
        return optimizer.apply_gradients(gradients)
    else:
        return optimizer.minimize(total_loss)
Example #13
def feature_eval_setup(sess: Session,
                       X: Tensor,
                       Z: Tensor,
                       data_train: DataSet,
                       data_test: DataSet,
                       eval_fn: Callable[[Tensor, Tensor], Tensor],
                       eval_loss_fn: Callable[[Tensor, Tensor], Tensor],
                       supervise_net: Optional[Callable[[Tensor], Tensor]] = None,
                       optimizer: tf.train.Optimizer = (
                               tf.train.RMSPropOptimizer(learning_rate=1e-4)),
                       mb_size: Optional[int] = 128,
                       max_iter: int = 5000,
                       restart_training: bool = True
                       ) -> Callable[[Session], Tuple[Number, Number]]:
    with tf.variable_scope('feature_eval'):
        if supervise_net is not None:
            y_logits = supervise_net(Z)
        else:
            y_logits = dense_net(Z, [256, data_train.dim_y])

    y_hat = tf.sigmoid(y_logits)
    y = tf.placeholder(tf.float32, [None] + data_train.dim_Y)
    eval_loss = tf.reduce_mean(eval_loss_fn(y_logits, y))
    eval_result = eval_fn(y_hat, y)
    vars_fteval = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='feature_eval')
    train = optimizer.minimize(eval_loss, var_list=vars_fteval)
    eval_vars_initializer = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='feature_eval'))
    sess.run(eval_vars_initializer)

    def feature_eval(_sess: Session) -> Tuple[Number, Number]:
        if restart_training:
            _sess.run(eval_vars_initializer)
        for _ in tqdm(range(max_iter)):
            if mb_size is not None:
                _mb = data_train.sample(mb_size)
            else:
                _mb = data_train
            data_feed = {X: _mb.x, y: _mb.y}
            _sess.run(train, feed_dict=data_feed)
        data_feed = {X: data_test.x, y: data_test.y}
        val_eval_loss = _sess.run(eval_loss, feed_dict=data_feed)
        val_eval = _sess.run(eval_result, feed_dict=data_feed)
        return val_eval_loss, val_eval

    return feature_eval
Example #14
def ClippingOptimizer(opt: tf.train.Optimizer, low, high):
    original = opt.apply_gradients

    def apply_gradients(grads_and_vars, *a, **kw):
        app = original(grads_and_vars, *a, **kw)
        with tf.name_scope('clip'):
            # clip = [v.assign_add(tf.maximum(high-v, 0)+tf.minimum(low-v, 0)) for g, v in grads_and_vars]
            clip = [
                v.assign(tf.clip_by_value(v, low, high))
                for g, v in grads_and_vars
            ]

        step = after(app, clip, name='step')
        return step

    opt.apply_gradients = apply_gradients
    return opt
Example #15
def train_op_with_clip_and_noise(
        optimizer: tf.train.Optimizer,
        grads_and_vars: _GRAD_AND_VARS_TYPE,
        global_step: Optional[tf.Tensor] = None,
        gradient_clip: Optional[float] = None,
        gradient_noise_std: Optional[float] = None,
        gradient_l2_norm: Optional[tf.Tensor] = None) -> tf.Operation:
    """
    Produce train op for gradients and variables with
    gradient clip and adding of gradient noise if they were provided inside of
    optim config

    Parameters
    ----------
    optimizer
        optimizer to use
    grads_and_vars
        list of (gradient, variable)
    global_step
        global step to use in the optimizer; Caution: provide global_step only
        once, if you execute this method multiple times in one session
    gradient_clip
        gradient clip value
    gradient_noise_std
        standard deviation of the noise to add to gradients
    gradient_l2_norm
        gradient l2 norm used for the gradient clipping

    Returns
    -------
    train_op
        training operation, which can be used inside of session run
    """
    if gradient_clip is not None:
        grads_and_vars = clip_grads_and_vars(grads_and_vars, gradient_clip,
                                             gradient_l2_norm)
    if gradient_noise_std is not None:
        grads_and_vars = add_noise_to_grads_and_vars(grads_and_vars,
                                                     gradient_noise_std)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)
    return train_op
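clip_grads_and_vars and add_noise_to_grads_and_vars are project helpers not shown here. Plausible sketches (assumptions: global-norm clipping that can reuse a precomputed norm, and element-wise Gaussian noise):

def clip_grads_and_vars(grads_and_vars, gradient_clip, gradient_l2_norm=None):
    grads, variables = zip(*grads_and_vars)
    clipped, _ = tf.clip_by_global_norm(list(grads), gradient_clip,
                                        use_norm=gradient_l2_norm)
    return list(zip(clipped, variables))

def add_noise_to_grads_and_vars(grads_and_vars, gradient_noise_std):
    return [(None if g is None
             else g + tf.random_normal(tf.shape(g), stddev=gradient_noise_std), v)
            for g, v in grads_and_vars]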
Example #16
def train_step(model: tf.keras.Model, optimizer: tf.train.Optimizer,
               loss_func: loss, inputs: tf.Tensor, labels: tf.Tensor, **kwargs):
    """Kicks off training for a given model.

    Args:
        model (tf.keras.Model):
        optimizer (tf.train.Optimizer):
        loss_func (loss): Loss function.
        inputs (tf.Tensor): Dataset's input features.
        labels (tf.Tensor): Dataset true labels.

    Keyword Args:
            sparse (bool): False if labels are not one-hot encoded.

    Returns:
        An Operation that updates the variables in `var_list`.  If `global_step`
              was not `None`, that operation also increments `global_step`.
    """

    return optimizer.minimize(loss=lambda: loss_func(model, inputs, labels, **kwargs),
                              global_step=tf.train.get_or_create_global_step())
Example #17
    def __init__(
        self,
        obs_dim: int,
        latent_dim: int,
        encoder: Callable[[tf.Tensor, int], tf.Tensor],
        decoder: Callable[[tf.Tensor], tf.Tensor],
        decoder_loss: Callable[[tf.Tensor, tf.Tensor], tf.Tensor],
        optimiser: tf.train.Optimizer,
        seed: int,
    ) -> None:
        super().__init__(obs_dim, latent_dim)

        tf.set_random_seed(seed)

        obs_pl = tf.placeholder(tf.float32, [None, obs_dim])
        latent_pl = tf.placeholder(tf.float32, [None, latent_dim])
        latent_dist_params = encoder(obs_pl, latent_dim)
        latent = self._build_sampled_latent(latent_dist_params)
        generated_dist_params = decoder(latent)
        _fixed_prior = tf.random_normal([1, latent_dim])
        _generated_dist_params = decoder(latent_pl)

        loss = self._kl_divergence(latent_dist_params)
        loss += decoder_loss(obs_pl, generated_dist_params)
        loss *= np.log2(np.e)  # convert the loss from nats to bits
        train_op = optimiser.minimize(loss)

        session = tf.Session()
        self._prior_fn = session.make_callable(_fixed_prior)
        self._posterior_fn = session.make_callable(latent_dist_params,
                                                   [obs_pl])
        self._generator_fn = session.make_callable(_generated_dist_params,
                                                   [latent_pl])
        self._loss_fn = session.make_callable(loss, [obs_pl])
        self._train_fn = session.make_callable(train_op, [obs_pl])
        session.run(tf.global_variables_initializer())
        self.session = session
        self.saver = tf.train.Saver()
Example #18
    def __init__(
        self,
        obs_spec: dm_env.specs.Array,
        action_spec: dm_env.specs.BoundedArray,
        ensemble: Sequence[snt.AbstractModule],
        target_ensemble: Sequence[snt.AbstractModule],
        batch_size: int,
        agent_discount: float,
        replay_capacity: int,
        min_replay_size: int,
        sgd_period: int,
        target_update_period: int,
        optimizer: tf.train.Optimizer,
        mask_prob: float,
        noise_scale: float,
        epsilon_fn: Callable[[int], float] = lambda _: 0.,
        seed: int = None,
    ):
        """Bootstrapped DQN with additive prior functions."""
        # DQN configuration.
        self._ensemble = ensemble
        self._target_ensemble = target_ensemble
        self._num_actions = action_spec.maximum - action_spec.minimum + 1
        self._batch_size = batch_size
        self._sgd_period = sgd_period
        self._target_update_period = target_update_period
        self._min_replay_size = min_replay_size
        self._epsilon_fn = epsilon_fn
        self._replay = replay.Replay(capacity=replay_capacity)
        self._mask_prob = mask_prob
        self._noise_scale = noise_scale
        self._rng = np.random.RandomState(seed)
        tf.set_random_seed(seed)

        self._total_steps = 0
        self._total_episodes = 0
        self._active_head = 0
        self._num_ensemble = len(ensemble)
        assert len(ensemble) == len(target_ensemble)

        # Making the tensorflow graph
        session = tf.Session()

        # Placeholders = (obs, action, reward, discount, next_obs, mask, noise)
        o_tm1 = tf.placeholder(shape=(None, ) + obs_spec.shape,
                               dtype=obs_spec.dtype)
        a_tm1 = tf.placeholder(shape=(None, ), dtype=action_spec.dtype)
        r_t = tf.placeholder(shape=(None, ), dtype=tf.float32)
        d_t = tf.placeholder(shape=(None, ), dtype=tf.float32)
        o_t = tf.placeholder(shape=(None, ) + obs_spec.shape,
                             dtype=obs_spec.dtype)
        m_t = tf.placeholder(shape=(None, self._num_ensemble),
                             dtype=tf.float32)
        z_t = tf.placeholder(shape=(None, self._num_ensemble),
                             dtype=tf.float32)

        losses = []
        value_fns = []
        target_updates = []
        for k in range(self._num_ensemble):
            model = self._ensemble[k]
            target_model = self._target_ensemble[k]
            q_values = model(o_tm1)

            train_value = batched_index(q_values, a_tm1)
            target_value = tf.stop_gradient(
                tf.reduce_max(target_model(o_t), axis=-1))
            target_y = r_t + z_t[:, k] + agent_discount * d_t * target_value
            loss = tf.square(train_value - target_y) * m_t[:, k]

            value_fn = session.make_callable(q_values, [o_tm1])
            target_update = update_target_variables(
                target_variables=target_model.get_all_variables(),
                source_variables=model.get_all_variables(),
            )

            losses.append(loss)
            value_fns.append(value_fn)
            target_updates.append(target_update)

        # Minimizing the stacked (non-scalar) loss is equivalent to minimizing
        # its sum: tf.gradients sums contributions over all elements.
        sgd_op = optimizer.minimize(tf.stack(losses))
        self._value_fns = value_fns
        self._sgd_step = session.make_callable(
            sgd_op, [o_tm1, a_tm1, r_t, d_t, o_t, m_t, z_t])
        self._update_target_nets = session.make_callable(target_updates)
        session.run(tf.global_variables_initializer())
Example #19
  def __init__(
      self,
      obs_spec: specs.Array,
      action_spec: specs.DiscreteArray,
      network: snt.RNNCore,
      optimizer: tf.train.Optimizer,
      sequence_length: int,
      td_lambda: float,
      agent_discount: float,
      seed: int,
  ):
    """A recurrent actor-critic agent."""
    del action_spec  # unused
    tf.set_random_seed(seed)
    self._sequence_length = sequence_length
    self._num_transitions_in_buffer = 0

    # Create the policy ops.
    obs = tf.placeholder(shape=(1,) + obs_spec.shape, dtype=obs_spec.dtype)
    mask = tf.placeholder(shape=(1,), dtype=tf.float32)
    state = self._placeholders_like(network.initial_state(batch_size=1))
    (online_logits, _), next_state = network((obs, mask), state)
    action = tf.squeeze(tf.multinomial(online_logits, 1, output_dtype=tf.int32))

    # Create placeholders and numpy arrays for learning from trajectories.
    shapes = [obs_spec.shape, (), (), (), ()]
    dtypes = [obs_spec.dtype, np.int32, np.float32, np.float32, np.float32]

    placeholders = [
        tf.placeholder(shape=(self._sequence_length, 1) + shape, dtype=dtype)
        for shape, dtype in zip(shapes, dtypes)]
    observations, actions, rewards, discounts, masks = placeholders

    # Build actor and critic losses.
    (logits, values), final_state = tf.nn.dynamic_rnn(
        network, (observations, tf.expand_dims(masks, -1)),
        initial_state=state, dtype=tf.float32, time_major=True)
    (_, bootstrap_value), _ = network((obs, mask), final_state)
    values, bootstrap_value = tree.map_structure(
        lambda t: tf.squeeze(t, axis=-1), (values, bootstrap_value))
    critic_loss, (advantages, _) = td_lambda_loss(
        state_values=values,
        rewards=rewards,
        pcontinues=agent_discount * discounts,
        bootstrap_value=bootstrap_value,
        lambda_=td_lambda)
    actor_loss = discrete_policy_gradient_loss(logits, actions, advantages)

    # Updates.
    grads_and_vars = optimizer.compute_gradients(actor_loss + critic_loss)
    grads, _ = tf.clip_by_global_norm([g for g, _ in grads_and_vars], 5.)
    grads_and_vars = [(g, pair[1]) for g, pair in zip(grads, grads_and_vars)]
    train_op = optimizer.apply_gradients(grads_and_vars)

    # Create TF session and callables.
    session = tf.Session()
    self._reset_fn = session.make_callable(
        network.initial_state(batch_size=1))
    self._policy_fn = session.make_callable(
        [action, next_state], [obs, mask, state])
    self._update_fn = session.make_callable(
        [train_op, final_state], placeholders + [obs, mask, state])
    session.run(tf.global_variables_initializer())

    # Initialize numpy buffers
    self.state = self._reset_fn()
    self.update_init_state = self._reset_fn()
    self.arrays = [
        np.zeros(shape=(self._sequence_length, 1) + shape, dtype=dtype)
        for shape, dtype in zip(shapes, dtypes)]
Example #20
    def compute_adam_gradients(self, adam: tf.train.Optimizer, loss,
                               variables):
        from tensorflow.python.training.optimizer import Optimizer
        from tensorflow.python.eager import context
        from tensorflow.python.framework import dtypes
        from tensorflow.python.framework import ops
        from tensorflow.python.ops import control_flow_ops
        from tensorflow.python.ops import variable_scope
        from tensorflow.python.training import distribute as distribute_lib
        from tensorflow.python.training import distribution_strategy_context
        from tensorflow.python.util import nest

        def compute_gradients(optimizer,
                              loss,
                              var_list=None,
                              gate_gradients=Optimizer.GATE_OP,
                              aggregation_method=None,
                              colocate_gradients_with_ops=False,
                              grad_loss=None):
            if callable(loss):
                from tensorflow.python.eager import backprop
                with backprop.GradientTape() as tape:
                    if var_list is not None:
                        tape.watch(var_list)
                    loss_value = loss()

                    # Scale loss if using a "mean" loss reduction and multiple towers.
                    # Have to be careful to call distribute_lib.get_loss_reduction()
                    # *after* loss() is evaluated, so we know what loss reduction it uses.
                    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
                    if (distribute_lib.get_loss_reduction() ==
                            variable_scope.VariableAggregation.MEAN):
                        num_towers = distribution_strategy_context.get_distribution_strategy(
                        ).num_towers
                        if num_towers > 1:
                            loss_value *= (1. / num_towers)

                if var_list is None:
                    var_list = tape.watched_variables()
                # TODO(jhseu): Figure out why GradientTape's gradients don't require loss
                # to be executed.
                with ops.control_dependencies([loss_value]):
                    grads = tape.gradient(loss_value, var_list, grad_loss)
                return list(zip(grads, var_list))

            # Non-callable/Tensor loss case
            if context.executing_eagerly():
                raise RuntimeError(
                    "`loss` passed to Optimizer.compute_gradients should "
                    "be a function when eager execution is enabled.")

            # Scale loss if using a "mean" loss reduction and multiple towers.
            if (distribute_lib.get_loss_reduction() ==
                    variable_scope.VariableAggregation.MEAN):
                num_towers = distribution_strategy_context.get_distribution_strategy(
                ).num_towers
                if num_towers > 1:
                    loss *= (1. / num_towers)

            if gate_gradients not in [
                    Optimizer.GATE_NONE, Optimizer.GATE_OP,
                    Optimizer.GATE_GRAPH
            ]:
                raise ValueError(
                    "gate_gradients must be one of: Optimizer.GATE_NONE, "
                    "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
                    gate_gradients)
            optimizer._assert_valid_dtypes([loss])
            if grad_loss is not None:
                optimizer._assert_valid_dtypes([grad_loss])
            if var_list is None:
                var_list = (variables.trainable_variables() +
                            ops.get_collection(
                                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
            else:
                var_list = nest.flatten(var_list)
            # pylint: disable=protected-access
            var_list += ops.get_collection(
                ops.GraphKeys._STREAMING_MODEL_PORTS)
            # pylint: enable=protected-access
            from tensorflow.python.training.optimizer import _get_processor
            processors = [_get_processor(v) for v in var_list]
            if not var_list:
                raise ValueError("No variables to optimize.")
            var_refs = [p.target() for p in processors]
            # original gradients computation
            # grads = tf.gradients(
            #     loss, var_refs, grad_ys=grad_loss,
            #     gate_gradients=(gate_gradients == Optimizer.GATE_OP),
            #     aggregation_method=aggregation_method,
            #     colocate_gradients_with_ops=colocate_gradients_with_ops)
            # using gradient check-pointing
            from memory_saving_gradients import gradients
            # setting outputs of different networks
            tensors_to_checkpoint = self.get_tensors_to_checkpoint()

            # just specifying memory as parameter fails
            grads = gradients(
                loss,
                var_refs,
                grad_ys=grad_loss,
                gate_gradients=(gate_gradients == Optimizer.GATE_OP),
                aggregation_method=aggregation_method,
                colocate_gradients_with_ops=colocate_gradients_with_ops,
                checkpoints='speed')

            if gate_gradients == Optimizer.GATE_GRAPH:
                grads = control_flow_ops.tuple(grads)
            grads_and_vars = list(zip(grads, var_list))
            optimizer._assert_valid_dtypes([
                v for g, v in grads_and_vars
                if g is not None and v.dtype != dtypes.resource
            ])
            return grads_and_vars

        # just copied so I can change gradients
        # computed_gradients = compute_gradients(adam, loss, var_list=variables)

        computed_gradients = adam.compute_gradients(
            loss, var_list=variables)  # original gradient
        return computed_gradients