Code Example #1
    def _build_solvers(self, json_data):
        # Read the optimizer hyperparameters from the JSON config, falling back to defaults.
        actor_stepsize = json_data.get(self.ACTOR_STEPSIZE_KEY, 0.001)
        actor_momentum = json_data.get(self.ACTOR_MOMENTUM_KEY, 0.9)
        critic_stepsize = json_data.get(self.CRITIC_STEPSIZE_KEY, 0.01)
        critic_momentum = json_data.get(self.CRITIC_MOMENTUM_KEY, 0.9)

        # Critic: momentum optimizer over the critic variables, with gradients
        # applied through an MPI-synchronized solver.
        critic_vars = self._tf_vars('main/critic')
        critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize,
                                                momentum=critic_momentum)
        self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
        self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)

        # The actor stepsize lives in a non-trainable variable so it can be
        # adjusted at run time through the assign op below.
        self._actor_stepsize_tf = tf.get_variable(dtype=tf.float32,
                                                  name='actor_stepsize',
                                                  initializer=actor_stepsize,
                                                  trainable=False)
        self._actor_stepsize_ph = tf.get_variable(dtype=tf.float32,
                                                  name='actor_stepsize_ph',
                                                  shape=[])
        self._actor_stepsize_update_op = self._actor_stepsize_tf.assign(
            self._actor_stepsize_ph)

        # Actor: same setup as the critic, but driven by the adjustable stepsize variable.
        actor_vars = self._tf_vars('main/actor')
        actor_opt = tf.train.MomentumOptimizer(
            learning_rate=self._actor_stepsize_tf, momentum=actor_momentum)
        self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
        self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)

        return
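A minimal sketch of how these pieces might be driven during a training update is shown below. It assumes the agent instance exposes the session, solvers, and gradient tensors built in _build_solvers, and that MPISolver applies a precomputed gradient list through an update method; the function name and feed dict are illustrative, not part of the original project.

def _sketch_update_step(agent, feed, new_actor_stepsize):
    # Adjust the actor learning rate by assigning through the staging variable.
    agent.sess.run(agent._actor_stepsize_update_op,
                   feed_dict={agent._actor_stepsize_ph: new_actor_stepsize})

    # Evaluate the gradient tensors and hand the results to the MPI solvers
    # (assumed here to expose an update(grads) entry point).
    critic_grads = agent.sess.run(agent.critic_grad_tf, feed_dict=feed)
    agent.critic_solver.update(critic_grads)

    actor_grads = agent.sess.run(agent.actor_grad_tf, feed_dict=feed)
    agent.actor_solver.update(actor_grads)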
Code Example #2
    def _build_resource_tf(self):
        # Variables holding the normalizer's running statistics (count, mean, std).
        self.count_tf = tf.get_variable(dtype=tf.int32,
                                        name='count',
                                        initializer=np.array([self.count],
                                                             dtype=np.int32),
                                        trainable=False)
        self.mean_tf = tf.get_variable(dtype=tf.float32,
                                       name='mean',
                                       initializer=self.mean.astype(
                                           np.float32),
                                       trainable=False)
        self.std_tf = tf.get_variable(dtype=tf.float32,
                                      name='std',
                                      initializer=self.std.astype(np.float32),
                                      trainable=False)

        # '_ph' variables act as feed targets for staging new statistic values.
        self.count_ph = tf.get_variable(dtype=tf.int32,
                                        name='count_ph',
                                        shape=[1])
        self.mean_ph = tf.get_variable(dtype=tf.float32,
                                       name='mean_ph',
                                       shape=self.mean.shape)
        self.std_ph = tf.get_variable(dtype=tf.float32,
                                      name='std_ph',
                                      shape=self.std.shape)

        # Grouped assign that copies all three statistics in a single session run.
        self._update_op = tf.group(self.count_tf.assign(self.count_ph),
                                   self.mean_tf.assign(self.mean_ph),
                                   self.std_tf.assign(self.std_ph))
        return
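A minimal sketch of refreshing these resources from the Python-side statistics, assuming the surrounding normalizer object keeps count, mean, and std attributes plus a sess session handle; the function name is illustrative and numpy is assumed to be imported as np, as in the snippet above.

def _sketch_sync_resource_tf(normalizer):
    feed = {
        normalizer.count_ph: np.array([normalizer.count], dtype=np.int32),
        normalizer.mean_ph: normalizer.mean.astype(np.float32),
        normalizer.std_ph: normalizer.std.astype(np.float32),
    }
    # The grouped assign pushes all three statistics into the TF variables at once.
    normalizer.sess.run(normalizer._update_op, feed_dict=feed)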
Code Example #3
def recurrent_gaussian(config, action_size, observations, length, state=None):
  """Independent recurrent policy and feed forward value networks.

  The policy network outputs the mean action, and the log standard deviation
  is learned as an independent parameter vector. The last policy layer is
  recurrent and uses a GRU cell.

  Args:
    config: Configuration object.
    action_size: Length of the action vector.
    observations: Sequences of observations.
    length: Batch of sequence lengths.
    state: Batch of initial recurrent states.

  Returns:
    NetworkOutput tuple.
  """
  mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer(
      factor=config.init_mean_factor)
  logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10)
  cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])
  # Collapse all per-step observation dimensions into a single feature vector.
  flat_observations = tf.reshape(observations, [
      tf.shape(observations)[0],
      tf.shape(observations)[1],
      functools.reduce(operator.mul,
                       observations.shape.as_list()[2:], 1)
  ])
  with tf.variable_scope('policy'):
    x = flat_observations
    for size in config.policy_layers[:-1]:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    x, state = tf.nn.dynamic_rnn(cell, x, length, state, tf.float32)
    mean = tf.contrib.layers.fully_connected(x,
                                             action_size,
                                             tf.tanh,
                                             weights_initializer=mean_weights_initializer)
    logstd = tf.get_variable('logstd', mean.shape[2:], tf.float32, logstd_initializer)
    # Broadcast the learned logstd across the batch and time dimensions.
    logstd = tf.tile(logstd[None, None],
                     [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2))
  with tf.variable_scope('value'):
    x = flat_observations
    for size in config.value_layers:
      x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
    value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0]
  mean = tf.check_numerics(mean, 'mean')
  logstd = tf.check_numerics(logstd, 'logstd')
  value = tf.check_numerics(value, 'value')
  policy = tf.contrib.distributions.MultivariateNormalDiag(mean, tf.exp(logstd))
  # assert state.shape.as_list()[0] is not None
  return NetworkOutput(policy, mean, logstd, value, state)
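A hedged sketch of wiring the factory into a graph. It assumes the module-level imports behind the snippet (functools, operator, TensorFlow 1.x with tf.contrib, and the NetworkOutput tuple) are in place; the HParams values, placeholder shapes, and action size below are illustrative, not taken from the original project.

config = tf.contrib.training.HParams(init_mean_factor=0.1,
                                     init_logstd=-1.0,
                                     policy_layers=[200, 100],
                                     value_layers=[200, 100])
observations = tf.placeholder(tf.float32, [None, None, 24])  # batch x time x features
length = tf.placeholder(tf.int32, [None])                    # valid steps per sequence
output = recurrent_gaussian(config, action_size=6,
                            observations=observations, length=length)
policy, mean, logstd, value, state = output
sampled_action = policy.sample()  # MultivariateNormalDiag over the action vector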
Code Example #4
File: networks.py  Project: Gs-001/quad
 def __call__(self, observation, state):
   with tf.variable_scope('policy'):
     x = tf.contrib.layers.flatten(observation)
     mean = tf.contrib.layers.fully_connected(x,
                                              self._action_size,
                                              tf.tanh,
                                              weights_initializer=self._mean_weights_initializer)
     # logstd is a learned parameter vector, tiled across the batch dimension.
     logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
     logstd = tf.tile(logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
   with tf.variable_scope('value'):
     x = tf.contrib.layers.flatten(observation)
     for size in self._value_layers:
       x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
     value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
   return (mean, logstd, value), state
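A brief usage sketch, assuming policy_network is an instance of the class this method belongs to and that this feed-forward variant simply threads state through unchanged, as the body above suggests; the observation shape and the sampling line are illustrative.

observation = tf.placeholder(tf.float32, [None, 24])  # batch of flat observations
state = ()                                            # no recurrent state is consumed here
(mean, logstd, value), state = policy_network(observation, state)
action = mean + tf.exp(logstd) * tf.random_normal(tf.shape(mean))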