Ejemplo n.º 1
0
 def _create_network(self, model):
     """Build the actor-critic network.

     Adds a shared perception stack, a scalar value head and a softmax
     policy head to `model`, then wires the combined policy-gradient /
     value-regression cost and the optimizer.
     """
     observs = self._preprocess.above_task.observs
     actions = self._preprocess.above_task.actions
     # Perception.
     state = model.add_input('state', observs.shape)
     hidden = getattr(networks, self.config.network)(model, state)
     value = model.add_output(
         'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
     # Bug fix: the policy head must branch off the shared hidden layer.
     # The original fed the rank-1 squeezed `value` tensor into dense(),
     # which is shape-invalid for a dense layer and semantically wrong.
     policy = dense(hidden, actions.n, tf.nn.softmax)
     # Sample an action from the categorical policy (tf.multinomial takes
     # unnormalized log probabilities).
     model.add_output('choice',
                      tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
     # Objectives.
     action = model.add_input('action', type_=tf.int32)
     action = tf.one_hot(action, actions.n)
     return_ = model.add_input('return_')
     # The 1e-13 epsilon guards log(0) for near-deterministic policies.
     logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
     entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
     # Stop gradients so the advantage only scales the actor objective and
     # does not backpropagate into the value head.
     advantage = tf.stop_gradient(return_ - value)
     actor = advantage * logprob + self.config.regularize * entropy
     critic = self.config.scale_critic_loss * (return_ - value)**2 / 2
     # Training.
     learning_rate = model.add_option(
         'learning_rate', float(self.config.initial_learning_rate))
     model.set_optimizer(
         self.config.optimizer(learning_rate,
                               self.config.rms_decay,
                               use_locking=True))
     model.add_cost('cost', critic - actor)
Ejemplo n.º 2
0
 def _create_network(self, model):
     """Build the Q-network.

     Adds a perception stack and a per-action value head to `model`, then
     wires the squared temporal-difference error cost and the RMSProp-style
     optimizer.
     """
     observs = self._preprocess.above_task.observs.shape
     actions = self._preprocess.above_task.actions.shape[0]
     # Perception.  (typo fix: was 'Percetion')
     state = model.add_input('state', observs)
     hidden = getattr(networks, self.config.network)(model, state)
     values = dense(hidden, actions, tf.identity)
     values = model.add_output('values', values)
     # Training.
     action = model.add_input('action', type_=tf.int32)
     action = tf.one_hot(action, actions)
     target = model.add_input('target')
     # Greedy state value: max over the per-action values.  Presumably used
     # for bootstrapped targets — verify against the caller.
     model.add_output('value', tf.reduce_max(values, 1))
     # Optimization.  (typo fix: was 'Opimization')
     # float() cast keeps this consistent with the sibling actor-critic
     # network and tolerates string-typed config values.
     learning_rate = model.add_option(
         'learning_rate', float(self.config.initial_learning_rate))
     model.set_optimizer(
         self.config.optimizer(learning_rate=learning_rate,
                               decay=self.config.rms_decay,
                               epsilon=self.config.rms_epsilon))
     # Squared TD error, computed on the chosen action's value only.
     model.add_cost('cost', (tf.reduce_sum(action * values, 1) - target)**2)