def _create_network(self, model):
    observs = self._preprocess.above_task.observs
    actions = self._preprocess.above_task.actions
    # Perception.
    state = model.add_input('state', observs.shape)
    hidden = getattr(networks, self.config.network)(model, state)
    value = model.add_output(
        'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
    policy = dense(hidden, actions.n, tf.nn.softmax)
    model.add_output(
        'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
    # Objectives.
    action = model.add_input('action', type_=tf.int32)
    action = tf.one_hot(action, actions.n)
    return_ = model.add_input('return_')
    logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
    entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
    advantage = tf.stop_gradient(return_ - value)
    # Policy gradient term with entropy bonus; squared error on the value.
    actor = advantage * logprob + self.config.regularize * entropy
    critic = self.config.scale_critic_loss * (return_ - value) ** 2 / 2
    # Training.
    learning_rate = model.add_option(
        'learning_rate', float(self.config.initial_learning_rate))
    model.set_optimizer(self.config.optimizer(
        learning_rate, self.config.rms_decay, use_locking=True))
    model.add_cost('cost', critic - actor)
def _create_network(self, model):
    observs = self._preprocess.above_task.observs.shape
    actions = self._preprocess.above_task.actions.shape[0]
    # Perception.
    state = model.add_input('state', observs)
    hidden = getattr(networks, self.config.network)(model, state)
    values = dense(hidden, actions, tf.identity)
    values = model.add_output('values', values)
    # Training.
    action = model.add_input('action', type_=tf.int32)
    action = tf.one_hot(action, actions)
    target = model.add_input('target')
    model.add_output('value', tf.reduce_max(values, 1))
    # Optimization.
    learning_rate = model.add_option(
        'learning_rate', self.config.initial_learning_rate)
    model.set_optimizer(self.config.optimizer(
        learning_rate=learning_rate,
        decay=self.config.rms_decay,
        epsilon=self.config.rms_epsilon))
    # Squared error between the chosen action's value and the target.
    model.add_cost(
        'cost', (tf.reduce_sum(action * values, 1) - target) ** 2)