def _setup_actor_optimizer(self, scope):
    """Create the actor loss and the operation that minimizes it.

    The loss is stored in ``self.actor_loss`` and the minimization
    operation in ``self.actor_optimizer``.

    Parameters
    ----------
    scope : str or None
        an outer scope term; when not None it is prepended to 'model/pi/'
    """
    debug = self.verbose >= 2
    if debug:
        print('setting up actor optimizer')

    scope_name = 'model/pi/' if scope is None else scope + '/' + 'model/pi/'

    if debug:
        # Report the shape of every actor variable and the total number of
        # scalar parameters they hold.
        actor_shapes = []
        for var in get_trainable_vars(scope_name):
            actor_shapes.append(var.get_shape().as_list())
        actor_nb_params = 0
        for shape in actor_shapes:
            actor_nb_params += reduce(lambda x, y: x * y, shape)
        print(' actor shapes: {}'.format(actor_shapes))
        print(' actor params: {}'.format(actor_nb_params))

    # The loss is the negated mean of the first critic-with-actor output.
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0])

    # Only the variables under the actor scope are updated.
    adam = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
    self.actor_optimizer = adam.minimize(
        self.actor_loss,
        var_list=get_trainable_vars(scope_name))
def _setup_actor_update(self, all_obs_ph, combined_actors, scope):
    """Build the actor loss and the operation that optimizes it.

    Parameters
    ----------
    all_obs_ph : tf.compat.v1.placeholder
        the placeholder for the full-state observation
    combined_actors : tf.Variable
        the output from all actors, as a function of the agent's policy
        parameters
    scope : str
        an outer scope term

    Returns
    -------
    tf.Operation
        the operation that returns the loss of the actor
    tf.Operation
        the operation that updates the trainable parameters of the actor
    """
    debug = self.verbose >= 2
    if debug:
        print('setting up actor optimizer')

    pi_scope = 'model/pi/'
    if scope is not None:
        pi_scope = scope + '/' + pi_scope

    if debug:
        actor_shapes = []
        for var in get_trainable_vars(pi_scope):
            actor_shapes.append(var.get_shape().as_list())
        actor_nb_params = 0
        for shape in actor_shapes:
            actor_nb_params += reduce(lambda x, y: x * y, shape)
        print(' actor shapes: {}'.format(actor_shapes))
        print(' actor params: {}'.format(actor_nb_params))

    # Rebuild both centralized critics (reusing their variables) on top of
    # the actors' outputs, so the loss is differentiable w.r.t. the actor.
    critic_with_actor_tf = []
    with tf.compat.v1.variable_scope("model", reuse=False):
        for i in range(2):
            critic_with_actor_tf.append(
                self.make_critic(
                    all_obs_ph,
                    combined_actors,
                    scope="centralized_qf_{}".format(i),
                    reuse=True))

    # Negated mean of the first critic, plus the regularization penalty.
    q_loss = -tf.reduce_mean(critic_with_actor_tf[0])
    actor_loss = q_loss + self._l2_loss(self.l2_penalty, pi_scope)

    # Only the variables under the actor scope are updated.
    adam = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
    actor_optimizer = adam.minimize(
        loss=actor_loss,
        var_list=get_trainable_vars(pi_scope))

    return actor_loss, actor_optimizer
def test_init_conv(self):
    """Check the functionality of the __init__() method with conv policies.

    This method tests that the proper structure graph was generated.
    """
    policy_params = self.policy_params.copy()
    # `.copy()` is shallow, so the nested "model_params" dict is shared with
    # self.policy_params (and with whatever it was built from). Replace it
    # with a modified copy instead of mutating it in place, so the "conv"
    # setting cannot leak into other tests.
    policy_params["model_params"] = dict(
        policy_params["model_params"], model_type="conv")
    _ = TD3FeedForwardPolicy(**policy_params)

    # test case 1
    self.assertListEqual(
        sorted([var.name for var in get_trainable_vars()]),
        [
            'model/pi/conv0/bias:0', 'model/pi/conv0/kernel:0',
            'model/pi/conv1/bias:0', 'model/pi/conv1/kernel:0',
            'model/pi/conv2/bias:0', 'model/pi/conv2/kernel:0',
            'model/pi/fc0/bias:0', 'model/pi/fc0/kernel:0',
            'model/pi/fc1/bias:0', 'model/pi/fc1/kernel:0',
            'model/pi/output/bias:0', 'model/pi/output/kernel:0',
            'model/qf_0/conv0/bias:0', 'model/qf_0/conv0/kernel:0',
            'model/qf_0/conv1/bias:0', 'model/qf_0/conv1/kernel:0',
            'model/qf_0/conv2/bias:0', 'model/qf_0/conv2/kernel:0',
            'model/qf_0/fc0/bias:0', 'model/qf_0/fc0/kernel:0',
            'model/qf_0/fc1/bias:0', 'model/qf_0/fc1/kernel:0',
            'model/qf_0/qf_output/bias:0', 'model/qf_0/qf_output/kernel:0',
            'model/qf_1/conv0/bias:0', 'model/qf_1/conv0/kernel:0',
            'model/qf_1/conv1/bias:0', 'model/qf_1/conv1/kernel:0',
            'model/qf_1/conv2/bias:0', 'model/qf_1/conv2/kernel:0',
            'model/qf_1/fc0/bias:0', 'model/qf_1/fc0/kernel:0',
            'model/qf_1/fc1/bias:0', 'model/qf_1/fc1/kernel:0',
            'model/qf_1/qf_output/bias:0', 'model/qf_1/qf_output/kernel:0',
            'target/pi/conv0/bias:0', 'target/pi/conv0/kernel:0',
            'target/pi/conv1/bias:0', 'target/pi/conv1/kernel:0',
            'target/pi/conv2/bias:0', 'target/pi/conv2/kernel:0',
            'target/pi/fc0/bias:0', 'target/pi/fc0/kernel:0',
            'target/pi/fc1/bias:0', 'target/pi/fc1/kernel:0',
            'target/pi/output/bias:0', 'target/pi/output/kernel:0',
            'target/qf_0/conv0/bias:0', 'target/qf_0/conv0/kernel:0',
            'target/qf_0/conv1/bias:0', 'target/qf_0/conv1/kernel:0',
            'target/qf_0/conv2/bias:0', 'target/qf_0/conv2/kernel:0',
            'target/qf_0/fc0/bias:0', 'target/qf_0/fc0/kernel:0',
            'target/qf_0/fc1/bias:0', 'target/qf_0/fc1/kernel:0',
            'target/qf_0/qf_output/bias:0',
            'target/qf_0/qf_output/kernel:0',
            'target/qf_1/conv0/bias:0', 'target/qf_1/conv0/kernel:0',
            'target/qf_1/conv1/bias:0', 'target/qf_1/conv1/kernel:0',
            'target/qf_1/conv2/bias:0', 'target/qf_1/conv2/kernel:0',
            'target/qf_1/fc0/bias:0', 'target/qf_1/fc0/kernel:0',
            'target/qf_1/fc1/bias:0', 'target/qf_1/fc1/kernel:0',
            'target/qf_1/qf_output/bias:0',
            'target/qf_1/qf_output/kernel:0',
        ])
def _setup_actor_optimizer(self, scope):
    """Create the minimization operations for the policy and the entropy.

    Two `minimize` operations are created: one for the entropy temperature
    (stored in ``self.alpha_optimizer``, with its loss in
    ``self.alpha_loss``) and one for the policy (stored in
    ``self.actor_optimizer``, with its loss in ``self.actor_loss``). See
    Section 4.2 in [1] for further information on the policy update, and
    Section 5 in [1] for further information on the entropy update.

    Parameters
    ----------
    scope : str or None
        an outer scope term; when not None it is prepended to 'model/pi/'
    """
    debug = self.verbose >= 2
    if debug:
        print('setting up actor and alpha optimizers')

    scope_name = 'model/pi/' if scope is None else scope + '/' + 'model/pi/'

    if debug:
        actor_shapes = []
        for var in get_trainable_vars(scope_name):
            actor_shapes.append(var.get_shape().as_list())
        actor_nb_params = 0
        for shape in actor_shapes:
            actor_nb_params += reduce(lambda x, y: x * y, shape)
        print(' actor shapes: {}'.format(actor_shapes))
        print(' actor params: {}'.format(actor_nb_params))

    # Element-wise minimum of the two Q-values (Double-Q Learning).
    min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

    # Entropy temperature loss. The gradient is stopped so that this loss
    # only reaches log_alpha, which is also the only variable optimized.
    self.alpha_loss = -tf.reduce_mean(
        self.log_alpha
        * tf.stop_gradient(self.logp_pi + self.target_entropy))
    alpha_adam = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
    self.alpha_optimizer = alpha_adam.minimize(
        self.alpha_loss,
        var_list=self.log_alpha)

    # Policy loss.
    self.actor_loss = tf.reduce_mean(self.alpha * self.logp_pi - min_qf_pi)

    # The policy train op has to be separate from the value train op,
    # because min_qf_pi appears in the policy loss.
    actor_adam = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
    self.actor_optimizer = actor_adam.minimize(
        self.actor_loss,
        var_list=get_trainable_vars(scope_name))
def _setup_critic_optimizer(self, critic_target, scope):
    """Create the critic losses and one optimizer operation per critic.

    The losses are stored in ``self.critic_loss`` and the minimization
    operations in ``self.critic_optimizer``; both are lists with one entry
    per element of ``self.critic_tf``.

    Parameters
    ----------
    critic_target : list
        the target critic outputs; the element-wise minimum of the first
        two entries is used to form the regression target
    scope : str or None
        an outer scope term; when not None it is prepended to each
        'model/qf_{i}/' scope
    """
    debug = self.verbose >= 2
    if debug:
        print('setting up critic optimizer')

    # Regression target: reward plus the discounted minimum target value,
    # masked out on terminal transitions. No gradient flows through it.
    with tf.compat.v1.variable_scope("loss", reuse=False):
        q_obs1 = tf.minimum(critic_target[0], critic_target[1])
        target_q = tf.stop_gradient(
            self.rew_ph + (1. - self.terminals1) * self.gamma * q_obs1)

        tf.compat.v1.summary.scalar(
            'critic_target', tf.reduce_mean(target_q))

    # Choose the loss function.
    loss_fn = (
        tf.compat.v1.losses.huber_loss if self.use_huber
        else tf.compat.v1.losses.mean_squared_error)

    losses = [loss_fn(q, target_q) for q in self.critic_tf]
    self.critic_loss = losses

    self.critic_optimizer = []
    for idx, loss in enumerate(self.critic_loss):
        qf_scope = 'model/qf_{}/'.format(idx)
        if scope is not None:
            qf_scope = scope + '/' + qf_scope

        if debug:
            critic_shapes = []
            for var in get_trainable_vars(qf_scope):
                critic_shapes.append(var.get_shape().as_list())
            critic_nb_params = 0
            for shape in critic_shapes:
                critic_nb_params += reduce(lambda x, y: x * y, shape)
            print(' critic shapes: {}'.format(critic_shapes))
            print(' critic params: {}'.format(critic_nb_params))

        # Each critic gets its own Adam optimizer, restricted to the
        # variables under that critic's scope.
        adam = tf.compat.v1.train.AdamOptimizer(self.critic_lr)
        train_op = adam.minimize(
            loss=loss, var_list=get_trainable_vars(qf_scope))
        self.critic_optimizer.append(train_op)
def _l2_loss(l2_penalty, scope_name): """Compute the L2 regularization penalty. Parameters ---------- l2_penalty : float L2 regularization penalty scope_name : str the scope of the trainable variables to regularize Returns ------- float the overall regularization penalty """ if l2_penalty > 0: print("regularizing policy network: L2 = {}".format(l2_penalty)) regularizer = tf.contrib.layers.l2_regularizer( scale=l2_penalty, scope="{}/l2_regularize".format(scope_name)) l2_loss = tf.contrib.layers.apply_regularization( regularizer, weights_list=get_trainable_vars(scope_name)) else: # no regularization l2_loss = 0 return l2_loss
def _setup_deterministic_optimizer(self, action, scope=None):
    """Create the loss and optimizer of a deterministic policy.

    The loss is stored in ``self.loss`` and the minimization operation in
    ``self.optimizer``.

    Parameters
    ----------
    action : tf.Tensor
        the first argument handed to the loss function, compared against
        ``self.policy``
    scope : str or None
        an outer scope term; when not None it is prepended to 'model/pi/'
    """
    scope_name = 'model/pi/' if scope is None else scope + '/' + 'model/pi/'

    if self.verbose >= 2:
        print('setting up optimizer')
        print_params_shape(scope_name, "policy")

    # Huber loss when requested, mean squared error otherwise.
    loss_fn = (
        tf.compat.v1.losses.huber_loss if self.use_huber
        else tf.compat.v1.losses.mean_squared_error)
    self.loss = loss_fn(action, self.policy)

    # Only the variables under the policy scope are updated.
    adam = tf.compat.v1.train.AdamOptimizer(self.learning_rate)
    self.optimizer = adam.minimize(
        loss=self.loss,
        var_list=get_trainable_vars(scope_name))
def test_init(self):
    """Check the functionality of the __init__() method.

    This tests that the proper graph structure was generated for the
    following cases:

    1. stochastic policies
    2. deterministic policies
    """
    def trainable_names():
        """Return the sorted names of all trainable variables."""
        return sorted([var.name for var in get_trainable_vars()])

    # test case 1
    stochastic_params = self.policy_params.copy()
    stochastic_params["stochastic"] = True
    _ = ImitationFeedForwardPolicy(**stochastic_params)

    # test the graph
    policy_vars = [
        'model/pi/fc0/bias:0',
        'model/pi/fc0/kernel:0',
        'model/pi/fc1/bias:0',
        'model/pi/fc1/kernel:0',
        'model/pi/log_std/bias:0',
        'model/pi/log_std/kernel:0',
        'model/pi/mean/bias:0',
        'model/pi/mean/kernel:0',
    ]
    with_extra_vars = ['0:0', '1:0'] + policy_vars

    try:
        self.assertListEqual(trainable_names(), with_extra_vars)
    except AssertionError:
        # Seems to ignore the first two sometimes.
        self.assertListEqual(trainable_names(), policy_vars)

    # Clear the graph.
    tf.compat.v1.reset_default_graph()

    # test case 2
    deterministic_params = self.policy_params.copy()
    deterministic_params["stochastic"] = False
    _ = ImitationFeedForwardPolicy(**deterministic_params)

    # test the graph
    self.assertListEqual(
        trainable_names(),
        [
            'model/pi/fc0/bias:0',
            'model/pi/fc0/kernel:0',
            'model/pi/fc1/bias:0',
            'model/pi/fc1/kernel:0',
            'model/pi/output/bias:0',
            'model/pi/output/kernel:0',
        ])
def test_setup_model_feedforward(self):
    """Check the policy kwargs and graph built for a feedforward policy."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params.update(policy=FeedForwardPolicy, _init_setup_model=True)
    alg = RLAlgorithm(**policy_params)

    # check the policy_kwargs term
    policy_kwargs = FEEDFORWARD_PARAMS.copy()
    policy_kwargs.update(TD3_PARAMS)
    policy_kwargs.update(
        verbose=self.init_parameters['verbose'],
        num_envs=self.init_parameters['num_envs'])
    self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

    with alg.graph.as_default():
        actual_names = sorted([var.name for var in get_trainable_vars()])

    # Every network (actor and two critics, each with a target copy) has
    # two hidden layers and an output layer, each with a bias and a kernel.
    # The loops are ordered so the result is already sorted.
    expected_names = [
        '{}/{}/{}/{}:0'.format(net, head, layer, param)
        for net in ('model', 'target')
        for head in ('pi', 'qf_0', 'qf_1')
        for layer in ('fc0', 'fc1',
                      'output' if head == 'pi' else 'qf_output')
        for param in ('bias', 'kernel')
    ]

    # Check that all trainable variables have been created in the
    # TensorFlow graph.
    self.assertListEqual(actual_names, expected_names)
def test_init(self):
    """Check the functionality of the __init__() method.

    This tests that the proper graph structure was generated for the
    following cases:

    1. stochastic policies
    2. deterministic policies
    """
    def trainable_names():
        """Return the sorted names of all trainable variables."""
        return sorted([var.name for var in get_trainable_vars()])

    # test case 1
    stochastic_params = self.policy_params.copy()
    stochastic_params["stochastic"] = True
    policy = ImitationFeedForwardPolicy(**stochastic_params)

    # test the graph
    expected_stochastic = [
        'model/pi/fc0/bias:0',
        'model/pi/fc0/kernel:0',
        'model/pi/fc1/bias:0',
        'model/pi/fc1/kernel:0',
        'model/pi/log_std/bias:0',
        'model/pi/log_std/kernel:0',
        'model/pi/mean/bias:0',
        'model/pi/mean/kernel:0',
    ]
    self.assertListEqual(trainable_names(), expected_stochastic)

    # test the loss function
    del policy  # TODO

    # Clear the graph.
    tf.compat.v1.reset_default_graph()

    # test case 2
    deterministic_params = self.policy_params.copy()
    deterministic_params["stochastic"] = False
    policy = ImitationFeedForwardPolicy(**deterministic_params)

    # test the graph
    expected_deterministic = [
        'model/pi/fc0/bias:0',
        'model/pi/fc0/kernel:0',
        'model/pi/fc1/bias:0',
        'model/pi/fc1/kernel:0',
        'model/pi/output/bias:0',
        'model/pi/output/kernel:0',
    ]
    self.assertListEqual(trainable_names(), expected_deterministic)

    # test the loss function
    del policy  # TODO
def _setup_connected_gradients(self):
    """Create the updated manager optimization with connected gradients.

    Sets ``self.cg_loss``, ``self.worker_message_loss`` and
    ``self.cg_optimizer``. The optimizer updates the variables under
    "Worker/communication/" and "Worker/model/pi".
    """
    manager = self.manager
    worker = self.worker

    goal_dim = manager.ac_space.shape[0]
    obs_shape = worker.ob_space.shape[0]

    # Leading worker next-observation columns followed by the trailing
    # manager observation columns.
    obs = tf.concat(
        [worker.obs1_ph[:, :obs_shape], manager.obs_ph[:, obs_shape:]],
        axis=-1)

    if self.relative_goals:
        # The observation from the perspective of the manager can be
        # collected from the first goal_dim elements of the observation.
        # goal_dim is used in case the goal-specific observations are not
        # the entire observation space.
        obs_t = manager.obs_ph[:, :goal_dim]

        # The observation of the worker is collected in the same fashion.
        obs_tpi = worker.obs1_ph[:, :goal_dim]

        # Relative goal formulation as per HIRO.
        goal = obs_t + manager.action_ph - obs_tpi
    else:
        # Goal is the direct output from the manager in this case.
        goal = manager.action_ph

    # Rebuild the manager's first critic (reusing its variables) with the
    # worker's message as an additional input.
    with tf.compat.v1.variable_scope("Manager/model"):
        manager_with_worker_message = manager.make_critic(
            obs, goal, worker.message_tf, reuse=True, scope="qf_0")

    self.cg_loss = -tf.reduce_mean(manager_with_worker_message)

    # Penalty on the worker's message distribution.
    # NOTE(review): the expression has the form of a Gaussian KL term with
    # message_std used as a log-variance — confirm against the model.
    kl_terms = (
        1 + worker.message_std
        - tf.square(worker.message_mean)
        - tf.exp(worker.message_std))
    per_sample_kl = -0.5 * tf.reduce_sum(kl_terms, 1)
    self.worker_message_loss = tf.reduce_mean(per_sample_kl)

    adam = tf.compat.v1.train.AdamOptimizer(worker.actor_lr)
    self.cg_optimizer = adam.minimize(
        0.1 * self.worker_message_loss
        + self.cg_weights * self.cg_loss
        + worker.actor_loss,
        var_list=(
            get_trainable_vars("Worker/communication/")
            + get_trainable_vars("Worker/model/pi")),
    )
def _setup_critic_optimizer(self, scope):
    """Create the minimization operation for the critic functions.

    A single `minimize` operation is created for the sum of the two
    Q-function losses and the value function loss. The three individual
    losses are stored in ``self.critic_loss`` and the operation in
    ``self.critic_optimizer``. See Equations (5, 6) in [1] for further
    information on the Q-function update rule.

    Parameters
    ----------
    scope : str or None
        an outer scope term; when not None it is prepended to
        'model/value_fns'
    """
    scope_name = (
        'model/value_fns' if scope is None
        else scope + '/' + 'model/value_fns')

    if self.verbose >= 2:
        print('setting up critic optimizer')
        for name in ('qf1', 'qf2', 'vf'):
            print_params_shape('{}/{}'.format(scope_name, name), name)

    # Element-wise minimum of the two Q-values (Double-Q Learning).
    min_qf_pi = tf.minimum(self.qf1_pi, self.qf2_pi)

    # Target for the Q-value regression; no gradient flows through it.
    q_backup = tf.stop_gradient(
        self.rew_ph
        + (1 - self.terminals1) * self.gamma * self.value_target)

    # Huber loss when requested, mean squared error otherwise.
    loss_fn = (
        tf.compat.v1.losses.huber_loss if self.use_huber
        else tf.compat.v1.losses.mean_squared_error)

    # Q-function losses.
    qf1_loss = loss_fn(q_backup, self.qf1)
    qf2_loss = loss_fn(q_backup, self.qf2)

    # Target for the value function regression. The value function is
    # updated towards the min of the two Q-functions in order to reduce
    # overestimation bias from function approximation error.
    v_backup = tf.stop_gradient(min_qf_pi - self.alpha * self.logp_pi)
    value_loss = loss_fn(self.value_fn, v_backup)

    self.critic_loss = (qf1_loss, qf2_loss, value_loss)

    # All three losses are minimized jointly by one optimizer.
    total_loss = qf1_loss + qf2_loss + value_loss
    adam = tf.compat.v1.train.AdamOptimizer(self.critic_lr)
    self.critic_optimizer = adam.minimize(
        total_loss,
        var_list=get_trainable_vars(scope_name))
def test_init(self):
    """Check the functionality of the __init__() method.

    This method is tested for the following features:

    1. The proper structure graph was generated.
    2. All input placeholders are correct.
    """
    policy_params = deepcopy(self.policy_params)
    sess = tf.compat.v1.Session()
    policy_params['sess'] = sess

    # Close the session even when the policy fails to build or an
    # assertion fails, so a failing test does not leak the session into
    # the rest of the test run.
    try:
        policy = PPOFeedForwardPolicy(**policy_params)

        # test case 1
        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]),
            ['model/logstd:0',
             'model/pi/fc0/bias:0',
             'model/pi/fc0/kernel:0',
             'model/pi/fc1/bias:0',
             'model/pi/fc1/kernel:0',
             'model/pi/output/bias:0',
             'model/pi/output/kernel:0',
             'model/vf/fc0/bias:0',
             'model/vf/fc0/kernel:0',
             'model/vf/fc1/bias:0',
             'model/vf/fc1/kernel:0',
             'model/vf/output/bias:0',
             'model/vf/output/kernel:0']
        )

        # test case 2
        # __int__() is called directly (rather than int()) so that unknown
        # dimensions come back as None instead of raising.
        self.assertEqual(
            tuple(v.__int__() for v in policy.rew_ph.shape),
            (None,))
        self.assertEqual(
            tuple(v.__int__() for v in policy.action_ph.shape),
            (None, 1))
        self.assertEqual(
            tuple(v.__int__() for v in policy.obs_ph.shape),
            (None, 5))
        self.assertEqual(
            tuple(v.__int__() for v in policy.advs_ph.shape),
            (None,))
        self.assertEqual(
            tuple(v.__int__() for v in policy.old_neglog_pac_ph.shape),
            (None,))
        self.assertEqual(
            tuple(v.__int__() for v in policy.old_vpred_ph.shape),
            (None,))
    finally:
        # Kill the session.
        sess.close()
def _setup_target_updates(model_scope, target_scope, scope, tau, verbose):
    """Create the soft and initial target updates.

    The trainable variables under `model_scope` and `target_scope` are
    collected and handed to `get_target_updates`. If an additional outer
    scope was used when creating the policies, it can be passed through
    the `scope` parameter and is prepended to both scope names.

    Parameters
    ----------
    model_scope : str
        the scope of the model parameters
    target_scope : str
        the scope of the target parameters
    scope : str or None
        the outer scope, set to None if not available
    tau : float
        target update rate
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow
        debug

    Returns
    -------
    tf.Operation
        initial target updates, to match the target with the model
    tf.Operation
        soft target update operations
    """
    if scope is not None:
        model_scope, target_scope = [
            scope + '/' + inner for inner in (model_scope, target_scope)
        ]

    model_vars = get_trainable_vars(model_scope)
    target_vars = get_trainable_vars(target_scope)

    return get_target_updates(model_vars, target_vars, tau, verbose)
def test_init(self):
    """Check the functionality of the __init__() method.

    This method is tested for the following features:

    1. The proper structure graph was generated.
    2. All input placeholders are correct.
    """
    policy = TD3FeedForwardPolicy(**self.policy_params)

    # test case 1
    actual_names = sorted([var.name for var in get_trainable_vars()])

    # Every network (actor and two critics, each with a target copy) has
    # two hidden layers and an output layer, each with a bias and a kernel.
    # The loops are ordered so the result is already sorted.
    expected_names = [
        '{}/{}/{}/{}:0'.format(net, head, layer, param)
        for net in ('model', 'target')
        for head in ('pi', 'qf_0', 'qf_1')
        for layer in ('fc0', 'fc1',
                      'output' if head == 'pi' else 'qf_output')
        for param in ('bias', 'kernel')
    ]
    self.assertListEqual(actual_names, expected_names)

    # test case 2
    def dims(placeholder):
        """Return the placeholder's shape as a tuple (None if unknown)."""
        return tuple(v.__int__() for v in placeholder.shape)

    self.assertEqual(dims(policy.terminals1), (None, 1))
    self.assertEqual(dims(policy.rew_ph), (None, 1))
    self.assertEqual(
        dims(policy.action_ph),
        (None, self.policy_params['ac_space'].shape[0]))

    # Both observation placeholders hold the observation and the context.
    for obs_placeholder in (policy.obs_ph, policy.obs1_ph):
        self.assertEqual(
            dims(obs_placeholder),
            (None, self.policy_params['ob_space'].shape[0]
             + self.policy_params['co_space'].shape[0]))
def _setup_actor_optimizer(self, scope):
    """Create the actor loss and the operation that minimizes it.

    The loss is stored in ``self.actor_loss`` and the minimization
    operation in ``self.actor_optimizer``.

    Parameters
    ----------
    scope : str or None
        an outer scope term; when not None it is prepended to 'model/pi/'
    """
    scope_name = 'model/pi/' if scope is None else scope + '/' + 'model/pi/'

    if self.verbose >= 2:
        print('setting up actor optimizer')
        print_params_shape(scope_name, "actor")

    # The loss is the negated mean of the first critic-with-actor output.
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf[0])

    # Only the variables under the actor scope are updated.
    adam = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
    self.actor_optimizer = adam.minimize(
        self.actor_loss,
        var_list=get_trainable_vars(scope_name))
def _setup_stochastic_optimizer(self, scope):
    """Create the loss and optimizer of a stochastic policy.

    The loss is stored in ``self.loss`` and the minimization operation in
    ``self.optimizer``.

    Parameters
    ----------
    scope : str or None
        an outer scope term; when not None it is prepended to 'model/pi/'
    """
    scope_name = 'model/pi/' if scope is None else scope + '/' + 'model/pi/'

    if self.verbose >= 2:
        print('setting up optimizer')
        print_params_shape(scope_name, "policy")

    # The loss is the negated mean of self.logp_ac.
    self.loss = -tf.reduce_mean(self.logp_ac)

    # Only the variables under the policy scope are updated.
    adam = tf.compat.v1.train.AdamOptimizer(self.learning_rate)
    self.optimizer = adam.minimize(
        loss=self.loss,
        var_list=get_trainable_vars(scope_name))
def _setup_connected_gradients(self):
    """Create the updated manager optimization with connected gradients.

    Sets ``self.cg_loss`` and ``self.cg_optimizer``. The optimizer updates
    the variables under "Manager/model/pi/".
    """
    manager = self.manager
    worker = self.worker

    # Index relevant variables based on self.goal_indices.
    manager_obs0 = self.crop_to_goal(manager.obs_ph)
    manager_obs1 = self.crop_to_goal(manager.obs1_ph)
    worker_obs0 = self.crop_to_goal(worker.obs_ph)
    worker_obs1 = self.crop_to_goal(worker.obs1_ph)

    if self.relative_goals:
        # Relative goal formulation as per HIRO.
        goal = manager_obs0 + manager.actor_tf - manager_obs1
    else:
        # Goal is the direct output from the manager in this case.
        goal = manager.actor_tf

    # Concatenate the output from the manager with the worker observation.
    obs_shape = worker.ob_space.shape[0]
    obs = tf.concat([worker.obs_ph[:, :obs_shape], goal], axis=-1)

    # Rebuild the worker's first critic (reusing its variables) with inputs
    # coming directly from the manager.
    with tf.compat.v1.variable_scope("Worker/model"):
        worker_with_manager_obs = worker.make_critic(
            obs, worker.action_ph, reuse=True, scope="qf_0")

    # Create a tensorflow operation that mimics the reward function that is
    # used to provide feedback to the worker.
    reached = worker_obs0 + goal if self.relative_goals else goal
    reward_fn = -tf.compat.v1.losses.mean_squared_error(reached, worker_obs1)

    # Worker loss with respect to the manager actions.
    self.cg_loss = -tf.reduce_mean(worker_with_manager_obs) - reward_fn

    # Only the manager's actor variables are updated.
    adam = tf.compat.v1.train.AdamOptimizer(manager.actor_lr)
    self.cg_optimizer = adam.minimize(
        manager.actor_loss + self.cg_weights * self.cg_loss,
        var_list=get_trainable_vars("Manager/model/pi/"),
    )
def test_init_conv(self):
    """Check the functionality of the __init__() method with conv policies.

    This method tests that the proper structure graph was generated.
    """
    policy_params = deepcopy(self.policy_params)
    sess = tf.compat.v1.Session()
    policy_params['sess'] = sess
    policy_params["model_params"]["model_type"] = "conv"

    # Close the session even when the policy fails to build or the
    # assertion fails, so a failing test does not leak the session into
    # the rest of the test run.
    try:
        _ = PPOFeedForwardPolicy(**policy_params)

        self.assertListEqual(
            sorted([var.name for var in get_trainable_vars()]),
            ['model/logstd:0',
             'model/pi/conv0/bias:0',
             'model/pi/conv0/kernel:0',
             'model/pi/conv1/bias:0',
             'model/pi/conv1/kernel:0',
             'model/pi/conv2/bias:0',
             'model/pi/conv2/kernel:0',
             'model/pi/fc0/bias:0',
             'model/pi/fc0/kernel:0',
             'model/pi/fc1/bias:0',
             'model/pi/fc1/kernel:0',
             'model/pi/output/bias:0',
             'model/pi/output/kernel:0',
             'model/vf/conv0/bias:0',
             'model/vf/conv0/kernel:0',
             'model/vf/conv1/bias:0',
             'model/vf/conv1/kernel:0',
             'model/vf/conv2/bias:0',
             'model/vf/conv2/kernel:0',
             'model/vf/fc0/bias:0',
             'model/vf/fc0/kernel:0',
             'model/vf/fc1/bias:0',
             'model/vf/fc1/kernel:0',
             'model/vf/output/bias:0',
             'model/vf/output/kernel:0']
        )
    finally:
        # Kill the session.
        sess.close()
def _setup_connected_gradients(self):
    """Create the connected gradients meta-policy optimizer.

    Sets ``self.cg_loss`` and ``self.cg_optimizer``. The optimizer updates
    the variables under "level_0/model/pi/".

    Raises
    ------
    ValueError
        if ``self.intrinsic_reward_type`` does not end with a known reward
        name
    """
    meta = self.policy[0]
    worker = self.policy[-1]
    reward_type = self.intrinsic_reward_type

    # Index relevant variables based on self.goal_indices.
    meta_obs0 = self.crop_to_goal(meta.obs_ph)
    meta_obs1 = self.crop_to_goal(meta.obs1_ph)
    worker_obs0 = self.crop_to_goal(worker.obs_ph)
    worker_obs1 = self.crop_to_goal(worker.obs1_ph)

    if self.relative_goals:
        # Relative goal formulation as per HIRO.
        goal = meta_obs0 + meta.actor_tf - meta_obs1
    else:
        # Goal is the direct output from the meta policy in this case.
        goal = meta.actor_tf

    # Concatenate the output from the meta policy with the worker
    # observation.
    obs_shape = worker.ob_space.shape[0]
    obs = tf.concat([worker.obs_ph[:, :obs_shape], goal], axis=-1)

    # Rebuild the worker's first critic (reusing its variables) with inputs
    # coming directly from the meta policy.
    with tf.compat.v1.variable_scope("level_1/model"):
        worker_with_meta_obs = worker.make_critic(
            obs, worker.action_ph, reuse=True, scope="qf_0")

    # Create a tensorflow operation that mimics the reward function that is
    # used to provide feedback to the worker.
    if reward_type.startswith("scaled"):
        # Scale the observations/goals by the action space of the upper-
        # level policy if requested.
        ac_space = meta.ac_space
        scale = 0.5 * (ac_space.high - ac_space.low)
        worker_obs0 /= scale
        goal /= scale
        worker_obs1 /= scale

    if self.relative_goals:
        # Implement relative goals if requested.
        goal += worker_obs0

    # NOTE(review): worker_obs0 is added to the goal in both reward
    # expressions below, including when it was already added just above
    # for relative goals — confirm this matches the environment's
    # intrinsic reward.
    if reward_type.endswith("exp_negative_distance"):
        squared_dist = tf.reduce_sum(
            tf.square(worker_obs0 + goal - worker_obs1), axis=1)
        reward_fn = tf.reduce_mean(tf.exp(-squared_dist))
    elif reward_type.endswith("negative_distance"):
        reward_fn = -tf.compat.v1.losses.mean_squared_error(
            worker_obs0 + goal, worker_obs1)
    else:
        raise ValueError("Unknown intrinsic reward type: {}".format(
            self.intrinsic_reward_type))

    # Scale by the worker reward scale.
    reward_fn *= self.intrinsic_reward_scale

    # Worker loss with respect to the meta policy actions.
    self.cg_loss = -tf.reduce_mean(worker_with_meta_obs) - reward_fn

    # Only the meta policy's actor variables are updated.
    adam = tf.compat.v1.train.AdamOptimizer(meta.actor_lr)
    self.cg_optimizer = adam.minimize(
        meta.actor_loss + self.cg_weights * self.cg_loss,
        var_list=get_trainable_vars("level_0/model/pi/"),
    )
def initialize(self):
    """See parent class.

    This method performs the following operations:

    - It calls the initialization methods of the policies at every level
      of the hierarchy to match the target value function parameters with
      the current policy parameters.
    - It also imports the lower-level policies (levels 1 and below) from a
      pretrained checkpoint if a path to one is specified.

    Raises
    ------
    ValueError
        if no checkpoint number was specified and the checkpoint directory
        contains no ".meta" files
    AssertionError
        if the checkpoint's number of levels, variable names or variable
        shapes do not match the current policy
    """
    # Initialize the separate policies in the hierarchy.
    for i in range(self.num_levels):
        self.policy[i].initialize()

    if self.pretrain_path is None:
        return

    ckpt_path = os.path.join(self.pretrain_path, "checkpoints")

    # Get the checkpoint number.
    if self.pretrain_ckpt is None:
        # Default to the highest iteration number among the ".meta" files,
        # which are named "<prefix>-<number>.meta".
        suffix = ".meta"
        metanum = [
            int(f[:-len(suffix)].split("-")[-1])
            for f in os.listdir(ckpt_path)
            if f.endswith(suffix)
        ]
        if not metanum:
            raise ValueError(
                "No checkpoint (.meta) files found in {}".format(ckpt_path))
        ckpt_num = max(metanum)
    else:
        ckpt_num = self.pretrain_ckpt

    # Extract the checkpoint path.
    ckpt_path = os.path.join(ckpt_path, "itr-{}".format(ckpt_num))
    var_list = tf.train.list_variables(ckpt_path)
    ckpt_reader = tf.train.load_checkpoint(ckpt_path)

    # Check that the number of levels match.
    assert var_list[-1][0].startswith(
        "level_{}".format(self.num_levels-1)), \
        "Number of levels between the checkpoint and current policy " \
        "do not match. Policy={}, Checkpoint={}".format(
            self.num_levels,
            int(var_list[-1][0].split("/")[0][6:]) + 1)

    # Only the lower-level policies are imported. The trailing slash keeps
    # e.g. "level_1/" from also matching "level_10/...".
    lower_prefixes = tuple(
        "level_{}/".format(level) for level in range(1, self.num_levels))
    lower_level_vars = [
        (name, shape) for name, shape in var_list
        if name.startswith(lower_prefixes)
    ]

    current_vars = {v.name: v for v in get_trainable_vars()}

    # Check that the names and shapes of the lower-level policy parameters
    # match the current policy. Everything is validated before anything is
    # assigned.
    for name, var_shape in lower_level_vars:
        var_name = "{}:0".format(name)
        assert var_name in current_vars, \
            "{} not available in current policy.".format(var_name)
        current_shape = current_vars[var_name].shape.as_list()
        assert current_shape == var_shape, \
            "Shape mismatch for {}, {} != {}".format(
                var_name, var_shape, current_shape)

    # Import the lower-level policy parameters in a single session call.
    assign_ops = [
        tf.compat.v1.assign(
            current_vars["{}:0".format(name)], ckpt_reader.get_tensor(name))
        for name, _ in lower_level_vars
    ]
    if assign_ops:
        self.sess.run(assign_ops)
def _setup_cooperative_gradients(self):
    """Create the cooperative gradients meta-policy optimizer.

    For every pair of adjacent levels this builds a cooperative loss and
    an optimizer for the upper level's actor; they are stored in the lists
    ``self.cg_loss`` and ``self.cg_optimizer``. ``self.cg_weights`` is
    replaced by a list with one entry per pair: trainable variables when
    ``self.cg_delta`` is set, copies of the original value otherwise.

    When ``self.cg_delta`` is set, ``self.cg_weights_optimizer`` is a
    single operation that updates the weights of all level pairs.

    Raises
    ------
    ValueError
        if ``self.intrinsic_reward_type`` does not end with a known reward
        name
    """
    self._n_train_steps = 0

    if self.cg_delta is not None:
        # One trainable weight per level pair. It is passed through
        # tf.exp() wherever it is used below.
        self.cg_weights = [
            tf.compat.v1.Variable(initial_value=-4.20, trainable=True)
            for _ in range(self.num_levels - 1)
        ]
    else:
        # NOTE(review): the fixed weight is also passed through tf.exp()
        # below — confirm that cg_weights is meant to be given in log
        # space when cg_delta is None.
        self.cg_weights = [
            self.cg_weights for _ in range(self.num_levels - 1)
        ]

    self.cg_loss = []
    self.cg_optimizer = []
    # Per-level weight update ops, combined after the loop. Assigning the
    # attribute inside the loop would keep only the last level's op.
    cg_weights_ops = []

    for level in range(self.num_levels - 1):
        # Index relevant variables based on self.goal_indices
        meta_obs0 = self.crop_to_goal(self.policy[level].obs_ph)
        meta_obs1 = self.crop_to_goal(self.policy[level].obs1_ph)
        worker_obs0 = self.crop_to_goal(self.policy[level + 1].obs_ph)
        worker_obs1 = self.crop_to_goal(self.policy[level + 1].obs1_ph)

        if self.relative_goals:
            # Relative goal formulation as per HIRO.
            goal = meta_obs0 + self.policy[level].actor_tf - meta_obs1
        else:
            # Goal is the direct output from the meta policy in this case.
            goal = self.policy[level].actor_tf

        # Concatenate the output from the manager with the worker policy.
        obs_shape = self.policy[level + 1].ob_space.shape[0]
        obs = tf.concat(
            [self.policy[level + 1].obs_ph[:, :obs_shape], goal], axis=-1)

        # Create the worker policy with inputs directly from the manager.
        with tf.compat.v1.variable_scope("level_{}/model".format(level + 1)):
            worker_with_meta_obs = self.policy[level + 1].make_critic(
                obs,
                self.policy[level + 1].action_ph,
                reuse=True, scope="qf_0")

        # Create a tensorflow operation that mimics the reward function
        # that is used to provide feedback to the worker.
        if self.intrinsic_reward_type.startswith("scaled"):
            # Scale the observations/goals by the action space of the
            # upper-level policy if requested.
            ac_space = self.policy[level].ac_space
            scale = 0.5 * (ac_space.high - ac_space.low)
            worker_obs0 /= scale
            goal /= scale
            worker_obs1 /= scale

        if self.relative_goals:
            # Implement relative goals if requested.
            goal += worker_obs0

        # NOTE(review): worker_obs0 is added to the goal in both reward
        # expressions below, including when it was already added just
        # above for relative goals — confirm this matches the
        # environment's intrinsic reward.
        if self.intrinsic_reward_type.endswith("exp_negative_distance"):
            reward_fn = tf.reduce_mean(
                tf.exp(-tf.reduce_sum(
                    tf.square(worker_obs0 + goal - worker_obs1), axis=1)))
        elif self.intrinsic_reward_type.endswith("negative_distance"):
            reward_fn = -tf.compat.v1.losses.mean_squared_error(
                worker_obs0 + goal, worker_obs1)
        else:
            raise ValueError("Unknown intrinsic reward type: {}".format(
                self.intrinsic_reward_type))

        # Scale by the worker reward scale.
        reward_fn *= self.intrinsic_reward_scale

        # Compute the worker loss with respect to the meta policy actions.
        cg_loss = -(tf.reduce_mean(worker_with_meta_obs) + reward_fn)
        self.cg_loss.append(cg_loss)

        # Create the optimizer object.
        optimizer = tf.compat.v1.train.AdamOptimizer(
            self.policy[level].actor_lr)
        self.cg_optimizer.append(
            optimizer.minimize(
                self.policy[level].actor_loss
                + tf.exp(self.cg_weights[level]) * cg_loss,
                var_list=get_trainable_vars(
                    "level_{}/model/pi/".format(level)),
            ))

        if self.cg_delta is not None:
            # Loss for the weight itself. The gradient is stopped so that
            # only the weight variable is affected by this term.
            cg_weights_loss = tf.reduce_mean(
                tf.exp(self.cg_weights[level]) * tf.stop_gradient(
                    worker_with_meta_obs + reward_fn - self.cg_delta))

            optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr)
            cg_weights_ops.append(
                optimizer.minimize(
                    cg_weights_loss,
                    var_list=[self.cg_weights[level]]))

            # Add to tensorboard.
            tf.compat.v1.summary.scalar(
                "level_{}/cg_weights_log".format(level),
                self.cg_weights[level])
            tf.compat.v1.summary.scalar(
                "level_{}/cg_weights".format(level),
                tf.exp(self.cg_weights[level]))
            tf.compat.v1.summary.scalar(
                "level_{}/cg_weights_loss".format(level),
                cg_weights_loss)
            tf.compat.v1.summary.scalar(
                "level_{}/worker_with_meta_obs".format(level),
                tf.reduce_mean(worker_with_meta_obs))

    if len(cg_weights_ops) == 1:
        # Two-level hierarchy: expose the single op directly.
        self.cg_weights_optimizer = cg_weights_ops[0]
    elif cg_weights_ops:
        # Deeper hierarchies: one op that updates every level's weight.
        self.cg_weights_optimizer = tf.group(*cg_weights_ops)
def test_setup_model_goal_conditioned(self):
    """Check the kwargs and graph variables of a goal-conditioned policy."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params.update(
        policy=GoalConditionedPolicy,
        _init_setup_model=True,
    )
    alg = OffPolicyRLAlgorithm(**policy_params)

    # check the policy_kwargs term
    wanted_kwargs = GOAL_CONDITIONED_PARAMS.copy()
    wanted_kwargs.update(TD3_PARAMS)
    wanted_kwargs.update(
        verbose=self.init_parameters['verbose'],
        env_name=self.init_parameters['env'],
        num_envs=self.init_parameters['num_envs'],
    )
    self.assertDictEqual(alg.policy_kwargs, wanted_kwargs)

    # Names of the trainable variables, in sorted order: two levels, each
    # with a "model" and a "target" copy of the actor and both critics.
    layers = {
        'pi': ('fc0', 'fc1', 'output'),
        'qf_0': ('fc0', 'fc1', 'qf_output'),
        'qf_1': ('fc0', 'fc1', 'qf_output'),
    }
    wanted_names = [
        'level_{}/{}/{}/{}/{}:0'.format(level, net, head, layer, param)
        for level in (0, 1)
        for net in ('model', 'target')
        for head in ('pi', 'qf_0', 'qf_1')
        for layer in layers[head]
        for param in ('bias', 'kernel')
    ]

    with alg.graph.as_default():
        found_names = sorted([var.name for var in get_trainable_vars()])

    # Check that all trainable variables have been created in the
    # TensorFlow graph.
    self.assertListEqual(found_names, wanted_names)
def _setup_critic_update(self, critic, all_obs1_ph, actor_target, rew_ph,
                         done1, scope):
    """Build the centralized critic losses and their update operations.

    Parameters
    ----------
    critic : list of tf.Variable
        the outputs of the agent's centralized critics; one loss is created
        per element
    all_obs1_ph : tf.compat.v1.placeholder
        the placeholder for the full-state observation
    actor_target : tf.Variable
        the output from the combined target actors of all agents
    rew_ph : tf.compat.v1.placeholder
        placeholder for the rewards of the agent
    done1 : tf.compat.v1.placeholder
        placeholder for the done mask of the agent
    scope : str
        an outer scope term

    Returns
    -------
    list of tf.Operation
        the operations that return the loss of each critic
    list of tf.Operation
        the operations that update the trainable parameters of each critic
    """
    if self.verbose >= 2:
        print('setting up critic optimizer')

    # Create the two centralized target critics.
    critic_target = []
    with tf.compat.v1.variable_scope("target", reuse=False):
        for idx in range(2):
            critic_target.append(
                self.make_critic(all_obs1_ph, actor_target,
                                 scope="centralized_qf_%d" % idx))

    # Bellman target built from the smaller of the two target estimates.
    with tf.compat.v1.variable_scope("loss", reuse=False):
        q_obs1 = tf.minimum(*critic_target)
        bootstrap = (1. - done1) * self.gamma * q_obs1
        target_q = tf.stop_gradient(rew_ph + bootstrap)

        tf.compat.v1.summary.scalar('critic_target',
                                    tf.reduce_mean(target_q))

    # choose the loss function
    loss_fn = (
        tf.compat.v1.losses.huber_loss if self.use_huber
        else tf.compat.v1.losses.mean_squared_error
    )

    critic_loss = [loss_fn(q_value, target_q) for q_value in critic]

    critic_optimizer = []
    for idx, qf_loss in enumerate(critic_loss):
        scope_name = 'model/centralized_qf_%d' % idx
        if scope is not None:
            scope_name = '/'.join((scope, scope_name))

        qf_vars = get_trainable_vars(scope_name)

        if self.verbose >= 2:
            shapes = [var.get_shape().as_list() for var in qf_vars]
            nb_params = sum(
                reduce(lambda a, b: a * b, dims) for dims in shapes)
            print(' critic shapes: {}'.format(shapes))
            print(' critic params: {}'.format(nb_params))

        # One Adam instance per critic.
        optimizer = tf.compat.v1.train.AdamOptimizer(self.critic_lr)
        critic_optimizer.append(
            optimizer.minimize(loss=qf_loss, var_list=qf_vars))

    return critic_loss, critic_optimizer
def _setup_optimizers(self, scope):
    """Create the clipped-surrogate loss and its Adam update operation.

    Parameters
    ----------
    scope : str or None
        an outer scope term; when not None it is prepended (followed by
        "/") to "model/" to select the trainable variables

    Notes
    -----
    Sets ``self.entropy``, ``self.vf_loss``, ``self.pg_loss``,
    ``self.approxkl``, ``self.clipfrac``, ``self.loss`` and
    ``self.optimizer``. If ``self.cliprange_vf`` is None it is overwritten
    with ``self.cliprange``.
    """
    scope_name = 'model/'
    if scope is not None:
        scope_name = scope + '/' + scope_name

    if self.verbose >= 2:
        print('setting up actor optimizer')
        print_params_shape("{}pi/".format(scope_name), "actor")
        print('setting up critic optimizer')
        print_params_shape("{}vf/".format(scope_name), "critic")

    neglogpac = self._neglogp(self.action_ph)
    self.entropy = tf.reduce_sum(
        tf.reshape(self.pi_logstd, [-1])
        + .5 * np.log(2.0 * np.pi * np.e), axis=-1)

    # Value function clipping: not present in the original PPO
    if self.cliprange_vf is None:
        # Default behavior (legacy from OpenAI baselines):
        # use the same clipping as for the policy
        self.cliprange_vf = self.cliprange

    if self.cliprange_vf < 0:
        # Original PPO implementation: no value function clipping.
        vpred_clipped = self.value_flat
    else:
        # Clip the difference between old and new value
        # NOTE: this depends on the reward scaling
        vpred_clipped = self.old_vpred_ph + tf.clip_by_value(
            self.value_flat - self.old_vpred_ph,
            -self.cliprange_vf, self.cliprange_vf)

    # Value loss: element-wise maximum of the unclipped and clipped errors.
    vf_losses1 = tf.square(self.value_flat - self.rew_ph)
    vf_losses2 = tf.square(vpred_clipped - self.rew_ph)
    self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    # Policy loss: element-wise maximum of the unclipped and clipped
    # negated surrogate terms.
    ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
    pg_losses = -self.advs_ph * ratio
    pg_losses2 = -self.advs_ph * tf.clip_by_value(
        ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
    self.pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))

    # Diagnostics: squared-difference KL estimate and the fraction of
    # samples whose ratio falls outside the clipping range.
    self.approxkl = .5 * tf.reduce_mean(
        tf.square(neglogpac - self.old_neglog_pac_ph))
    self.clipfrac = tf.reduce_mean(
        tf.cast(tf.greater(tf.abs(ratio - 1.0), self.cliprange),
                tf.float32))

    self.loss = self.pg_loss - self.entropy * self.ent_coef \
        + self.vf_loss * self.vf_coef

    # Compute the gradients of the loss.
    var_list = get_trainable_vars(scope_name)
    grads = tf.gradients(self.loss, var_list)

    # Perform gradient clipping if requested.
    if self.max_grad_norm is not None:
        grads, _grad_norm = tf.clip_by_global_norm(
            grads, self.max_grad_norm)
    grads = list(zip(grads, var_list))

    # Create the operation that applies the gradients. The compat.v1 alias
    # is required because `tf.train.AdamOptimizer` is not available in the
    # TensorFlow 2.x top-level namespace.
    self.optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=self.learning_rate,
        epsilon=1e-5).apply_gradients(grads)
def test_init_conv(self):
    """Check the functionality of the __init__() method with conv policies.

    This method tests that the proper structure graph was generated.
    """
    policy_params = self.policy_params.copy()
    # `copy()` above is shallow, so the nested "model_params" dict is still
    # shared with the fixture. Replace it with a copy before editing so the
    # "conv" setting cannot leak into other tests.
    model_params = policy_params["model_params"].copy()
    model_params["model_type"] = "conv"
    policy_params["model_params"] = model_params

    _ = SACFeedForwardPolicy(**policy_params)

    self.assertListEqual(
        sorted([var.name for var in get_trainable_vars()]),
        ['model/log_alpha:0',
         'model/pi/conv0/bias:0',
         'model/pi/conv0/kernel:0',
         'model/pi/conv1/bias:0',
         'model/pi/conv1/kernel:0',
         'model/pi/conv2/bias:0',
         'model/pi/conv2/kernel:0',
         'model/pi/fc0/bias:0',
         'model/pi/fc0/kernel:0',
         'model/pi/fc1/bias:0',
         'model/pi/fc1/kernel:0',
         'model/pi/log_std/bias:0',
         'model/pi/log_std/kernel:0',
         'model/pi/mean/bias:0',
         'model/pi/mean/kernel:0',
         'model/value_fns/qf1/conv0/bias:0',
         'model/value_fns/qf1/conv0/kernel:0',
         'model/value_fns/qf1/conv1/bias:0',
         'model/value_fns/qf1/conv1/kernel:0',
         'model/value_fns/qf1/conv2/bias:0',
         'model/value_fns/qf1/conv2/kernel:0',
         'model/value_fns/qf1/fc0/bias:0',
         'model/value_fns/qf1/fc0/kernel:0',
         'model/value_fns/qf1/fc1/bias:0',
         'model/value_fns/qf1/fc1/kernel:0',
         'model/value_fns/qf1/qf_output/bias:0',
         'model/value_fns/qf1/qf_output/kernel:0',
         'model/value_fns/qf2/conv0/bias:0',
         'model/value_fns/qf2/conv0/kernel:0',
         'model/value_fns/qf2/conv1/bias:0',
         'model/value_fns/qf2/conv1/kernel:0',
         'model/value_fns/qf2/conv2/bias:0',
         'model/value_fns/qf2/conv2/kernel:0',
         'model/value_fns/qf2/fc0/bias:0',
         'model/value_fns/qf2/fc0/kernel:0',
         'model/value_fns/qf2/fc1/bias:0',
         'model/value_fns/qf2/fc1/kernel:0',
         'model/value_fns/qf2/qf_output/bias:0',
         'model/value_fns/qf2/qf_output/kernel:0',
         'model/value_fns/vf/conv0/bias:0',
         'model/value_fns/vf/conv0/kernel:0',
         'model/value_fns/vf/conv1/bias:0',
         'model/value_fns/vf/conv1/kernel:0',
         'model/value_fns/vf/conv2/bias:0',
         'model/value_fns/vf/conv2/kernel:0',
         'model/value_fns/vf/fc0/bias:0',
         'model/value_fns/vf/fc0/kernel:0',
         'model/value_fns/vf/fc1/bias:0',
         'model/value_fns/vf/fc1/kernel:0',
         'model/value_fns/vf/vf_output/bias:0',
         'model/value_fns/vf/vf_output/kernel:0',
         'target/value_fns/vf/conv0/bias:0',
         'target/value_fns/vf/conv0/kernel:0',
         'target/value_fns/vf/conv1/bias:0',
         'target/value_fns/vf/conv1/kernel:0',
         'target/value_fns/vf/conv2/bias:0',
         'target/value_fns/vf/conv2/kernel:0',
         'target/value_fns/vf/fc0/bias:0',
         'target/value_fns/vf/fc0/kernel:0',
         'target/value_fns/vf/fc1/bias:0',
         'target/value_fns/vf/fc1/kernel:0',
         'target/value_fns/vf/vf_output/bias:0',
         'target/value_fns/vf/vf_output/kernel:0']
    )
def test_init(self):
    """Verify what __init__() builds for the feed-forward policy.

    The following features are covered:

    1. The expected trainable variables exist in the graph.
    2. The input placeholders have the expected shapes.
    3. self.log_alpha is initialized to zero
    4. self.target_entropy follows the specified value, including the
       special (None) case
    """
    policy = SACFeedForwardPolicy(**self.policy_params)

    # test case 1
    params = ('bias', 'kernel')
    wanted_names = ['model/log_alpha:0']
    wanted_names += [
        'model/pi/{}/{}:0'.format(layer, param)
        for layer in ('fc0', 'fc1', 'log_std', 'mean')
        for param in params
    ]
    for fn_name, out_name in (('qf1', 'qf_output'),
                              ('qf2', 'qf_output'),
                              ('vf', 'vf_output')):
        wanted_names += [
            'model/value_fns/{}/{}/{}:0'.format(fn_name, layer, param)
            for layer in ('fc0', 'fc1', out_name)
            for param in params
        ]
    wanted_names += [
        'target/value_fns/vf/{}/{}:0'.format(layer, param)
        for layer in ('fc0', 'fc1', 'vf_output')
        for param in params
    ]
    self.assertListEqual(
        sorted([var.name for var in get_trainable_vars()]), wanted_names)

    # test case 2
    def dims(placeholder):
        return tuple(v.__int__() for v in placeholder.shape)

    ac_dim = self.policy_params['ac_space'].shape[0]
    obs_dim = (self.policy_params['ob_space'].shape[0]
               + self.policy_params['co_space'].shape[0])

    self.assertEqual(dims(policy.terminals1), (None, 1))
    self.assertEqual(dims(policy.rew_ph), (None, 1))
    self.assertEqual(dims(policy.action_ph), (None, ac_dim))
    self.assertEqual(dims(policy.obs_ph), (None, obs_dim))
    self.assertEqual(dims(policy.obs1_ph), (None, obs_dim))

    # Initialize the variables of the policy.
    policy.sess.run(tf.compat.v1.global_variables_initializer())

    # test case 3
    self.assertEqual(policy.sess.run(policy.log_alpha), 0.0)

    # test case 4a
    self.assertEqual(policy.target_entropy, -ac_dim)

    # Clear the graph.
    tf.compat.v1.reset_default_graph()

    # test case 4b
    self.policy_params['target_entropy'] = 5
    policy = SACFeedForwardPolicy(**self.policy_params)
    self.assertEqual(policy.target_entropy,
                     self.policy_params['target_entropy'])
def test_init(self):
    """Check the stored attributes and the variables created in the graph."""
    policy = SACGoalConditionedPolicy(**self.policy_params)

    # Check that the abstract class has all the required attributes.
    for attr in ('meta_period',
                 'relative_goals',
                 'off_policy_corrections',
                 'use_fingerprints',
                 'centralized_value_functions',
                 'connected_gradients',
                 'cg_weights'):
        self.assertEqual(getattr(policy, attr), self.policy_params[attr])

    # Every level ("Manager", then "Worker") holds the same set of
    # variables; build the sorted name list level by level.
    params = ('bias', 'kernel')
    wanted_names = []
    for prefix in ('Manager', 'Worker'):
        wanted_names.append('{}/model/log_alpha:0'.format(prefix))
        wanted_names += [
            '{}/model/pi/{}/{}:0'.format(prefix, layer, param)
            for layer in ('fc0', 'fc1', 'log_std', 'mean')
            for param in params
        ]
        for fn_name, out_name in (('qf1', 'qf_output'),
                                  ('qf2', 'qf_output'),
                                  ('vf', 'vf_output')):
            wanted_names += [
                '{}/model/value_fns/{}/{}/{}:0'.format(
                    prefix, fn_name, layer, param)
                for layer in ('fc0', 'fc1', out_name)
                for param in params
            ]
        wanted_names += [
            '{}/target/value_fns/vf/{}/{}:0'.format(prefix, layer, param)
            for layer in ('fc0', 'fc1', 'vf_output')
            for param in params
        ]

    self.assertListEqual(
        sorted([var.name for var in get_trainable_vars()]), wanted_names)
def test_init(self):
    """Check the stored attributes and the variables created in the graph."""
    policy = TD3GoalConditionedPolicy(**self.policy_params)

    # Check that the abstract class has all the required attributes.
    for attr in ('meta_period',
                 'relative_goals',
                 'off_policy_corrections',
                 'use_fingerprints',
                 'centralized_value_functions',
                 'connected_gradients',
                 'cg_weights'):
        self.assertEqual(getattr(policy, attr), self.policy_params[attr])

    # Each level has a "model" and a "target" copy of the actor and of both
    # critics; the nesting below yields the names in sorted order.
    layers = {
        'pi': ('fc0', 'fc1', 'output'),
        'qf_0': ('fc0', 'fc1', 'qf_output'),
        'qf_1': ('fc0', 'fc1', 'qf_output'),
    }
    wanted_names = [
        '{}/{}/{}/{}/{}:0'.format(prefix, net, head, layer, param)
        for prefix in ('Manager', 'Worker')
        for net in ('model', 'target')
        for head in ('pi', 'qf_0', 'qf_1')
        for layer in layers[head]
        for param in ('bias', 'kernel')
    ]

    # Check that all trainable variables have been created in the
    # TensorFlow graph.
    self.assertListEqual(
        sorted([var.name for var in get_trainable_vars()]), wanted_names)
def _setup_optimizers(self, scope):
    """Create the policy surrogate terms and the value function optimizer.

    Parameters
    ----------
    scope : str or None
        an outer scope term; when not None it is prepended (followed by
        "/") to both "model/" and "oldpi/"

    Notes
    -----
    Sets ``self.losses``, ``self.get_flat``, ``self.set_from_flat``,
    ``self.fvp``, ``self.assign_old_eq_new``, ``self.vf_optimizer`` and
    ``self.grad``. It also runs the global variable initializer in
    ``self.sess``.
    """
    scope_name = 'model/'
    old_scope_name = "oldpi/"
    if scope is not None:
        scope_name = scope + '/' + scope_name
        old_scope_name = scope + '/' + old_scope_name

    if self.verbose >= 2:
        print('setting up actor optimizer')
        print_params_shape("{}pi/".format(scope_name), "actor")
        print('setting up critic optimizer')
        print_params_shape("{}vf/".format(scope_name), "critic")

    # =================================================================== #
    # Create the policy loss and optimizers.                              #
    # =================================================================== #

    # `tf.variable_scope` is not available in the TensorFlow 2.x top-level
    # namespace; use the compat.v1 alias like the optimizer below.
    with tf.compat.v1.variable_scope("loss", reuse=False):
        # Compute the KL divergence.
        kloldnew = tf.reduce_sum(
            self.pi_logstd - self.old_pi_logstd + (
                tf.square(self.old_pi_std) +
                tf.square(self.old_pi_mean - self.pi_mean))
            / (2.0 * tf.square(self.pi_std)) - 0.5, axis=-1)
        meankl = tf.reduce_mean(kloldnew)

        # Compute the entropy bonus.
        entropy = tf.reduce_sum(
            self.pi_logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
        meanent = tf.reduce_mean(entropy)
        entbonus = self.ent_coef * meanent

        # advantage * pnew / pold
        ratio = tf.exp(
            self.logp(self.action_ph, old=False) -
            self.logp(self.action_ph, old=True))
        surrgain = tf.reduce_mean(ratio * self.advs_ph)

        optimgain = surrgain + entbonus
        self.losses = [optimgain, meankl, entbonus, surrgain, meanent]

        # Split the variables by name: everything without "/vf" or "/q/"
        # goes to the policy list, everything without "/pi" or "/logstd"
        # goes to the value function list.
        all_var_list = get_trainable_vars(scope_name)
        var_list = [
            v for v in all_var_list
            if "/vf" not in v.name and "/q/" not in v.name
        ]
        vf_var_list = [
            v for v in all_var_list
            if "/pi" not in v.name and "/logstd" not in v.name
        ]

        self.get_flat = GetFlat(var_list, sess=self.sess)
        self.set_from_flat = SetFromFlat(var_list, sess=self.sess)

        # Slice the flat tangent vector into one tensor per policy
        # variable, each reshaped to that variable's shape.
        klgrads = tf.gradients(meankl, var_list)
        shapes = [var.get_shape().as_list() for var in var_list]
        start = 0
        tangents = []
        for shape in shapes:
            var_size = int(np.prod(shape))
            tangents.append(
                tf.reshape(self.flat_tangent[start:start + var_size],
                           shape))
            start += var_size

        # Inner product of the KL gradient with the tangent.
        gvp = tf.add_n([
            tf.reduce_sum(grad * tangent)
            for (grad, tangent) in zip(klgrads, tangents)
        ])

        # Fisher vector products
        self.fvp = flatgrad(gvp, var_list)

    # =================================================================== #
    # Update the old model to match the new one.                          #
    # =================================================================== #

    # `tf.assign` is likewise missing from the TensorFlow 2.x top-level
    # namespace; the compat.v1 alias is the same op.
    self.assign_old_eq_new = tf.group(*[
        tf.compat.v1.assign(oldv, newv)
        for (oldv, newv) in zip(
            get_globals_vars(old_scope_name),
            get_globals_vars(scope_name))
    ])

    # =================================================================== #
    # Create the value function optimizer.                                #
    # =================================================================== #

    vferr = tf.reduce_mean(tf.square(self.value_flat - self.ret_ph))
    optimizer = tf.compat.v1.train.AdamOptimizer(self.vf_stepsize)
    self.vf_optimizer = optimizer.minimize(
        vferr,
        var_list=vf_var_list,
    )

    # Initialize the model parameters and optimizers.
    with self.sess.as_default():
        self.sess.run(tf.compat.v1.global_variables_initializer())

        th_init = self.get_flat()
        self.set_from_flat(th_init)

    self.grad = flatgrad(optimgain, var_list)