def _define_network_(self):
    with tf.variable_scope("eval_network", reuse=tf.AUTO_REUSE):
        if len(self.ob_space.shape) == 1:
            # Used to take actions: deterministic policy mu(s), scaled to the action limit.
            with tf.variable_scope("pi", reuse=tf.AUTO_REUSE):
                self.eval_policy = net_utils.mlp(
                    self.x_ph,
                    list(self.hidden_sizes) + list(self.ac_space.shape),
                    activation=tf.nn.relu,
                    output_activation=tf.nn.tanh) * self.act_limit
            # Used to train the policy by maximizing self.eval_policy_q.
            with tf.variable_scope("q", reuse=tf.AUTO_REUSE):
                self.eval_policy_q = net_utils.mlp(
                    tf.concat((self.x_ph, self.eval_policy), axis=1),
                    list(self.hidden_sizes) + [1],
                    activation=tf.nn.relu,
                    output_activation=None)
            # Used to train the Q-function by minimizing (self.q_ph - self.eval_action_q)^2.
            # Re-enters the same "q" scope, so this head shares weights with
            # eval_policy_q (AUTO_REUSE).
            with tf.variable_scope("q", reuse=tf.AUTO_REUSE):
                self.eval_action_q = net_utils.mlp(
                    tf.concat((self.x_ph, self.a_ph), axis=1),
                    list(self.hidden_sizes) + [1],
                    activation=tf.nn.relu,
                    output_activation=None)
        else:
            raise NotImplementedError

    with tf.variable_scope("target_network", reuse=tf.AUTO_REUSE):
        if len(self.ob_space.shape) == 1:
            # Used to compute the target Q-value Q(s', a = mu(s')).
            with tf.variable_scope("pi", reuse=tf.AUTO_REUSE):
                self.target_policy = net_utils.mlp(
                    self.x_ph,
                    list(self.hidden_sizes) + list(self.ac_space.shape),
                    activation=tf.nn.relu,
                    output_activation=tf.nn.tanh) * self.act_limit
            with tf.variable_scope("q", reuse=tf.AUTO_REUSE):
                self.target_policy_q = net_utils.mlp(
                    tf.concat((self.x_ph, self.target_policy), axis=1),
                    list(self.hidden_sizes) + [1],
                    activation=tf.nn.relu,
                    output_activation=None)
        else:
            raise NotImplementedError

    self.pi_params = net_utils.get_vars("DDPG/eval_network/pi")
    self.q_params = net_utils.get_vars("DDPG/eval_network/q")
    self.eval_params = net_utils.get_vars("DDPG/eval_network")
    self.target_params = net_utils.get_vars("DDPG/target_network")

    # Polyak-averaged soft update: target <- tau * eval + (1 - tau) * target.
    self.target_update_op = [
        tf.assign(tp, self.tau * ep + (1 - self.tau) * tp)
        for ep, tp in zip(self.eval_params, self.target_params)
    ]
    # Hard copy, run once at initialization so target_net == eval_net.
    self.target_init_op = [
        tf.assign(tp, ep)
        for ep, tp in zip(self.eval_params, self.target_params)
    ]
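# The network definition above leans on two net_utils helpers that are not
# shown here. A minimal sketch of what they might look like, assuming a plain
# TF1 dense stack and a name-prefix lookup over trainable variables (the real
# net_utils may differ):
import tensorflow as tf

def mlp(x, hidden_sizes, activation=tf.nn.relu, output_activation=None):
    # Hidden layers use `activation`; the last entry of hidden_sizes is the
    # output layer and gets `output_activation` (None = linear).
    for size in hidden_sizes[:-1]:
        x = tf.layers.dense(x, units=size, activation=activation)
    return tf.layers.dense(x, units=hidden_sizes[-1],
                           activation=output_activation)

def get_vars(scope):
    # All trainable variables whose names start with the given scope prefix.
    return [v for v in tf.trainable_variables() if v.name.startswith(scope)]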
def build_update(self):
    """
    Select the network variables and build the operations to copy the main
    network's weights and biases to the target network.
    """
    self.main_vars = get_vars('main_network', trainable=True)
    self.target_vars = get_vars('target_network', trainable=False)

    # Initial operation to start with target_net == main_net.
    self.init_target_op = copy_vars(self.main_vars, self.target_vars,
                                    1, 'init_target')

    # Soft update of the target network towards the main network.
    self.target_update = copy_vars(self.main_vars, self.target_vars,
                                   Settings.UPDATE_TARGET_RATE,
                                   'update_target')
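# copy_vars is assumed to build, for each (main, target) pair, an assignment
# that moves the target a fraction `tau` of the way towards the main value;
# tau == 1 gives the hard copy used at initialization. A minimal sketch:
import tensorflow as tf

def copy_vars(src_vars, dst_vars, tau, name):
    # dst <- tau * src + (1 - tau) * dst, grouped into a single op.
    update_ops = [
        tf.assign(dst, tau * src + (1 - tau) * dst)
        for src, dst in zip(src_vars, dst_vars)
    ]
    return tf.group(*update_ops, name=name)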
def _define_network_(self):
    with tf.variable_scope("network", reuse=tf.AUTO_REUSE):
        if len(self.ob_space.shape) == 3:
            # Image observations: use a convolutional actor-critic.
            self.pi, self.logp, self.logp_pi, self.pi_entropy, self.v = \
                net_utils.cnn_actor_critic(
                    self.x_ph, self.a_ph, self.hidden_sizes,
                    activation=tf.nn.tanh, action_space=self.ac_space)
        elif len(self.ob_space.shape) == 1:
            # Vector observations: use an MLP actor-critic.
            self.pi, self.logp, self.logp_pi, self.pi_entropy, self.v = \
                net_utils.mlp_actor_critic(
                    self.x_ph, self.a_ph, self.hidden_sizes,
                    activation=tf.nn.tanh, action_space=self.ac_space)

    self.get_action_ops = [self.pi, self.v, self.logp_pi]
    self.params = net_utils.get_vars("PPO/network")
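# A hedged usage sketch for get_action_ops: during rollout collection the
# three tensors are typically fetched in a single sess.run call. `sess`,
# `agent`, and `obs` are assumed names, not part of the snippet above, and
# the reshape assumes the flat-vector (MLP) observation branch.
action, value, logp = sess.run(
    agent.get_action_ops,
    feed_dict={agent.x_ph: obs.reshape(1, -1)})  # batch dimension of 1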
def build_update(self):
    """
    Build the operation to copy the weights of the learner's actor network
    into the agent's network.
    """
    with self.sess.as_default(), self.sess.graph.as_default():
        self.network_vars = get_vars('learner_actor', trainable=True)
        # Copy the learner's most recent actor weights into this agent's
        # variables; an update rate of 1 makes this a full, hard copy.
        self.update = copy_vars(self.network_vars, self.vars,
                                1, 'update_agent_' + str(self.n_agent))
def build_actor(self):
    """
    Build a copy of the learner's actor network to allow the agent to
    interact with the environment on its own.
    """
    scope = 'worker_agent_' + str(self.n_agent)
    self.state_ph = tf.placeholder(dtype=tf.float32,
                                   shape=[None, *Settings.STATE_SIZE],
                                   name='state_ph')

    # Get the policy prediction network
    self.policy = build_actor(self.state_ph, trainable=False, scope=scope)
    self.vars = get_vars(scope, trainable=False)
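# build_actor (the module-level helper called above) is assumed to stack a
# few dense layers ending in a tanh scaled to the action bound. A minimal
# sketch; the layer sizes, action size, and bound below are hypothetical
# stand-ins for whatever this codebase keeps in Settings:
import tensorflow as tf

def build_actor(states, trainable, scope):
    hidden_sizes, action_size, action_bound = (400, 300), 4, 1.0  # hypothetical
    with tf.variable_scope(scope):
        x = states
        for size in hidden_sizes:
            x = tf.layers.dense(x, units=size, activation=tf.nn.relu,
                                trainable=trainable)
        # tanh squashes to [-1, 1]; scale to the environment's action range.
        return action_bound * tf.layers.dense(x, units=action_size,
                                              activation=tf.nn.tanh,
                                              trainable=trainable)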
def build_update(self):
    """
    Select the network variables and build the operation to copy main
    weights and biases to the target network.
    """
    # Isolate vars for each network
    self.actor_vars = get_vars('actor', trainable=True)
    self.critic_vars = get_vars('critic', trainable=True)
    self.vars = self.actor_vars + self.critic_vars

    self.target_actor_vars = get_vars('target_actor', trainable=False)
    self.target_critic_vars = get_vars('target_critic', trainable=False)
    self.target_vars = self.target_actor_vars + self.target_critic_vars

    # Initial operation to start with target_net == main_net
    self.init_target_op = copy_vars(self.vars, self.target_vars,
                                    1, 'init_target')

    # Update values for target vars towards current actor and critic vars
    self.target_update = copy_vars(self.vars, self.target_vars,
                                   Settings.UPDATE_TARGET_RATE,
                                   'target_update')
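# A hedged usage sketch for the two ops above: run the hard copy once after
# variable initialization, then the soft update after each training step.
# `sess`, `agent`, and `num_training_steps` are assumed names.
sess.run(tf.global_variables_initializer())
sess.run(agent.init_target_op)      # start with target == main

for _ in range(num_training_steps):
    # ... sample a batch and run the actor/critic train ops ...
    sess.run(agent.target_update)   # slowly track the main networks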