Example #1
    def _define_network_(self):
        with tf.variable_scope("eval_network", reuse=tf.AUTO_REUSE):
            if len(self.ob_space.shape) == 1:
                # Policy network used to select actions
                with tf.variable_scope("pi", reuse=tf.AUTO_REUSE):
                    self.eval_policy = net_utils.mlp(
                        self.x_ph,
                        list(self.hidden_sizes) + list(self.ac_space.shape),
                        activation=tf.nn.relu,
                        output_activation=tf.nn.tanh) * self.act_limit
                # Q-value of the policy's own action; the policy is trained to maximize self.eval_policy_q
                with tf.variable_scope("q", reuse=tf.AUTO_REUSE):
                    self.eval_policy_q = net_utils.mlp(
                        tf.concat((self.x_ph, self.eval_policy), axis=1),
                        list(self.hidden_sizes) + [1],
                        activation=tf.nn.relu,
                        output_activation=None)
                # Q-value of the stored action; the Q-function is trained to minimize (self.q_ph - self.eval_action_q)^2
                with tf.variable_scope("q", reuse=tf.AUTO_REUSE):
                    self.eval_action_q = net_utils.mlp(
                        tf.concat((self.x_ph, self.a_ph), axis=1),
                        list(self.hidden_sizes) + [1],
                        activation=tf.nn.relu,
                        output_activation=None)
            else:
                raise NotImplementedError
        with tf.variable_scope("target_network", reuse=tf.AUTO_REUSE):
            if len(self.ob_space.shape) == 1:
                # Used to compute the target Q-value Q(s', a = mu(s'))
                with tf.variable_scope("pi", reuse=tf.AUTO_REUSE):
                    self.target_policy = net_utils.mlp(
                        self.x_ph,
                        list(self.hidden_sizes) + list(self.ac_space.shape),
                        activation=tf.nn.relu,
                        output_activation=tf.nn.tanh) * self.act_limit
                with tf.variable_scope("q", reuse=tf.AUTO_REUSE):
                    self.target_policy_q = net_utils.mlp(
                        tf.concat((self.x_ph, self.target_policy), axis=1),
                        list(self.hidden_sizes) + [1],
                        activation=tf.nn.relu,
                        output_activation=None)
            else:
                raise NotImplementedError

        self.pi_params = net_utils.get_vars("DDPG/eval_network/pi")
        self.q_params = net_utils.get_vars("DDPG/eval_network/q")
        self.eval_params = net_utils.get_vars("DDPG/eval_network")
        self.target_params = net_utils.get_vars("DDPG/target_network")

        # Polyak (soft) update: target <- tau * eval + (1 - tau) * target
        self.target_update_op = [
            tf.assign(tp, self.tau * ep + (1 - self.tau) * tp)
            for ep, tp in zip(self.eval_params, self.target_params)
        ]
        # Hard copy run once so the target params start equal to the eval params
        self.target_init_op = [
            tf.assign(tp, ep)
            for ep, tp in zip(self.eval_params, self.target_params)
        ]
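
The `net_utils.mlp` and `net_utils.get_vars` helpers are assumed rather than shown; below is a minimal TF1 sketch of what they could look like. The names and bodies are assumptions for illustration, not the project's actual implementation:

    import tensorflow as tf

    def mlp(x, hidden_sizes, activation=tf.nn.relu, output_activation=None):
        # Stack dense layers; only the last layer uses output_activation.
        for size in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=size, activation=activation)
        return tf.layers.dense(x, units=hidden_sizes[-1],
                               activation=output_activation)

    def get_vars(scope):
        # Collect every variable whose name starts with the given scope prefix.
        return [v for v in tf.global_variables() if v.name.startswith(scope)]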
Example #2
    def build_update(self):
        """
        Select the network variables and build the operation to copy main
        weights and biases to the target network.
        """
        self.main_vars = get_vars('main_network', trainable=True)
        self.target_vars = get_vars('target_network', trainable=False)

        # Initial operation to start with target_net == main_net
        self.init_target_op = copy_vars(self.main_vars, self.target_vars, 1,
                                        'init_target')

        self.target_update = copy_vars(self.main_vars, self.target_vars,
                                       Settings.UPDATE_TARGET_RATE,
                                       'update_target')
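
`copy_vars` itself is not shown; one plausible implementation is a soft-assignment helper in which a rate of 1 reduces to a hard copy. The sketch below rests on that assumption and may differ from the project's actual helper:

    import tensorflow as tf

    def copy_vars(src_vars, dst_vars, update_rate, name):
        # For rate r: dst <- r * src + (1 - r) * dst; r == 1 is a full copy.
        ops = [dst.assign(update_rate * src + (1.0 - update_rate) * dst)
               for src, dst in zip(src_vars, dst_vars)]
        return tf.group(*ops, name=name)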
Example #3
    def _define_network_(self):
        with tf.variable_scope("network", reuse=tf.AUTO_REUSE):
            if len(self.ob_space.shape) == 3:  # use a convolutional network
                self.pi, self.logp, self.logp_pi, self.pi_entropy, self.v = net_utils.cnn_actor_critic(
                    self.x_ph, self.a_ph, self.hidden_sizes,
                    activation=tf.nn.tanh, action_space=self.ac_space)
            elif len(self.ob_space.shape) == 1:  # use an MLP network
                self.pi, self.logp, self.logp_pi, self.pi_entropy, self.v = net_utils.mlp_actor_critic(
                    self.x_ph, self.a_ph, self.hidden_sizes,
                    activation=tf.nn.tanh, action_space=self.ac_space)
        self.get_action_ops = [self.pi, self.v, self.logp_pi]
        self.params = net_utils.get_vars("PPO/network")
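
At interaction time, `get_action_ops` would typically be evaluated in a single `sess.run` call. A hedged usage sketch, inside the agent, where the `sess` and `obs` names are assumptions:

    # obs is a single 1-D observation; add a batch dimension before feeding it.
    action, value, logp = sess.run(self.get_action_ops,
                                   feed_dict={self.x_ph: obs.reshape(1, -1)})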
Example #4
    def build_update(self):
        """
        Build the operation to copy the weights of the learner's actor network
        into the agent's network.
        """
        with self.sess.as_default(), self.sess.graph.as_default():

            self.network_vars = get_vars('learner_actor', trainable=True)
            self.update = copy_vars(self.network_vars, self.vars,
                                    1, 'update_agent_'+str(self.n_agent))
Example #5
    def build_actor(self):
        """
        Build a copy of the learner's actor network to allow the agent to
        interact with the environment on its own.
        """
        scope = 'worker_agent_' + str(self.n_agent)
        self.state_ph = tf.placeholder(dtype=tf.float32,
                                       shape=[None, *Settings.STATE_SIZE],
                                       name='state_ph')

        # Get the policy prediction network
        self.policy = build_actor(self.state_ph, trainable=False, scope=scope)
        self.vars = get_vars(scope, trainable=False)
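
With this placeholder in place, the worker could query its local policy copy as sketched below; `self.sess` and `obs` are assumptions here, not part of the original snippet:

    # Feed one state with a leading batch dimension and take the first action.
    action = self.sess.run(self.policy,
                           feed_dict={self.state_ph: obs[None]})[0]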
Example #6
    def build_update(self):
        """
        Select the network variables and build the operation to copy main
        weights and biases to the target network.
        """
        # Isolate vars for each network
        self.actor_vars = get_vars('actor', trainable=True)
        self.critic_vars = get_vars('critic', trainable=True)
        self.vars = self.actor_vars + self.critic_vars

        self.target_actor_vars = get_vars('target_actor', trainable=False)
        self.target_critic_vars = get_vars('target_critic', trainable=False)
        self.target_vars = self.target_actor_vars + self.target_critic_vars

        # Initial operation to start with target_net == main_net
        self.init_target_op = copy_vars(self.vars, self.target_vars, 1,
                                        'init_target')

        # Update values for target vars towards current actor and critic vars
        self.target_update = copy_vars(self.vars, self.target_vars,
                                       Settings.UPDATE_TARGET_RATE,
                                       'target_update')
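
A hypothetical training-loop skeleton showing when the two ops are meant to run; `agent`, `num_steps`, and `agent.train_step` are placeholders, not part of the original code:

    import tensorflow as tf

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(agent.init_target_op)     # start with target_net == main_net
        for step in range(num_steps):
            agent.train_step(sess)         # hypothetical learning update
            sess.run(agent.target_update)  # slowly track the main networks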
Example #7
    def build_update(self):
        """
        Build the operation to copy the weights of the learner's actor network
        into the agent's network.
        """
        with self.sess.as_default(), self.sess.graph.as_default():

            self.network_vars = get_vars('learner_actor', trainable=True)
            # Update the agent with the newest actor values: copy_vars copies
            # network_vars (source) into self.vars (destination) with an
            # update rate of 1, i.e. a full hard copy of the latest weights.
            self.update = copy_vars(self.network_vars, self.vars, 1,
                                    'update_agent_' + str(self.n_agent))