Example 1
    def run(self):
        self.env = self.make_env(self.env_id)
        self.env.seed(self.seed)  # seed() is a method on gym envs; assigning to it would leave the env unseeded

        self.continuous_actions = hasattr(self.env.action_space, "shape")  # Box-style (continuous) spaces expose .shape

        # tensorflow variables (same as in model.py)
        observation_size = list(self.env.observation_space.shape)
        if self.stacked_frames > 0:
            observation_size += [self.stacked_frames]
        hidden_size = 64
        self.action_size = np.prod(
            self.env.action_space.shape
        ) if self.continuous_actions else self.env.action_space.n

        # tensorflow model of the policy
        self.obs = tf.placeholder(tf.float32, [None] + observation_size)

        self.policy_vars, self.avg_action_dist, self.logstd_action_dist = make_network(
            "policy-a", self.obs, hidden_size, self.action_size)

        config = tf.ConfigProto(device_count={'GPU': 0})
        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        while True:
            next_task = self.task_q.get(block=True)
            if next_task == "do_rollout":
                # the task is an actor request to collect experience
                path = self.rollout()
                self.task_q.task_done()
                self.result_q.put(path)
            elif next_task == "kill":
                print(
                    "Received kill message for rollout process. Shutting down..."
                )
                self.task_q.task_done()
                break
            else:
                # the task is to set parameters of the actor policy
                self.set_policy(next_task)

                # super hacky method to make sure when we fill the queue with set parameter tasks,
                # an actor doesn't finish updating before the other actors can accept their own tasks.
                sleep(0.1)
                self.task_q.task_done()
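
The worker above communicates with its coordinator only through a task queue and a result queue. The self-contained toy below mimics just that protocol (DummyWorker and the placeholder path dict are stand-ins introduced here, not names from the example), to show how a coordinator broadcasts parameters, requests rollouts, and shuts the workers down:

# Toy illustration of the queue protocol used by the rollout workers above.
# DummyWorker is a stand-in: it mimics only the command handling, without the
# TensorFlow policy or the environment.
from multiprocessing import JoinableQueue, Process, Queue
from time import sleep

class DummyWorker(Process):
    def __init__(self, task_q, result_q):
        super().__init__()
        self.task_q, self.result_q = task_q, result_q

    def run(self):
        while True:
            task = self.task_q.get(block=True)
            if task == "do_rollout":
                self.result_q.put({"rewards": [0.0]})  # placeholder path
                self.task_q.task_done()
            elif task == "kill":
                self.task_q.task_done()
                break
            else:  # anything else is treated as a new set of policy parameters
                sleep(0.1)
                self.task_q.task_done()

if __name__ == "__main__":
    task_q, result_q = JoinableQueue(), Queue()
    workers = [DummyWorker(task_q, result_q) for _ in range(2)]
    for w in workers:
        w.start()
    for _ in workers:          # broadcast "parameters"
        task_q.put([0.0, 1.0])
    task_q.join()
    for _ in workers:          # request one rollout per worker
        task_q.put("do_rollout")
    task_q.join()
    print([result_q.get() for _ in workers])
    for _ in workers:          # shut down
        task_q.put("kill")
    task_q.join()

The sleep(0.1) in the parameter branch reproduces the workaround described in the original comment: without it, a fast worker could pull several parameter tasks off the shared queue before the other workers get theirs.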
Example 2
    def run(self):

        # tensorflow variables (same as in model.py)
        observation_size = 37
        hidden_size = 200
        action_size = 2
        self.env = AgentTorcs2(self.env_id,
                               bots=['scr_server'],
                               track='road/g-track-1',
                               text_mode=False,
                               laps=3,
                               torcsIdxOffset=100,
                               screen_capture=True)
        # agent = AgentTorcs2(aidx, bots=['scr_server', 'olethros', 'berniw', 'bt', 'damned'], track='road/g-track-1', text_mode=True)
        self.env.reset()

        # tensorflow model of the policy
        self.obs = tf.placeholder(tf.float32, [None, observation_size])

        self.policy_vars, self.avg_action_dist, self.logstd_action_dist = make_network(
            "policy-a", self.obs, hidden_size, action_size)

        config = tf.ConfigProto(device_count={'GPU': 0})
        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        while True:
            next_task = self.task_q.get(block=True)
            if next_task == "do_rollout":
                # the task is an actor request to collect experience
                path = self.rollout()
                self.task_q.task_done()
                self.result_q.put(path)
            elif next_task == "kill":
                print("Received kill message. Shutting down...")
                self.task_q.task_done()
                break
            else:
                # the task is to set parameters of the actor policy
                self.set_policy(next_task)

                # super hacky method to make sure when we fill the queue with set parameter tasks,
                # an actor doesn't finish updating before the other actors can accept their own tasks.
                sleep(0.1)
                self.task_q.task_done()
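
Both workers call a set_policy method that is not shown in these examples. One plausible implementation, assuming the coordinator sends one numpy array per variable in self.policy_vars (the actual message format may differ), simply loads each value into the corresponding TensorFlow variable:

    # Plausible sketch of set_policy; the per-variable list format is an assumption.
    def set_policy(self, weights):
        for var, value in zip(self.policy_vars, weights):
            # tf.Variable.load assigns a concrete numpy value within this session
            var.load(value, self.session)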
Example 3
    def __init__(self, env_id, make_env, max_kl, discount_factor, cg_damping):
        self.max_kl = max_kl
        self.discount_factor = discount_factor
        self.cg_damping = cg_damping

        env = make_env(env_id)
        observation_size = env.observation_space.shape[0]
        hidden_size = 64
        action_size = np.prod(env.action_space.shape)

        self.obs = tf.placeholder(tf.float32, [None, observation_size])
        self.action = tf.placeholder(tf.float32, [None, action_size])
        self.advantage = tf.placeholder(tf.float32, [None])
        self.old_avg_action_dist = tf.placeholder(tf.float32,
                                                  [None, action_size])
        self.old_logstd_action_dist = tf.placeholder(tf.float32,
                                                     [None, action_size])

        self.policy_vars, self.avg_action_dist, self.logstd_action_dist = utils.make_network(
            "policy", self.obs, hidden_size, action_size)

        batch_size = tf.shape(self.obs)[0]
        # what are the probabilities of taking self.action, given new and old distributions
        log_p_n = utils.gauss_log_prob(self.avg_action_dist,
                                       self.logstd_action_dist, self.action)
        log_oldp_n = utils.gauss_log_prob(self.old_avg_action_dist,
                                          self.old_logstd_action_dist,
                                          self.action)

        # tf.exp(log_p_n) / tf.exp(log_oldp_n)
        ratio = tf.exp(log_p_n - log_oldp_n)

        # importance sampling of surrogate loss (L in paper)
        surr = -tf.reduce_mean(ratio * self.advantage)

        batch_size_float = tf.cast(batch_size, tf.float32)
        # kl divergence and shannon entropy
        kl = utils.gauss_KL(self.old_avg_action_dist,
                            self.old_logstd_action_dist, self.avg_action_dist,
                            self.logstd_action_dist) / batch_size_float
        ent = utils.gauss_ent(self.avg_action_dist,
                              self.logstd_action_dist) / batch_size_float

        self.losses = [surr, kl, ent]
        self.policy_gradient = utils.flatgrad(surr, self.policy_vars)

        # KL divergence w/ itself, with first argument kept constant.
        kl_firstfixed = utils.gauss_selfKL_firstfixed(
            self.avg_action_dist, self.logstd_action_dist) / batch_size_float
        # gradient of KL w/ itself
        grads = tf.gradients(kl_firstfixed, self.policy_vars)
        # what vector we're multiplying by
        self.flat_tangent = tf.placeholder(tf.float32, [None])
        start = 0
        tangents = []
        for var in self.policy_vars:
            size = tf.size(var)
            param = tf.reshape(self.flat_tangent[start:(start + size)],
                               var.shape)
            tangents.append(param)
            start += size

        # gradient of KL w/ itself * tangent
        gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
        # 2nd gradient of KL w/ itself * tangent
        self.fvp = utils.flatgrad(gvp, self.policy_vars)
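        # Note: grads -> gvp -> flatgrad together compute a Fisher-vector product
        # without ever forming the matrix: differentiating the scalar sum(grad_KL * v)
        # with respect to the parameters gives H*v, and the Hessian of
        # KL(old || new) evaluated at old == new is the Fisher information matrix.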

        config = tf.ConfigProto(device_count={'GPU': 0})
        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())
        # value function
        # self.vf = VF(self.session)
        self.vf = LinearVF()

        # Operations
        # _get_flat_params
        self._get_flat_params = tf.concat(
            axis=0,
            values=[tf.reshape(v, [tf.size(v)]) for v in self.policy_vars])

        # _set_from_flat
        total_size = sum(
            np.prod(v.get_shape().as_list()) for v in self.policy_vars)
        self._theta_op = tf.placeholder(tf.float32, [total_size])
        start = 0
        assigns = []
        for var in self.policy_vars:
            size = tf.size(var)
            assigns.append(
                tf.assign(
                    var,
                    tf.reshape(self._theta_op[start:start + size], var.shape)))
            start += size
        self._set_from_flat = tf.group(*assigns)

        # get_policy
        self.get_policy = [
            var for var in self.policy_vars if 'policy' in var.name
        ]
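
The operations built above (policy_gradient, fvp, _get_flat_params, _set_from_flat) are the ingredients of the standard TRPO update. The condensed sketch below shows how they fit together; conjugate_gradient and linesearch are assumed helpers, and the class's actual update method is not part of this example:

    # Condensed sketch of a TRPO step using the ops defined in __init__ above.
    # `feed` is assumed to hold self.obs, self.action, self.advantage and the
    # old action-distribution placeholders for one batch of rollouts.
    def trpo_step(self, feed):
        def fisher_vector_product(p):
            feed[self.flat_tangent] = p
            # damping keeps the (approximate) Fisher matrix well conditioned
            return self.session.run(self.fvp, feed) + self.cg_damping * p

        g = self.session.run(self.policy_gradient, feed)
        step_dir = conjugate_gradient(fisher_vector_product, -g)   # solve F s = -g
        shs = 0.5 * step_dir.dot(fisher_vector_product(step_dir))  # 0.5 * s^T F s
        full_step = step_dir * np.sqrt(self.max_kl / shs)          # scale to the KL budget

        theta_old = self.session.run(self._get_flat_params)

        def surrogate_loss(theta):
            self.session.run(self._set_from_flat, {self._theta_op: theta})
            return self.session.run(self.losses[0], feed)

        # backtracking line search on the surrogate along full_step
        theta_new = linesearch(surrogate_loss, theta_old, full_step)
        self.session.run(self._set_from_flat, {self._theta_op: theta_new})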
Example 4
    def __init__(self, env_id, make_env, stacked_frames, max_kl,
                 discount_factor, cg_damping):
        self.max_kl = max_kl
        self.discount_factor = discount_factor
        self.cg_damping = cg_damping

        env = make_env(env_id)
        continuous_actions = hasattr(env.action_space, "shape")

        observation_size = list(env.observation_space.shape)
        if stacked_frames > 0:
            observation_size += [stacked_frames]
        hidden_size = 64
        action_size = np.prod(env.action_space.shape
                              ) if continuous_actions else env.action_space.n

        self.obs = tf.placeholder(tf.float32, [None] + observation_size)
        self.action = tf.placeholder(tf.float32, [None, action_size])
        self.advantage = tf.placeholder(tf.float32, [None])
        self.old_avg_action_dist = tf.placeholder(tf.float32,
                                                  [None, action_size])
        self.old_logstd_action_dist = tf.placeholder(tf.float32,
                                                     [None, action_size])

        self.policy_vars, self.avg_action_dist, self.logstd_action_dist = utils.make_network(
            "policy", self.obs, hidden_size, action_size)

        batch_size = tf.shape(self.obs)[0]
        # what are the probabilities of taking self.action, given new and old distributions
        log_p_n = utils.gauss_log_prob(self.avg_action_dist,
                                       self.logstd_action_dist, self.action)
        log_oldp_n = utils.gauss_log_prob(self.old_avg_action_dist,
                                          self.old_logstd_action_dist,
                                          self.action)

        # tf.exp(log_p_n) / tf.exp(log_oldp_n)
        ratio = tf.exp(log_p_n - log_oldp_n)

        # importance sampling of surrogate loss (L in paper)
        surr = -tf.reduce_mean(ratio * self.advantage)

        batch_size_float = tf.cast(batch_size, tf.float32)
        # kl divergence and shannon entropy
        kl = utils.gauss_KL(self.old_avg_action_dist,
                            self.old_logstd_action_dist, self.avg_action_dist,
                            self.logstd_action_dist) / batch_size_float
        ent = utils.gauss_ent(self.avg_action_dist,
                              self.logstd_action_dist) / batch_size_float

        self.losses = [surr, kl, ent]
        self.policy_gradient = utils.flatgrad(surr, self.policy_vars)

        # KL divergence w/ itself, with first argument kept constant.
        kl_firstfixed = utils.gauss_selfKL_firstfixed(
            self.avg_action_dist, self.logstd_action_dist) / batch_size_float
        # gradient of KL w/ itself
        grads = tf.gradients(kl_firstfixed, self.policy_vars)
        # what vector we're multiplying by
        self.flat_tangent = tf.placeholder(tf.float32, [None])
        start = 0
        tangents = []
        for var in self.policy_vars:
            size = tf.size(var)
            param = tf.reshape(self.flat_tangent[start:(start + size)],
                               var.shape)
            tangents.append(param)
            start += size

        # gradient of KL w/ itself * tangent
        gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
        # 2nd gradient of KL w/ itself * tangent
        self.fvp = utils.flatgrad(gvp, self.policy_vars)

        config = tf.ConfigProto(device_count={'GPU': 0})
        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())
        # value function
        # self.vf = VF(self.session)
        self.vf = LinearVF()
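
All four examples rely on a small utils module (make_network, gauss_log_prob, gauss_KL, gauss_ent, flatgrad) that is not shown here. For reference, two of those helpers are conventionally defined as below in TensorFlow 1.x TRPO implementations; this is a sketch of the standard formulas, not necessarily this repository's exact code:

import numpy as np
import tensorflow as tf

def gauss_log_prob(mu, logstd, x):
    # log-density of a diagonal Gaussian, summed over the action dimensions
    var = tf.exp(2 * logstd)
    log_p = -tf.square(x - mu) / (2 * var) - 0.5 * np.log(2 * np.pi) - logstd
    return tf.reduce_sum(log_p, axis=1)

def flatgrad(loss, var_list):
    # gradient of `loss` w.r.t. var_list, flattened into a single 1-D tensor
    grads = tf.gradients(loss, var_list)
    return tf.concat(axis=0,
                     values=[tf.reshape(g, [tf.size(v)])
                             for g, v in zip(grads, var_list)])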