Example #1
    def create_surr(self):
        p_n = utils.slice_2d(self.policy.pi_theta, tf.range(0, self.N),
                             self.action)
        p_n_old = utils.slice_2d(self.policy.pi_theta_old, tf.range(0, self.N),
                                 self.action)

        # Surrogate Loss
        self.surr_loss = -tf.reduce_mean(p_n / p_n_old * self.advantage)
        self.surr_loss_grad = utils.flatgrad(self.surr_loss, self.var_list)
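Examples #1 and #3 call utils.flatgrad, which is not shown on this page. A minimal graph-mode sketch of such a helper, assuming it simply concatenates the reshaped gradients of a scalar loss (the exact signature of the original utils.flatgrad is an assumption):

# Hypothetical graph-mode flatgrad sketch (not the original utils.flatgrad):
# flatten d(loss)/d(var_list) into a single 1-D tensor.
import tensorflow as tf


def flatgrad(loss, var_list):
    grads = tf.gradients(loss, var_list)
    return tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)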
Example #2
        def hessian_vector_product(p):
            def hvp_fn():
                kl_grad_vector = flatgrad(kl_fn,
                                          self.model.trainable_variables)
                grad_vector_product = tf.reduce_sum(kl_grad_vector * p)
                return grad_vector_product

            fisher_vector_product = flatgrad(
                hvp_fn, self.model.trainable_variables).numpy()
            return fisher_vector_product + (self.cg_damping * p)
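Examples #2, #4, and #5 pass a zero-argument loss function to flatgrad instead of a tensor, which suggests an eager-mode variant. A minimal sketch under that assumption (the real helper's behaviour may differ):

# Hypothetical eager-mode flatgrad sketch: differentiate a zero-argument
# loss function under a GradientTape and return the flattened gradients.
import tensorflow as tf


def flatgrad(loss_fn, var_list):
    with tf.GradientTape() as tape:
        loss = loss_fn()
    grads = tape.gradient(
        loss, var_list, unconnected_gradients=tf.UnconnectedGradients.ZERO)
    return tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)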
Example #3
    def create_functions(self):

        eps = config.EPS
        self.var_list = tf.trainable_variables()

        #print("Before Surr Ok !")
        self.create_surr()

        #self.KL = (tf.reduce_sum(self.policy.pi_theta_old *
        #            tf.log((self.policy.pi_theta_old + eps) /
        #            (self.policy.pi_theta + eps))) / self.Nf)

        self.KL = (tf.reduce_sum(self.policy.pi_theta * tf.log(
            (self.policy.pi_theta + eps) /
            (self.policy.pi_theta_old + eps)))) / self.Nf

        self.entropy = (tf.reduce_sum(
            -self.policy.pi_theta * tf.log(self.policy.pi_theta + eps)) /
                        self.Nf)
        """
        self.KL_firstfixed = tf.reduce_sum(tf.stop_gradient(self.policy.pi_theta)*
                tf.log(tf.stop_gradient(self.policy.pi_theta + eps) /
                (self.policy.pi_theta + eps))) / self.Nf
            """

        self.KL_firstfixed = tf.reduce_sum(self.policy.pi_theta * tf.log(
            (self.policy.pi_theta + eps) /
            (tf.stop_gradient(self.policy.pi_theta + eps)))) / self.Nf

        self.KL_firstfixed_grad = tf.gradients(self.KL_firstfixed,
                                               self.var_list)

        shapes = map(utils.var_shape, self.var_list)

        start = 0
        self.tangents = []

        for shape in shapes:
            size = np.prod(shape)
            param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
            self.tangents.append(param)
            start += size

        self.fisher_vect_prod = (utils.flatgrad([
            tf.reduce_sum(g * t)
            for (g, t) in zip(self.KL_firstfixed_grad, self.tangents)
        ], self.var_list))

        self.current_theta = utils.GetFlat(self.session, self.var_list)

        self.set_theta = utils.SetFromFlat(self.session, self.var_list)

        self.value_func = utils.ValueFunction(self.session)
        self.stats = []
        self.saver = tf.train.Saver()
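Example #3 also relies on utils.GetFlat and utils.SetFromFlat to read and write all trainable parameters as one flat vector. A hypothetical sketch of such helpers for TF1 graph mode (the originals are not shown here, so the details below are assumptions):

# Hypothetical sketches of parameter-flattening helpers: read all variables
# as one flat numpy vector, and write such a vector back via assign ops.
import numpy as np
import tensorflow as tf


class GetFlat:
    def __init__(self, session, var_list):
        self.session = session
        self.op = tf.concat([tf.reshape(v, [-1]) for v in var_list], axis=0)

    def __call__(self):
        return self.session.run(self.op)


class SetFromFlat:
    def __init__(self, session, var_list):
        self.session = session
        shapes = [v.get_shape().as_list() for v in var_list]
        total = int(np.sum([np.prod(s) for s in shapes]))
        self.theta = tf.placeholder(tf.float32, [total])
        start, assigns = 0, []
        for shape, v in zip(shapes, var_list):
            size = int(np.prod(shape))
            assigns.append(
                tf.assign(v, tf.reshape(self.theta[start:start + size],
                                        shape)))
            start += size
        self.op = tf.group(*assigns)

    def __call__(self, theta):
        self.session.run(self.op, feed_dict={self.theta: theta})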
Example #4
 def hvp_fn():
     kl_grad_vector = flatgrad(kl_fn,
                               self.model.trainable_variables)
     grad_vector_product = tf.reduce_sum(kl_grad_vector * p)
     return grad_vector_product
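The same Hessian-vector product can also be written directly with nested GradientTapes (the Pearlmutter trick: H·p is the gradient of (∇KL)·p). A minimal self-contained sketch, not taken from the original code:

# Minimal nested-tape sketch of a Fisher/Hessian-vector product:
# hvp = d/dtheta [ (d KL / d theta) . p ]
import tensorflow as tf


def hessian_vector_product(kl_fn, var_list, p):
    with tf.GradientTape() as outer_tape:
        with tf.GradientTape() as inner_tape:
            kl = kl_fn()
        kl_grads = inner_tape.gradient(
            kl, var_list, unconnected_gradients=tf.UnconnectedGradients.ZERO)
        flat_kl_grad = tf.concat(
            [tf.reshape(g, [-1]) for g in kl_grads], axis=0)
        gvp = tf.reduce_sum(flat_kl_grad * p)
    hvp = outer_tape.gradient(
        gvp, var_list, unconnected_gradients=tf.UnconnectedGradients.ZERO)
    return tf.concat([tf.reshape(h, [-1]) for h in hvp], axis=0)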
Example #5
    def train_step(self, episode, obs_all, Gs_all, actions_all,
                   action_probs_all, total_reward, best_reward, entropy, t0):
        def surrogate_loss(theta=None):
            if theta is None:
                model = self.model
            else:
                model = self.tmp_model
                assign_vars(self.tmp_model, theta)
            logits = model(obs)
            action_prob = tf.nn.softmax(logits)
            action_prob = tf.reduce_sum(actions_one_hot * action_prob, axis=1)
            old_logits = self.model(obs)
            old_action_prob = tf.nn.softmax(old_logits)
            old_action_prob = tf.reduce_sum(actions_one_hot * old_action_prob,
                                            axis=1).numpy() + 1e-8
            prob_ratio = action_prob / old_action_prob  # pi(a|s) / pi_old(a|s)
            loss = tf.reduce_mean(
                prob_ratio * advantage) + self.ent_coeff * entropy
            return loss

        def kl_fn(theta=None):
            if theta is None:
                model = self.model
            else:
                model = self.tmp_model
                assign_vars(self.tmp_model, theta)
            logits = model(obs)
            action_prob = tf.nn.softmax(logits).numpy() + 1e-8
            old_logits = self.model(obs)
            old_action_prob = tf.nn.softmax(old_logits)
            return tf.reduce_mean(
                tf.reduce_sum(old_action_prob *
                              tf.math.log(old_action_prob / action_prob),
                              axis=1))

        def hessian_vector_product(p):
            def hvp_fn():
                kl_grad_vector = flatgrad(kl_fn,
                                          self.model.trainable_variables)
                grad_vector_product = tf.reduce_sum(kl_grad_vector * p)
                return grad_vector_product

            fisher_vector_product = flatgrad(
                hvp_fn, self.model.trainable_variables).numpy()
            return fisher_vector_product + (self.cg_damping * p)

        def conjugate_grad(Ax, b):
            """
			Conjugate gradient algorithm
			(see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
			"""
            x = np.zeros_like(b)
            r = b.copy(
            )  # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
            p = r.copy()
            old_p = p.copy()
            r_dot_old = np.dot(r, r)
            for _ in range(self.cg_iters):
                z = Ax(p)
                alpha = r_dot_old / (np.dot(p, z) + 1e-8)
                old_x = x
                x += alpha * p
                r -= alpha * z
                r_dot_new = np.dot(r, r)
                beta = r_dot_new / (r_dot_old + 1e-8)
                r_dot_old = r_dot_new
                if r_dot_old < self.residual_tol:
                    break
                old_p = p.copy()
                p = r + beta * p
                if np.isnan(x).any():
                    print("x is nan")
                    print("z", np.isnan(z))
                    print("old_x", np.isnan(old_x))
                    print("kl_fn", np.isnan(kl_fn()))
            return x

        def linesearch(x, fullstep):
            fval = surrogate_loss(x)
            for (_n_backtracks, stepfrac) in enumerate(
                    self.backtrack_coeff**np.arange(self.backtrack_iters)):
                xnew = x + stepfrac * fullstep
                newfval = surrogate_loss(xnew)
                kl_div = kl_fn(xnew)
                if np.isnan(kl_div):
                    print("kl is nan")
                    print("xnew", np.isnan(xnew))
                    print("x", np.isnan(x))
                    print("stepfrac", np.isnan(stepfrac))
                    print("fullstep", np.isnan(fullstep))
                if kl_div <= self.delta and newfval >= 0:
                    print("Linesearch worked at ", _n_backtracks)
                    return xnew
                if _n_backtracks == self.backtrack_iters - 1:
                    print("Linesearch failed.", kl_div, newfval)
            return x

        NBATCHES = len(obs_all) // self.BATCH_SIZE
        if len(obs_all) < self.BATCH_SIZE:
            NBATCHES += 1
        for batch_id in range(NBATCHES):
            obs = obs_all[batch_id * self.BATCH_SIZE:(batch_id + 1) *
                          self.BATCH_SIZE]
            Gs = Gs_all[batch_id * self.BATCH_SIZE:(batch_id + 1) *
                        self.BATCH_SIZE]
            actions = actions_all[batch_id * self.BATCH_SIZE:(batch_id + 1) *
                                  self.BATCH_SIZE]
            action_probs = action_probs_all[batch_id *
                                            self.BATCH_SIZE:(batch_id + 1) *
                                            self.BATCH_SIZE]

            Vs = self.value_model(obs).numpy().flatten()
            # advantage = Gs
            advantage = Gs - Vs
            advantage = (advantage - advantage.mean()) / (advantage.std() +
                                                          1e-8)
            actions_one_hot = tf.one_hot(actions,
                                         self.envs[0].action_space.n,
                                         dtype="float64")
            policy_loss = surrogate_loss()
            policy_gradient = flatgrad(surrogate_loss,
                                       self.model.trainable_variables).numpy()

            step_direction = conjugate_grad(hessian_vector_product,
                                            policy_gradient)

            shs = .5 * step_direction.dot(
                hessian_vector_product(step_direction).T)

            lm = np.sqrt(shs / self.delta) + 1e-8
            fullstep = step_direction / lm
            if np.isnan(fullstep).any():
                print("fullstep is nan")
                print("lm", lm)
                print("step_direction", step_direction)
                print("policy_gradient", policy_gradient)

            oldtheta = flatvars(self.model).numpy()

            theta = linesearch(oldtheta, fullstep)

            if np.isnan(theta).any():
                print("NaN detected. Skipping update...")
            else:
                assign_vars(self.model, theta)

            kl = kl_fn(oldtheta)

            history = self.value_model.fit(obs, Gs, epochs=5, verbose=0)
            value_loss = history.history["loss"][-1]

            print(
                f"Ep {episode}.{batch_id}: Rw_mean {total_reward} - Rw_best {best_reward} - PL {policy_loss} - VL {value_loss} - KL {kl} - epsilon {self.epsilon} - time {time.time() - t0}"
            )
        if self.value_model:
            writer = self.writer
            with writer.as_default():
                tf.summary.scalar("reward", total_reward, step=episode)
                tf.summary.scalar("best_reward", best_reward, step=episode)
                tf.summary.scalar("value_loss", value_loss, step=episode)
                tf.summary.scalar("policy_loss", policy_loss, step=episode)
        self.epsilon = self.epsilon_decay(self.epsilon)
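Example #5 also assumes flatvars and assign_vars helpers that serialise the model's trainable weights into a flat vector and back. A hypothetical sketch of what they might look like (the originals are not shown on this page):

# Hypothetical sketches of the weight helpers used by train_step.
import numpy as np
import tensorflow as tf


def flatvars(model):
    # Concatenate every trainable variable into one 1-D tensor.
    return tf.concat(
        [tf.reshape(v, [-1]) for v in model.trainable_variables], axis=0)


def assign_vars(model, theta):
    # Write a flat parameter vector back into the model, slice by slice.
    start = 0
    for v in model.trainable_variables:
        size = int(np.prod(v.shape.as_list()))
        v.assign(tf.cast(tf.reshape(theta[start:start + size], v.shape),
                         v.dtype))
        start += size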
Example #6
    def make_model(self):
        self.observation_size = self.observation_space.shape[0]
        self.action_size = np.prod(self.action_space.shape)
        self.hidden_size = 64

        weight_init = tf.random_uniform_initializer(-0.05, 0.05)
        bias_init = tf.constant_initializer(0)

        config = tf.ConfigProto(device_count={'GPU': 0})
        self.session = tf.Session(config=config)

        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.action = tf.placeholder(tf.float32, [None, self.action_size])
        self.advantage = tf.placeholder(tf.float32, [None])
        self.oldaction_dist_mu = tf.placeholder(tf.float32,
                                                [None, self.action_size])
        self.oldaction_dist_logstd = tf.placeholder(tf.float32,
                                                    [None, self.action_size])

        with tf.variable_scope("policy"):
            h1 = utils.fully_connected(self.obs, self.observation_size,
                                       self.hidden_size, weight_init,
                                       bias_init, "policy_h1")
            h1 = tf.nn.relu(h1)
            h2 = utils.fully_connected(h1, self.hidden_size, self.hidden_size,
                                       weight_init, bias_init, "policy_h2")
            h2 = tf.nn.relu(h2)
            h3 = utils.fully_connected(h2, self.hidden_size, self.action_size,
                                       weight_init, bias_init, "policy_h3")
            action_dist_logstd_param = tf.Variable(
                (.01 * np.random.randn(1, self.action_size)).astype(
                    np.float32),
                name="policy_logstd")
        # means for each action
        self.action_dist_mu = h3
        # log standard deviations for each actions
        self.action_dist_logstd = tf.tile(
            action_dist_logstd_param,
            tf.stack((tf.shape(self.action_dist_mu)[0], 1)))

        batch_size = tf.shape(self.obs)[0]
        # what are the probabilities of taking self.action, given new and old distributions
        log_p_n = utils.gauss_log_prob(self.action_dist_mu,
                                       self.action_dist_logstd, self.action)
        log_oldp_n = utils.gauss_log_prob(self.oldaction_dist_mu,
                                          self.oldaction_dist_logstd,
                                          self.action)

        # tf.exp(log_p_n) / tf.exp(log_oldp_n)
        ratio = tf.exp(log_p_n - log_oldp_n)

        # importance sampling of surrogate loss (L in paper)
        surr = -tf.reduce_mean(ratio * self.advantage)
        var_list = tf.trainable_variables()

        batch_size_float = tf.cast(batch_size, tf.float32)
        # kl divergence and shannon entropy
        kl = utils.gauss_KL(self.oldaction_dist_mu, self.oldaction_dist_logstd,
                            self.action_dist_mu,
                            self.action_dist_logstd) / batch_size_float
        ent = utils.gauss_ent(self.action_dist_mu,
                              self.action_dist_logstd) / batch_size_float

        self.losses = [surr, kl, ent]
        # policy gradient
        self.pg = utils.flatgrad(surr, var_list)

        # KL divergence w/ itself, with first argument kept constant.
        kl_firstfixed = utils.gauss_selfKL_firstfixed(
            self.action_dist_mu, self.action_dist_logstd) / batch_size_float
        # gradient of KL w/ itself
        grads = tf.gradients(kl_firstfixed, var_list)
        # what vector we're multiplying by
        self.flat_tangent = tf.placeholder(tf.float32, [None])
        shapes = map(utils.var_shape, var_list)
        start = 0
        tangents = []
        for shape in shapes:
            size = np.prod(shape)
            param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
            tangents.append(param)
            start += size

        # gradient of KL w/ itself * tangent
        gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
        # 2nd gradient of KL w/ itself * tangent
        self.fvp = utils.flatgrad(gvp, var_list)
        # the actual parameter values
        self.gf = utils.GetFlat(self.session, var_list)
        # call this to set parameter values
        self.sff = utils.SetFromFlat(self.session, var_list)
        self.session.run(tf.global_variables_initializer())
        # value function
        # self.vf = VF(self.session)
        self.vf = LinearVF()

        self.get_policy = utils.GetPolicyWeights(self.session, var_list)
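Example #6 uses utils.gauss_log_prob for the log-likelihood of actions under a diagonal Gaussian policy. A sketch of the standard formula it presumably implements (the original helper is not shown):

# Hypothetical sketch of a diagonal-Gaussian log-likelihood:
# log N(x | mu, exp(logstd)^2), summed over action dimensions per sample.
import numpy as np
import tensorflow as tf


def gauss_log_prob(mu, logstd, x):
    var = tf.exp(2.0 * logstd)
    log_prob = (-tf.square(x - mu) / (2.0 * var)
                - 0.5 * tf.log(2.0 * np.pi)
                - logstd)
    return tf.reduce_sum(log_prob, axis=1)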