Example #1
def cpd_newton(size, rank):
    dim = 3

    for datatype in BACKEND_TYPES:
        T.set_backend(datatype)

        A_list, input_tensor, loss, residual = cpd_graph(dim, size, rank)
        A, B, C = A_list
        v_A = ad.Variable(name="v_A", shape=[size, rank])
        v_B = ad.Variable(name="v_B", shape=[size, rank])
        v_C = ad.Variable(name="v_C", shape=[size, rank])
        grads = ad.gradients(loss, [A, B, C])
        Hvps = ad.hvp(output_node=loss,
                      node_list=[A, B, C],
                      vector_list=[v_A, v_B, v_C])

        executor_grads = ad.Executor([loss] + grads)
        executor_Hvps = ad.Executor(Hvps)

        A_list, input_tensor_val = init_rand_cp(dim, size, rank)
        A_val, B_val, C_val = A_list

        for i in range(100):

            def hess_fn(v):
                return executor_Hvps.run(
                    feed_dict={
                        A: A_val,
                        B: B_val,
                        C: C_val,
                        input_tensor: input_tensor_val,
                        v_A: v[0],
                        v_B: v[1],
                        v_C: v[2]
                    })

            loss_val, grad_A_val, grad_B_val, grad_C_val = executor_grads.run(
                feed_dict={
                    A: A_val,
                    B: B_val,
                    C: C_val,
                    input_tensor: input_tensor_val
                })

            delta = conjugate_gradient(
                hess_fn=hess_fn,
                grads=[grad_A_val, grad_B_val, grad_C_val],
                error_tol=1e-9,
                max_iters=250)

            A_val -= delta[0]
            B_val -= delta[1]
            C_val -= delta[2]
            print(f'At iteration {i} the loss is: {loss_val}')
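The conjugate_gradient call above is expected to return the Newton step delta solving H·delta = g block-wise, with hess_fn supplying Hessian-vector products over [A, B, C]. A minimal sketch of a list-based CG solver matching that calling convention (an assumption for illustration; the project's own utility may differ) looks like this:

import numpy as np

def conjugate_gradient(hess_fn, grads, error_tol=1e-9, max_iters=250):
    # Solve H x = g where x and g are lists of arrays and hess_fn(list)
    # returns H applied to that list. Sketch only, not the project's code.
    x = [np.zeros_like(g) for g in grads]
    r = [g.copy() for g in grads]            # residual r = g - H x (x = 0)
    p = [g.copy() for g in grads]            # search direction
    rs_old = sum(np.vdot(ri, ri) for ri in r)
    for _ in range(max_iters):
        Hp = hess_fn(p)
        alpha = rs_old / sum(np.vdot(pi, hpi) for pi, hpi in zip(p, Hp))
        x = [xi + alpha * pi for xi, pi in zip(x, p)]
        r = [ri - alpha * hpi for ri, hpi in zip(r, Hp)]
        rs_new = sum(np.vdot(ri, ri) for ri in r)
        if np.sqrt(rs_new) < error_tol:
            break
        p = [ri + (rs_new / rs_old) * pi for ri, pi in zip(r, p)]
        rs_old = rs_new
    return x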
Example #2
File: TRPO.py Project: elumixor/DRL
def update_agent(rollouts: List[Rollout]) -> None:
    states = torch.cat([r.states for r in rollouts], dim=0)
    actions = torch.cat([r.actions for r in rollouts], dim=0).flatten()

    advantages = [estimate_advantages(critic, states, next_states[-1], rewards)
                  for states, _, rewards, next_states in rollouts]
    advantages = normalize(torch.cat(advantages, dim=0).flatten())

    update_critic(advantages)

    distribution = actor(states)
    distribution = torch.distributions.utils.clamp_probs(distribution)
    probabilities = distribution[range(distribution.shape[0]), actions]

    # Now we have all the data we need for the algorithm

    # We will calculate the gradient w.r.t. the new probabilities (the surrogate function),
    # so the second set of probabilities is treated as a constant via detach()
    L = surrogate_loss(probabilities, probabilities.detach(), advantages)
    KL = kl_div(distribution, distribution)

    parameters = list(actor.parameters())

    g = flat_grad(L, actor.parameters(), retain_graph=True)
    d_kl = flat_grad(KL, parameters, create_graph=True)  # Create graph, because we will call backward() on it (for HVP)

    def HVP(v):
        return flat_grad(d_kl @ v, parameters, retain_graph=True)

    search_dir = conjugate_gradient(HVP, g)
    max_length = torch.sqrt(2 * delta / (search_dir @ HVP(search_dir)))
    max_step = max_length * search_dir

    def criterion(step):
        apply_update(step)

        with torch.no_grad():
            distribution_new = actor(states)
            distribution_new = torch.distributions.utils.clamp_probs(distribution_new)
            probabilities_new = distribution_new[range(distribution_new.shape[0]), actions]

            L_new = surrogate_loss(probabilities_new, probabilities, advantages)
            KL_new = kl_div(distribution, distribution_new)

        L_improvement = L_new - L

        if L_improvement > 0 and KL_new <= delta:
            return True

        apply_update(-step)
        return False

    line_search(max_step, criterion, max_iterations=10)
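flat_grad and the HVP closure above rely on double backpropagation: the KL gradient is built with create_graph=True so that its dot product with v can be differentiated again. A sketch of a typical flat_grad helper consistent with how it is called here (the project's own definition may differ):

import torch

def flat_grad(y, parameters, retain_graph=False, create_graph=False):
    # Gradient of the scalar y w.r.t. parameters, flattened into one vector.
    if create_graph:
        retain_graph = True
    grads = torch.autograd.grad(y, parameters,
                                retain_graph=retain_graph,
                                create_graph=create_graph)
    return torch.cat([g.reshape(-1) for g in grads])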
Example #3
    def apply_step(self, *args):
        loss_g, loss_h = args[:2]

        for x in self.params:
            g = jacobian(loss_g, x)
            h = hessian(loss_h, x)

            with torch.no_grad():
                g = g.reshape((-1, 1))
                h = h.reshape((g.shape[0], g.shape[0]))
                dx = conjugate_gradient(h,
                                        g,
                                        n_iterations=self.n_cg,
                                        tol=self.tol).reshape(x.shape)
                x.add_(dx, alpha=-self.lr)
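Unlike the other examples, conjugate_gradient here receives an explicit flattened Hessian h and gradient g, and x.add_(dx, alpha=-self.lr) applies a damped Newton step x ← x − lr·H⁻¹g. As a quick sanity check of that update (illustration only; jacobian, hessian, and conjugate_gradient above are the project's own wrappers), a quadratic f(x) = ½xᵀAx − bᵀx is minimized in a single step when lr = 1:

import torch

# For f(x) = 0.5 x^T A x - b^T x the minimizer is A^{-1} b, reached in one
# Newton step from the origin when lr = 1.
torch.manual_seed(0)
A = torch.randn(5, 5, dtype=torch.float64)
A = A.T @ A + torch.eye(5, dtype=torch.float64)   # symmetric positive definite
b = torch.randn(5, dtype=torch.float64)
x = torch.zeros(5, dtype=torch.float64, requires_grad=True)

loss = 0.5 * x @ A @ x - b @ x
g = torch.autograd.grad(loss, x, create_graph=True)[0]
H = torch.stack([torch.autograd.grad(g[i], x, retain_graph=True)[0]
                 for i in range(5)])              # dense Hessian, row by row
with torch.no_grad():
    dx = torch.linalg.solve(H, g)                 # stand-in for conjugate_gradient(h, g, ...)
    x -= 1.0 * dx                                 # lr = 1: exact Newton step
print(torch.allclose(x, torch.linalg.solve(A, b)))  # True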
Example #4
def test_HinverseG(backendopt):
    for datatype in backendopt:
        T.set_backend(datatype)

        N = 10
        T.seed(1224)

        A = T.random([N, N])
        A = T.transpose(A) @ A
        A = A + T.identity(N)
        b = T.random([N])

        def hess_fn(x):
            return [T.einsum("ab,b->a", A, x[0])]

        error_tol = 1e-9
        x, = conjugate_gradient(hess_fn, [b], error_tol)
        assert (T.norm(T.abs(T.einsum("ab,b->a", A, x) - b)) <= 1e-4)
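The construction T.transpose(A) @ A + T.identity(N) makes A symmetric positive definite, which is exactly the condition under which CG is guaranteed to converge on A x = b. The assertion can be cross-checked against a direct solve (backend-independent illustration, not part of the test):

import numpy as np

rng = np.random.default_rng(1224)
A = rng.standard_normal((10, 10))
A = A.T @ A + np.eye(10)        # symmetric positive definite by construction
b = rng.standard_normal(10)
x = np.linalg.solve(A, b)       # direct solve in place of conjugate_gradient
assert np.linalg.norm(A @ x - b) <= 1e-4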
Example #5
    def train(self):

        start_time = time.time()

        self.episodes = self.env.generate_episodes(config.NUM_EPISODES, self)

        # Computing returns and estimating advantage function.
        for episode in self.episodes:
            episode["baseline"] = self.value_func.predict(episode)
            episode["returns"] = utils.discount(episode["rewards"],
                                                config.GAMMA)
            episode["advantage"] = episode["returns"] - episode["baseline"]

        # Updating policy.
        actions_dist_n = np.concatenate(
            [episode["actions_dist"] for episode in self.episodes])
        states_n = np.concatenate(
            [episode["states"] for episode in self.episodes])
        actions_n = np.concatenate(
            [episode["actions"] for episode in self.episodes])
        baseline_n = np.concatenate(
            [episode["baseline"] for episode in self.episodes])
        returns_n = np.concatenate(
            [episode["returns"] for episode in self.episodes])

        # Standardize the advantage function to have mean=0 and std=1.
        advantage_n = np.concatenate(
            [episode["advantage"] for episode in self.episodes])
        advantage_n -= advantage_n.mean()
        advantage_n /= (advantage_n.std() + 1e-8)

        # Computing baseline function for next iter.
        print(states_n.shape, actions_n.shape, advantage_n.shape,
              actions_dist_n.shape)
        feed = {
            self.policy.state: states_n,
            self.action: actions_n,
            self.advantage: advantage_n,
            self.policy.pi_theta_old: actions_dist_n
        }

        episoderewards = np.array(
            [episode["rewards"].sum() for episode in self.episodes])

        #print("\n********** Iteration %i ************" % i)

        self.value_func.fit(self.episodes)
        self.theta_old = self.current_theta()

        def fisher_vector_product(p):
            feed[self.flat_tangent] = p
            return self.session.run(self.fisher_vect_prod,
                                    feed) + config.CG_DAMP * p

        self.g = self.session.run(self.surr_loss_grad, feed_dict=feed)

        self.grad_step = utils.conjugate_gradient(fisher_vector_product,
                                                  -self.g)

        self.sAs = .5 * self.grad_step.dot(
            fisher_vector_product(self.grad_step))

        self.beta_inv = np.sqrt(self.sAs / config.MAX_KL)
        self.full_grad_step = self.grad_step / self.beta_inv

        self.negdot_grad_step = -self.g.dot(self.grad_step)

        def loss(th):
            self.set_theta(th)
            return self.session.run(self.surr_loss, feed_dict=feed)

        self.theta = utils.line_search(loss, self.theta_old,
                                       self.full_grad_step,
                                       self.negdot_grad_step / self.beta_inv)
        self.set_theta(self.theta)

        surr_loss_new = -self.session.run(self.surr_loss, feed_dict=feed)
        KL_old_new = self.session.run(self.KL, feed_dict=feed)
        entropy = self.session.run(self.entropy, feed_dict=feed)

        old_new_norm = np.sum((self.theta - self.theta_old)**2)

        if np.abs(KL_old_new) > 2.0 * config.MAX_KL:
            print("Keeping old theta")
            self.set_theta(self.theta_old)

        stats = {}
        stats["L2 of old - new"] = old_new_norm
        stats["Total number of episodes"] = len(self.episodes)
        stats["Average sum of rewards per episode"] = episoderewards.mean()
        stats["Entropy"] = entropy
        exp = utils.explained_variance(np.array(baseline_n),
                                       np.array(returns_n))
        stats["Baseline explained"] = exp
        stats["Time elapsed"] = "%.2f mins" % (
            (time.time() - start_time) / 60.0)
        stats["KL between old and new distribution"] = KL_old_new
        stats["Surrogate loss"] = surr_loss_new
        self.stats.append(stats)
        utils.write_dict(stats)
        save_path = self.saver.save(self.session, "./checkpoints/model.ckpt")
        print('Saved checkpoint to %s' % save_path)
        for k, v in stats.items():
            print(k + ": " + " " * (40 - len(k)) + str(v))
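utils.line_search is called here with the loss function, the old parameters, the scaled step, and the expected improvement rate. A typical TRPO-style backtracking line search matching that calling convention (a sketch; the project's own utility may differ) is:

import numpy as np

def line_search(f, x0, fullstep, expected_improve_rate,
                max_backtracks=10, accept_ratio=0.1):
    # Shrink the step by halves until the actual improvement in f is at
    # least accept_ratio times the linearly predicted improvement.
    fval = f(x0)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x0 + stepfrac * fullstep
        newfval = f(xnew)
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        if expected_improve > 0 and actual_improve / expected_improve > accept_ratio:
            return xnew
    return x0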
Example #6
def test(learner, args, train_envs, test_envs, log_dir):
    learner_test = network(args.num_layers, args.num_hidden, args.num_bandits)
    batch_sampler = sampler(args.batch_size, args.num_bandits)
    max_kl = args.max_kl
    cg_iters = args.cg_iters
    cg_damping = args.cg_damping
    ls_max_steps = args.ls_max_steps
    ls_backtrack_ratio = args.ls_backtrack_ratio
    train_rew = []
    for i in range(args.num_updates):
        #print(i)
        adapt_params = []
        inner_losses = []
        adapt_episodes = []
        rew_rem = []
        for j in range(args.num_tasks_train):
            e = batch_sampler.sample(train_envs[j], learner)
            inner_loss = learner.cal_loss(e.s, e.a, e.r)
            params = learner.update_params(inner_loss, args.inner_lr,
                                           args.first_order)
            a_e = batch_sampler.sample(train_envs[j], learner, params)
            adapt_params.append(params)
            adapt_episodes.append(a_e)
            inner_losses.append(inner_loss)
            mean_rew = torch.mean(a_e.r).data.numpy()
            rew_rem.append(mean_rew)

        print(np.mean(rew_rem))
        train_rew.append(np.mean(rew_rem))
        old_loss, _, old_pis = learner.surrogate_loss(adapt_episodes,
                                                      inner_losses)
        grads = torch.autograd.grad(old_loss,
                                    learner.parameters(),
                                    retain_graph=True)
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = learner.hessian_vector_product(
            adapt_episodes, inner_losses, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(stepdir, hessian_vector_product(stepdir))
        lagrange_multiplier = torch.sqrt(shs / max_kl)

        step = stepdir / lagrange_multiplier

        # Save the old parameters
        old_params = parameters_to_vector(learner.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 learner.parameters())
            loss, kl, _ = learner.surrogate_loss(adapt_episodes,
                                                 inner_losses,
                                                 old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, learner.parameters())

        if (i + 1) % 10 == 0:
            test_input = torch.FloatTensor([[1]])
            test_output = learner.forward(test_input).data.numpy()[0]
            plt.figure()
            plt.bar(np.arange(len(test_output)), test_output)
            plt.savefig(log_dir + 'figures/before%i.png' % i)
            plt.close()
            for j in range(args.num_tasks_train):
                test_output = learner.forward(test_input,
                                              adapt_params[j]).data.numpy()[0]
                plt.figure()
                plt.bar(np.arange(len(test_output)), test_output)
                plt.savefig(log_dir + 'figures/after%i_%i.png' % (j, i))
                plt.close()

    np.save(log_dir + 'train_rew' + str(args.inner_lr) + '.npy', train_rew)
    plt.figure()
    plt.plot(train_rew)
    plt.show()
    plt.figure()
    plt.plot(train_rew)
    plt.savefig(log_dir + 'train_rew.png')

    return
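learner.hessian_vector_product is not shown in this example; in TRPO-style code it usually returns a closure that applies the (damped) KL Hessian to a flat vector via double backprop. A sketch of such a closure, with kl assumed to be a scalar computed from the current policy (hypothetical helper, not the project's code):

import torch
from torch.nn.utils import parameters_to_vector

def make_hessian_vector_product(kl, parameters, damping=1e-2):
    # Returns f(v) = (H_kl + damping * I) @ v, computed by differentiating
    # the KL gradient a second time (double backprop).
    parameters = list(parameters)
    grads = torch.autograd.grad(kl, parameters, create_graph=True)
    flat_grad_kl = parameters_to_vector(grads)

    def hvp(v):
        grad_v = torch.dot(flat_grad_kl, v)
        grad2 = torch.autograd.grad(grad_v, parameters, retain_graph=True)
        return parameters_to_vector(grad2) + damping * v

    return hvp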
Example #7
    def learn(self, paths):
        # is it possible to replace A(s,a) with Q(s,a)?
        for path in paths:
            path["baseline"] = self.vf.predict(path)
            path["returns"] = utils.discount(path["rewards"], self.args.gamma)
            path["advantage"] = path["returns"] - path["baseline"]
            # path["advantage"] = path["returns"]

        # puts all the experiences in a matrix: total_timesteps x options
        action_dist_mu = np.concatenate(
            [path["action_dists_mu"] for path in paths])
        action_dist_logstd = np.concatenate(
            [path["action_dists_logstd"] for path in paths])
        obs_n = np.concatenate([path["obs"] for path in paths])
        action_n = np.concatenate([path["actions"] for path in paths])

        # standardize to mean 0 stddev 1
        advant_n = np.concatenate([path["advantage"] for path in paths])
        advant_n -= advant_n.mean()
        advant_n /= (advant_n.std() + 1e-8)

        # train value function / baseline on rollout paths
        self.vf.fit(paths)

        feed_dict = {
            self.obs: obs_n,
            self.action: action_n,
            self.advantage: advant_n,
            self.oldaction_dist_mu: action_dist_mu,
            self.oldaction_dist_logstd: action_dist_logstd
        }

        # parameters
        thprev = self.gf()

        # computes fisher vector product: F * [self.pg]
        def fisher_vector_product(p):
            feed_dict[self.flat_tangent] = p
            return self.session.run(self.fvp,
                                    feed_dict) + p * self.args.cg_damping

        g = self.session.run(self.pg, feed_dict)

        # solve Ax = g, where A is the Fisher information matrix and g is the gradient of the parameters
        # stepdir = A_inverse * g = x
        stepdir = utils.conjugate_gradient(fisher_vector_product, -g)

        # stepdir is the change in theta, i.e. the direction in which theta moves
        # KL divergence approximated by 0.5 x stepdir_transpose * [Fisher Information Matrix] * stepdir
        # where the [Fisher Information Matrix] acts like a metric
        # ([Fisher Information Matrix] * stepdir) is computed using the function,
        # and then stepdir * [above] is computed manually.
        shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))

        lm = np.sqrt(shs / self.args.max_kl)
        # if self.args.max_kl > 0.001:
        #     self.args.max_kl *= self.args.kl_anneal

        fullstep = stepdir / lm
        negative_g_dot_stepdir = -g.dot(stepdir)

        def loss(th):
            self.sff(th)
            # surrogate loss: policy gradient loss
            return self.session.run(self.losses[0], feed_dict)

        # finds best parameter by starting with a big step and working backwards
        theta = utils.linesearch(loss, thprev, fullstep,
                                 negative_g_dot_stepdir / lm)
        # i guess we just take a fullstep no matter what
        theta = thprev + fullstep
        self.sff(theta)

        surrogate_after, kl_after, entropy_after = self.session.run(
            self.losses, feed_dict)

        episoderewards = np.array([path["rewards"].sum() for path in paths])
        stats = {}
        stats["Average sum of rewards per episode"] = episoderewards.mean()
        stats["Entropy"] = entropy_after
        stats["max KL"] = self.args.max_kl
        stats["Timesteps"] = sum([len(path["rewards"]) for path in paths])
        # stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
        stats["KL between old and new distribution"] = kl_after
        stats["Surrogate loss"] = surrogate_after
        # print(("\n********** Iteration {} ************".format(i)))
        for k, v in stats.items():
            print(k + ": " + " " * (40 - len(k)) + str(v))

        return stats["Average sum of rewards per episode"]
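The shs / lm arithmetic above sets the step length so that the quadratic approximation of the KL divergence hits the trust-region bound exactly: with shs = ½ sᵀFs and lm = √(shs / max_kl), the rescaled step s/lm satisfies ½(s/lm)ᵀF(s/lm) = max_kl. A quick numeric check of that identity (illustration only):

import numpy as np

rng = np.random.default_rng(0)
F = rng.standard_normal((4, 4))
F = F.T @ F + np.eye(4)             # stand-in for the Fisher matrix (SPD)
s = rng.standard_normal(4)          # stand-in for the CG step direction
max_kl = 0.01
shs = 0.5 * s @ F @ s
lm = np.sqrt(shs / max_kl)
fullstep = s / lm
print(np.isclose(0.5 * fullstep @ F @ fullstep, max_kl))  # True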