Example #1
    def train_paths(self, paths, parallel=False, linear_search=True):
        start_time = time.time()
        sample_data = self.storage.process_paths(paths)
        agent_infos = sample_data["agent_infos"]
        obs_n = sample_data["observations"]
        action_n = sample_data["actions"]
        advant_n = sample_data["advantages"]
        n_samples = len(obs_n)
        inds = np.random.choice(n_samples, int(math.floor(n_samples * pms.subsample_factor)), replace=False)
        # inds = range(n_samples)
        obs_n = obs_n[inds]
        action_n = action_n[inds]
        advant_n = advant_n[inds]
        action_dist_means_n = np.array([agent_info["mean"] for agent_info in agent_infos[inds]])
        action_dist_logstds_n = np.array([agent_info["log_std"] for agent_info in agent_infos[inds]])
        feed = {self.net.obs: obs_n,
                self.net.advant: advant_n,
                self.net.old_dist_means_n: action_dist_means_n,
                self.net.old_dist_logstds_n: action_dist_logstds_n,
                self.net.action_n: action_n
                }

        episoderewards = np.array([path["rewards"].sum() for path in paths])
        thprev = self.gf()  # get theta_old

        def fisher_vector_product(p):
            feed[self.flat_tangent] = p
            return self.session.run(self.fvp, feed) + pms.cg_damping * p

        g = self.session.run(self.pg, feed_dict=feed)
        stepdir = krylov.cg(fisher_vector_product, -g, cg_iters=pms.cg_iters)
        shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))  # 0.5 * s^T A s
        # if shs < 0, the square root below produces NaNs
        lm = np.sqrt(shs / pms.max_kl)
        fullstep = stepdir / lm
        neggdotstepdir = -g.dot(stepdir)

        def loss(th):
            self.sff(th)
            return self.session.run(self.losses , feed_dict=feed)

        if parallel:
            theta = linesearch_parallel(loss, thprev, fullstep, neggdotstepdir / lm)
        else:
            if linear_search:
                theta = linesearch(loss, thprev, fullstep, neggdotstepdir / lm)
            else:
                theta = thprev + fullstep
                if math.isnan(theta.mean()):
                    print(shs is None)  # debug output when the full step produced a NaN
                    theta = thprev
        stats = {}
        stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
        stats["Average sum of rewards per episode"] = episoderewards.mean()
        stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
        return stats, theta, thprev
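The krylov.cg call above performs a matrix-free conjugate-gradient solve against the damped Fisher-vector product. The krylov module itself is not part of this listing; below is a minimal sketch, assuming only the interface cg(f_Ax, b, cg_iters=...) implied by the call site, not the actual implementation shipped with the repository.

import numpy as np

def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Approximately solve A x = b where A is only available through the
    matrix-vector product f_Ax(v)."""
    x = np.zeros_like(b)
    r = b.copy()          # residual (x starts at zero, so r = b)
    p = r.copy()          # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# e.g. stepdir = cg(fisher_vector_product, -g, cg_iters=pms.cg_iters)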
Example #2
def sparse_cg(data):
    A, E, M, _, y0 = data

    def matvec(x):
        # normal-equations operator: A^T M^{-1} (A x) + E^T (E x)
        Ax = A.dot(x)
        return A.T.dot(sparse.linalg.spsolve(M, Ax)) + E.T.dot(E.dot(x))

    lop = LinearOperator((E.shape[1], E.shape[1]), dtype=float, matvec=matvec)

    ET_b = E.T.dot(y0)
    try:
        out = krylov.cg(lop, ET_b, tol=1.0e-10, maxiter=10000)
        x = out.xk
    except krypy.utils.ConvergenceError:
        x = np.nan
    return x
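For context, a toy call to sparse_cg could look like the sketch below. The shapes and contents of the data tuple (A, E, M, _, y0) are assumptions made purely for illustration, and the scipy.sparse, LinearOperator, and krylov imports used inside sparse_cg are assumed to be available in its module.

import numpy as np
from scipy import sparse

# hypothetical toy problem: A is (m, n), M is (m, m), E is (p, n), y0 is (p,)
m, n, p = 8, 5, 6
rng = np.random.default_rng(0)
A = sparse.csr_matrix(rng.standard_normal((m, n)))
E = sparse.csr_matrix(rng.standard_normal((p, n)))
M = sparse.identity(m, format="csc")   # stand-in for the mass matrix
y0 = rng.standard_normal(p)

x = sparse_cg((A, E, M, None, y0))     # the fourth tuple entry is unused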
Example #3
def _solve_sparse_cg(A, M, E, y0):
    def matvec(x):
        Ax = A.dot(x)
        return A.T.dot(sparse.linalg.spsolve(M, Ax)) + E.T.dot(E.dot(x))

    lop = LinearOperator(shape=(E.shape[1], E.shape[1]),
                         dtype=float,
                         matvec=matvec)

    ET_b = E.T.dot(y0)
    x, _ = krylov.cg(lop, ET_b, tol=1.0e-10, maxiter=1000)

    # import matplotlib.pyplot as plt
    # plt.semilogy(out.resnorms)
    # plt.grid()
    # plt.show()
    return x
Example #4
    def learn(self):
        start_time = time.time()

        numeptotal = 0
        i = 0
        while True:
            # Generating paths.
            # print("Rollout")
            self.get_samples(pms.paths_number)
            paths = self.storage.get_paths()  # get_paths
            # Computing returns and estimating advantage function.
            sample_data = self.storage.process_paths(paths)

            agent_infos = sample_data["agent_infos"]
            obs_n = sample_data["observations"]
            action_n = sample_data["actions"]
            advant_n = sample_data["advantages"]
            n_samples = len(obs_n)
            inds = np.random.choice(
                n_samples,
                int(math.floor(n_samples * pms.subsample_factor)),
                replace=False)
            obs_n = obs_n[inds]
            action_n = action_n[inds]
            advant_n = advant_n[inds]
            action_dist_means_n = np.array(
                [agent_info["mean"] for agent_info in agent_infos[inds]])
            action_dist_logstds_n = np.array(
                [agent_info["log_std"] for agent_info in agent_infos[inds]])
            feed = {
                self.network.obs: obs_n,
                self.network.advant: advant_n,
                self.network.old_dist_means_n: action_dist_means_n,
                self.network.old_dist_logstds_n: action_dist_logstds_n,
                self.network.action_dist_logstds_n: action_dist_logstds_n,
                self.network.action_n: action_n
            }

            episoderewards = np.array(
                [path["rewards"].sum() for path in paths])
            average_episode_std = np.mean(np.exp(action_dist_logstds_n))

            # print "\n********** Iteration %i ************" % i
            for iter_num_per_train in range(pms.iter_num_per_train):
                # if not self.train:
                #     print("Episode mean: %f" % episoderewards.mean())
                #     self.end_count += 1
                #     if self.end_count > 100:
                #         break
                if self.train:
                    thprev = self.gf()  # get theta_old

                    def fisher_vector_product(p):
                        feed[self.flat_tangent] = p
                        return self.session.run(self.fvp,
                                                feed) + pms.cg_damping * p

                    g = self.session.run(self.pg, feed_dict=feed)
                    stepdir = krylov.cg(fisher_vector_product,
                                        g,
                                        cg_iters=pms.cg_iters)
                    shs = 0.5 * stepdir.dot(
                        fisher_vector_product(stepdir))  # 0.5 * s^T A s
                    fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs)
                    neggdotstepdir = -g.dot(stepdir)

                    def loss(th):
                        self.sff(th)
                        return self.session.run(self.losses, feed_dict=feed)

                    surr_prev, kl_prev, ent_prev = loss(thprev)
                    mean_advant = np.mean(advant_n)
                    theta = linesearch(loss, thprev, fullstep, neggdotstepdir)
                    self.sff(theta)
                    surrafter, kloldnew, entnew = self.session.run(
                        self.losses, feed_dict=feed)
                    stats = {}
                    numeptotal += len(episoderewards)
                    stats["average_episode_std"] = average_episode_std
                    stats["sum steps of episodes"] = sample_data[
                        "sum_episode_steps"]
                    stats["Total number of episodes"] = numeptotal
                    stats[
                        "Average sum of rewards per episode"] = episoderewards.mean(
                        )
                    # stats["Entropy"] = entropy
                    # exp = explained_variance(np.array(baseline_n), np.array(returns_n))
                    # stats["Baseline explained"] = exp
                    stats["Time elapsed"] = "%.2f mins" % (
                        (time.time() - start_time) / 60.0)
                    stats["KL between old and new distribution"] = kloldnew
                    stats["Surrogate loss"] = surrafter
                    stats["Surrogate loss prev"] = surr_prev
                    stats["entropy"] = ent_prev
                    stats["mean_advant"] = mean_advant
                    log_data = [
                        average_episode_std,
                        len(episoderewards), numeptotal,
                        episoderewards.mean(), kloldnew, surrafter, surr_prev,
                        surrafter - surr_prev, ent_prev, mean_advant
                    ]
                    self.master.logger.log_row(log_data)
                    # for k, v in stats.iteritems():
                    #     print(k + ": " + " " * (40 - len(k)) + str(v))
                    #     # if entropy != entropy:
                    #     #     exit(-1)
                    #     # if exp > 0.95:
                    #     #     self.train = False
            if self.thread_id == 1:
                self.master.save_model("iter" + str(i))
                print(episoderewards.mean())
            i += 1
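The linesearch helper called in Examples #1 and #4 is not included in these snippets. A common TRPO-style backtracking line search, sketched here under the assumption that f returns a scalar surrogate loss, looks like this:

import numpy as np

def linesearch(f, x, fullstep, expected_improve_rate,
               max_backtracks=10, accept_ratio=0.1):
    # shrink the step geometrically until the actual improvement is a
    # sufficient fraction of the linearly predicted improvement
    fval = f(x)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        newfval = f(xnew)
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        if expected_improve > 0 and actual_improve / expected_improve > accept_ratio:
            return xnew
    return x  # no acceptable step found; keep the old parameters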
Example #5
    def optimize(self, sess, samples_data):
        feed_dict = {
            self.local_policy.state: samples_data['observations'],
            self.local_policy.actions: samples_data['actions'],
            self.local_policy.advantage: samples_data['advantages'],
            self.local_policy.old_mean: samples_data['mean'],
            self.local_policy.old_log_std: samples_data['log_std'],
            self.local_policy.w: samples_data['ws']
        }
        if self.verbose >= 1:
            logging.info(str(self.name) + " " + "computing loss before")
            logging.info(str(self.name) + " " + "performing update")
            logging.info(str(self.name) + " " + "computing descent direction")

        # calculate the loss gradient g using the symbolic expression
        [self.flat_gradient, loss_before] = sess.run(
            [self.local_policy.grads_flatten, self.local_policy.surr_loss],
            feed_dict=feed_dict)
        # calculate s = A^-1 * g with conjugate gradient, using only Fisher-vector products A*v
        descent_direction = cg(self.local_policy,
                               self.flat_gradient,
                               feed_dict,
                               sess,
                               reg_coef=self.cg_reg_coef,
                               cg_iters=self.cg_iterations,
                               verbose=False)
        #calculate A*s
        A_dot_descent_direction = sess.run(
            self.local_policy.fisher_prod_x_flatten,
            feed_dict=dict(feed_dict,
                           **{self.local_policy.xs_flatten: descent_direction})
        ) + self.cg_reg_coef * descent_direction
        # calculate the initial line-search step size = sqrt(2*kl / (s^T A s))
        initial_step_size = np.sqrt(
            2.0 * self.kl_step_size *
            (1. /
             (np.abs(descent_direction.dot(A_dot_descent_direction)) + 1e-8)))

        #initial descent step for line search
        flat_descent_step = initial_step_size * descent_direction

        if self.verbose >= 1:
            logging.info(str(self.name) + " " + "descent direction computed")

        prev_param = sess.run(self.local_policy.get_params,
                              feed_dict=feed_dict)

        if self.verbose >= 1:
            logging.info(
                str(self.name) + " " +
                "current log_std: {0}".format(prev_param[-1]))

        # perform a backtracking line search along flat_descent_step so that the
        # KL divergence stays below kl_step_size; each backtracking step shrinks
        # the step by a factor of self.backtrack_ratio
        loss_after = 0
        kl_after = 0
        for n_iter, ratio in enumerate(self.backtrack_ratio**np.arange(
                self.max_backtracks)):
            cur_step = ratio * flat_descent_step
            start = 0
            cur_param = []
            for param in prev_param:
                size = param.flatten().shape[0]
                cur_param.append(param - cur_step[start:start +
                                                  size].reshape(param.shape))
                start += size
            sess.run(
                self.local_policy.assign_op,
                feed_dict={
                    i: d
                    for i, d in zip(self.local_policy.assign_value, cur_param)
                })
            loss_after, kl_after, log_std = sess.run(
                [self.local_policy.surr_loss, self.local_policy.mean_kl,
                 self.local_policy.log_std],
                feed_dict=feed_dict)
            if np.isnan(kl_after):
                import ipdb
                ipdb.set_trace()
            if loss_after < loss_before and kl_after <= self.kl_step_size:
                break

        if self.verbose >= 1:
            logging.info(str(self.name) + " " + "backtrack iters: %d" % n_iter)
            logging.info(str(self.name) + " " + "optimization finished")

        return loss_before, loss_after, kl_after, ratio * flat_descent_step, log_std
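As a quick numeric sanity check of the step-size rule used above: choosing beta = sqrt(2 * kl_step_size / (s^T A s)) makes the quadratic KL estimate 0.5 * (beta*s)^T A (beta*s) equal to kl_step_size. The matrices below are stand-ins chosen purely for illustration.

import numpy as np

A = np.array([[2.0, 0.3],
              [0.3, 1.0]])      # stand-in for the (regularized) Fisher matrix
s = np.array([0.5, -1.2])       # stand-in descent direction
delta = 0.01                    # kl_step_size

beta = np.sqrt(2.0 * delta / s.dot(A.dot(s)))
step = beta * s
print(0.5 * step.dot(A.dot(step)))   # ~0.01, i.e. exactly the KL budget delta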