Example #1
def run(args):

    log_dir = args.dir_path

    env = gym.make(args.env)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    env.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    ddpg = DDPG(state_dim, action_dim, max_action, args)
    ounoise = OUNoise(action_dim)

    def get_action(state, noise=None):
        action = ddpg.actor(FloatTensor(state))
        action = (action.data.numpy() +
                  noise.add()) if noise else action.data.numpy()
        return np.clip(action, -max_action, max_action)

    def rollout(eval=False):
        state, done, ep_reward, ep_len = env.reset(), False, 0.0, 0
        while not done and ep_len < args.max_ep_len:
            if not eval:
                action = get_action(state, noise=ounoise)
            else:
                action = get_action(state)
            next_state, reward, done, _ = env.step(action)
            if not eval:
                done = False if ep_len + 1 == args.max_ep_len else done
                ddpg.replay_buffer.store(
                    (state, next_state, action, reward, done))
            ep_reward += reward
            ep_len += 1
            state = next_state
        return ep_reward, ep_len

    for epoch in range(args.epochs):
        ep_reward, ep_len = rollout(eval=False)
        if epoch > args.start_epoch:
            for _ in range(ep_len):
                ddpg.train()
                ddpg.update_nets()

        if epoch % args.save_freq == 0:
            test_rewards = []
            for i in range(10):
                reward, _ = rollout(eval=True)
                test_rewards.append(reward)
            test_rewards = np.array(test_rewards)

            np.savez(log_dir + '/policy_weights', ddpg.actor.get_params())
            logz.log_tabular("Epoch", epoch)
            logz.log_tabular("AverageTestReward", np.mean(test_rewards))
            logz.log_tabular("StdTestRewards", np.std(test_rewards))
            logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
            logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
            logz.dump_tabular()
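
Example #1 perturbs the deterministic action with `ounoise.add()`. A minimal sketch of such an `OUNoise` helper, assuming the standard Ornstein-Uhlenbeck parameters (mu = 0, theta = 0.15, sigma = 0.2); the class backing this example may differ:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu          # long-run mean the process reverts to
        self.theta = theta    # mean-reversion rate
        self.sigma = sigma    # noise scale
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def add(self):
        # dx = theta * (mu - x) + sigma * N(0, I); return the updated state.
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state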
Example #2
    def train(self):
        """ Runs training. See the in-line comments. """
        args = self.args
        t_start = time.time()
        num_mbs = len(self.data_mb_list['X_train'])

        for ii in range(args.train_iters):
            # Sample minibatch and form feed dictionary.
            real_BO = self.data_mb_list['X_train'][ii % num_mbs]
            prior_BP = self._sample_prior()
            feed = {self.D_data_BO: real_BO, self.G_data_BP: prior_BP}

            # Update the Discriminator and the Generator with separate sess.run calls.
            _, loss_D = self.sess.run([self.train_D_op, self.loss_D], feed)
            _, loss_G = self.sess.run([self.train_G_op, self.loss_G], feed)

            if (ii % args.log_every_t_iter == 0):
                print("\n  ************ Iteration %i ************" % ii)
                # --------------------------------------------------------------
                # Logging. Also record time and get a fresh set of real vs fake
                # images to evaluate (but NOT train) the Discriminator.
                # --------------------------------------------------------------
                new_feed = {
                    self.D_data_BO:
                    self.data_mb_list['X_train'][(ii + 1) % num_mbs],
                    self.G_data_BP:
                    self._sample_prior()
                }
                dout_real, dout_fake = \
                        self.sess.run([self.D_real_B, self.D_fake_B], new_feed)
                num_correct = np.sum(dout_real > 0.0) + np.sum(dout_fake < 0.0)
                elapsed_time_hours = (time.time() - t_start) / (60.0**2)

                logz.log_tabular("AvgRealScore", np.mean(dout_real))
                logz.log_tabular("AvgFakeScore", np.mean(dout_fake))
                logz.log_tabular("LossDis", loss_D)
                logz.log_tabular("LossGen", loss_G)
                logz.log_tabular("DisNumCorrect", num_correct)
                logz.log_tabular("TimeHours", elapsed_time_hours)
                logz.log_tabular("Iterations", ii)
                logz.dump_tabular()

            if (ii % args.snapshot_every_t_iter == 0) and (self.log_dir
                                                           is not None):
                # --------------------------------------------------------------
                # See if we're making cool images and also save weights.
                # Unfortunately some of this is highly specific to MNIST...
                # Don't worry about the reshaping order because all that the
                # computer sees is just the 784-dimensional vector (for now).
                # --------------------------------------------------------------
                bs = args.test_cols * args.test_rows
                dims = int(np.sqrt(self.odim))
                prior = np.random.standard_normal(size=(bs, self.prior_dim))
                gen_out_BO = self.sess.run(self.G_out_BO,
                                           {self.G_data_BP: prior})
                gen_out_BDD = np.reshape(gen_out_BO, (bs, dims, dims))
                weights_v = self.sess.run(self.weights_v)
                self._save_snapshot(ii, weights_v, gen_out_BDD)
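
Both feed dictionaries rely on `self._sample_prior()`, which is not shown. A minimal standalone sketch of the likely behavior, assuming a standard Gaussian prior with the same (batch, prior_dim) shape the snapshot branch uses; `bsize` and `prior_dim` are illustrative parameters here:

import numpy as np

def sample_prior(bsize, prior_dim):
    # Hypothetical stand-in for self._sample_prior(): one minibatch of
    # Generator inputs drawn from N(0, I), matching the (bs, prior_dim)
    # shape used in the snapshot branch above.
    return np.random.standard_normal(size=(bsize, prior_dim))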
Example #3
    def train(self):
        """ Runs training. See the in-line comments. """
        args = self.args
        t_start = time.time()
        num_mbs = len(self.data_mb_list['X_train'])

        for ii in range(args.train_iters):
            # Sample minibatch + standard Gaussian noise and form feed.
            real_BO = self.data_mb_list['X_train'][ii % num_mbs]
            std_norm_BZ = np.random.standard_normal((self.bsize,args.latent_dim))
            feed = {self.data_BO: real_BO, self.std_norm_BZ: std_norm_BZ}
            _, neg_lb_loss, kldiv, log_p, first, second, logstd_BO = self.sess.run(
                    [self.train_op, self.neg_lb_llhd, self.kldiv, self.log_p,
                        self.first_B, self.second_B, self.d_logstd_BO], 
                    feed
            )

            if (ii % args.log_every_t_iter == 0):
                print("\n  ************ Iteration %i ************" % ii)
                #print("first {}".format(first))
                #print("second {}".format(second))
                elapsed_time_hours = (time.time() - t_start) / (60.0 ** 2)
                logz.log_tabular("LogProb",    log_p)
                logz.log_tabular("KlDiv",      kldiv)
                logz.log_tabular("NegLbLhd",   neg_lb_loss)
                logz.log_tabular("TimeHours",  elapsed_time_hours)
                logz.log_tabular("Iterations", ii)
                logz.dump_tabular()

            if (ii % args.snapshot_every_t_iter == 0) and (self.log_dir is not None):
                # --------------------------------------------------------------
                # See if we're making cool images and also save weights.
                # Unfortunately some of this is highly specific to MNIST...
                # Don't worry about the reshaping order because all that the
                # computer sees is just the 784-dimensional vector (for now).
                # We use a different batch size here, `bs`.
                # --------------------------------------------------------------
                bs = args.test_cols * args.test_rows
                dims = int(np.sqrt(self.odim))
                latent_BZ = np.random.standard_normal((bs,args.latent_dim))
                feed = {self.latent_BZ: latent_BZ}
                dec_out_BO, dec_logstd_BO = \
                        self.sess.run([self.d_mean_BO, self.d_logstd_BO], feed)

                # With the mean and (log) std, we can sample.
                eps_BO = np.random.standard_normal(size=dec_out_BO.shape)
                sampled_BO = dec_out_BO + (np.exp(dec_logstd_BO) * eps_BO)
                dec_out_BDD = np.reshape(sampled_BO, (bs,dims,dims))
                weights_v = self.sess.run(self.weights_v)
                self._save_snapshot(ii, weights_v, dec_out_BDD)
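
The logged `KlDiv` term has a closed form when the encoder is a diagonal Gaussian measured against an N(0, I) prior, and the snapshot branch samples from the decoder by reparameterization. A minimal NumPy sketch of both; `e_mean_BZ` and `e_logstd_BZ` are assumed names for the encoder outputs, not identifiers from this codebase:

import numpy as np

def gaussian_kl_to_std_normal(e_mean_BZ, e_logstd_BZ):
    # KL( N(mean, diag(std^2)) || N(0, I) ) per example, summed over latent dims:
    # 0.5 * sum( std^2 + mean^2 - 1 - 2*logstd )
    var_BZ = np.exp(2.0 * e_logstd_BZ)
    return 0.5 * np.sum(var_BZ + e_mean_BZ**2 - 1.0 - 2.0 * e_logstd_BZ, axis=1)

def sample_decoder_output(d_mean_BO, d_logstd_BO):
    # Reparameterized sample from the decoder's Gaussian, mirroring the
    # sampling step in the snapshot branch above.
    eps_BO = np.random.standard_normal(size=d_mean_BO.shape)
    return d_mean_BO + np.exp(d_logstd_BO) * eps_BO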
Example #4
    def log_diagnostics(self, paths, infodict, vfdict):
        """ Just logging using the `logz` functionality. """
        ob_no = np.concatenate([path["observation"] for path in paths])
        vpred_n = np.concatenate([path["baseline"] for path in paths])
        vtarg_n = np.concatenate([path["reward"] for path in paths])
        elapsed_time = (time.time() - self.start_time) # In seconds
        episode_rewards = np.array([path["reward"].sum() for path in paths])
        episode_lengths = np.array([utils.pathlength(path) for path in paths])

        # These are *not* logged in John Schulman's code.
        #logz.log_tabular("Success",   infodict["Success"])
        #logz.log_tabular("LagrangeM", infodict["LagrangeM"])
        #logz.log_tabular("gNorm",     infodict["gNorm"])

        # These *are* logged in John Schulman's code. First, rewards:
        logz.log_tabular("NumEpBatch", len(paths))
        logz.log_tabular("EpRewMean",  episode_rewards.mean())
        logz.log_tabular("EpRewMax",   episode_rewards.max())
        logz.log_tabular("EpRewSEM",   episode_rewards.std()/np.sqrt(len(paths)))
        logz.log_tabular("EpLenMean",  episode_lengths.mean())
        logz.log_tabular("EpLenMax",   episode_lengths.max())
        logz.log_tabular("RewPerStep", episode_rewards.sum()/episode_lengths.sum())
        logz.log_tabular("vf_mse_before",      vfdict["MSEBefore"])
        logz.log_tabular("vf_mse_after",       vfdict["MSEAfter"])
        logz.log_tabular("vf_PredStdevBefore", vfdict["PredStdevBefore"])
        logz.log_tabular("vf_PredStdevAfter",  vfdict["PredStdevAfter"])
        logz.log_tabular("vf_TargStdev",       vfdict["TargStdev"])
        logz.log_tabular("vf_EV_before",       utils.explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("vf_EV_after",        utils.explained_variance_1d(self.vf.predict(ob_no), vtarg_n))
        # If overfitting, EVAfter >> EVBefore. Also, we fit the value function
        # _after_ using it to compute the baseline to avoid introducing bias.
        logz.log_tabular("pol_surr_before", infodict["pol_surr_before"])
        logz.log_tabular("pol_surr_after",  infodict["pol_surr_after"])
        logz.log_tabular("pol_kl_before",   infodict["pol_kl_before"])
        logz.log_tabular("pol_kl_after",    infodict["pol_kl_after"])
        logz.log_tabular("pol_ent_before",  infodict["pol_ent_before"])
        logz.log_tabular("pol_ent_after",   infodict["pol_ent_after"])
        logz.log_tabular("TimeElapsed",     elapsed_time)
        logz.dump_tabular()
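
The `vf_EV_*` entries call `utils.explained_variance_1d`, which is not shown here. A minimal sketch of the usual definition, 1 - Var(y - ypred) / Var(y); the actual utility may treat the zero-variance edge case differently:

import numpy as np

def explained_variance_1d(ypred, y):
    # Fraction of the variance in y that the predictions account for:
    # 1.0 is a perfect fit, 0.0 is no better than predicting the mean,
    # and negative values mean the predictions are worse than the mean.
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary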
Example #5
def AC_train(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, num_target_updates,
             num_grad_steps_per_target_update, animate, logdir,
             normalize_advantages, seed, n_layers, size):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # args = inspect.getargspec(PG_train)[0]
    # params = {k: locals()[k] if k in locals() else None for k in args}
    params = locals()
    print(params)
    logz.save_params(params)

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # initialize the Actor-Critic agent
    network_args = {
        'n_layers': n_layers,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update
    }
    env_args = {
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
    }
    sample_traj_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }
    estimate_return_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    # Agent
    agent = ACAgent(network_args, env_args, sample_traj_args,
                    estimate_return_args)

    agent.build_computation_graph()
    agent.init_tf_sess()

    # start training
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajs(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate(
            [path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [len(path["reward"]) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
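
`agent.estimate_advantage` in this actor-critic loop typically bootstraps one step from the critic. A minimal sketch under that assumption, where `critic_prediction(obs)` is a hypothetical helper returning V(s) as a NumPy array:

import numpy as np

def estimate_advantage(critic_prediction, ob_no, next_ob_no, re_n, terminal_n,
                       gamma):
    # One-step TD advantage: A(s, a) = r + gamma * V(s') * (1 - done) - V(s).
    v_n = critic_prediction(ob_no)
    v_next_n = critic_prediction(next_ob_no)
    q_n = re_n + gamma * v_next_n * (1.0 - terminal_n)
    adv_n = q_n - v_n
    # Optionally standardize, as the normalize_advantages flag suggests.
    return (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)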
Example #6
    def train(self):

        for epoch in range(self.epochs):
            surr_grads = []
            ddpg_grads = 0
            if epoch >= self.start_epoch:
                self.ddpg.actor.set_params(self.policy.w_policy)
                self.ddpg.actor_t.set_params(self.policy.w_policy)

                for step in range(self.rl_train_steps):
                    grad = self.ddpg.train()
                    ddpg_grads += grad
                    if step >= self.rl_train_steps - self.k:
                        surr_grads.append(grad.flatten())

                self.policy.update_by_ddpg(ddpg_grads / self.rl_train_steps)
                # if epoch % 50 == 0:
                #     self.ddpg.replay_buffer.buffer_flush()
                # self.policy.w_policy = self.ddpg.actor.get_params()

                self.noise.update(np.array(surr_grads).T)

            epsilons = self.noise.sample(
                self.pop_size)  # policy_size x pop_size

            pos_rewards, neg_rewards = [], []
            policy_weights = self.policy.w_policy  # action_dim x state_dim
            for epsilon in epsilons:
                self.policy.w_policy = policy_weights + epsilon.reshape(
                    self.policy.w_policy.shape)
                pos_reward, pos_len = self.evaluate()
                pos_rewards.append(pos_reward)

                self.policy.w_policy = policy_weights - epsilon.reshape(
                    self.policy.w_policy.shape)
                neg_reward, neg_len = self.evaluate()
                neg_rewards.append(neg_reward)
            self.policy.w_policy = policy_weights

            std_rewards = np.array(pos_rewards + neg_rewards).std()

            if self.elite_size != 0:
                scores = {
                    k: max(pos_reward, neg_reward)
                    for k, (
                        pos_reward,
                        neg_reward) in enumerate(zip(pos_rewards, neg_rewards))
                }
                sorted_scores = sorted(scores.keys(),
                                       key=lambda x: scores[x],
                                       reverse=True)[:self.elite_size]
                elite_pos_rewards = [pos_rewards[k] for k in sorted_scores]
                elite_neg_rewards = [neg_rewards[k] for k in sorted_scores]
                elite_epsilons = [epsilons[k] for k in sorted_scores]
                self.policy.update_by_ges(elite_pos_rewards, elite_neg_rewards,
                                          elite_epsilons, std_rewards)
            else:
                self.policy.update_by_ges(pos_rewards, neg_rewards, epsilons,
                                          std_rewards)

            if epoch % self.save_freq == 0:
                train_rewards = np.array(pos_rewards + neg_rewards)
                test_rewards = []
                for _ in range(10):
                    reward, _ = self.evaluate()
                    test_rewards.append(reward)
                test_rewards = np.array(test_rewards)

                np.savez(self.log_dir + '/policy_weights',
                         self.policy.w_policy)
                logz.log_tabular("Epoch", epoch)
                logz.log_tabular("AverageTrainReward", np.mean(train_rewards))
                logz.log_tabular("StdTrainRewards", np.std(train_rewards))
                logz.log_tabular("MaxTrainRewardRollout",
                                 np.max(train_rewards))
                logz.log_tabular("MinTrainRewardRollout",
                                 np.min(train_rewards))
                logz.log_tabular("AverageTestReward", np.mean(test_rewards))
                logz.log_tabular("StdTestRewards", np.std(test_rewards))
                logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
                logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
                logz.dump_tabular()
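
`update_by_ges` consumes antithetic reward pairs and their perturbations. A minimal sketch of one common form of that update, assuming a scalar learning rate `lr` and using the reward standard deviation for scaling; the actual method may differ:

import numpy as np

def es_update(w_policy, pos_rewards, neg_rewards, epsilons, std_rewards, lr=0.02):
    # Antithetic gradient estimate: sum_k (r_k^+ - r_k^-) * epsilon_k,
    # scaled by the number of directions and the reward std for stability.
    grad = np.zeros_like(w_policy)
    for r_pos, r_neg, eps in zip(pos_rewards, neg_rewards, epsilons):
        grad += (r_pos - r_neg) * eps.reshape(w_policy.shape)
    grad /= (len(epsilons) * (std_rewards + 1e-8))
    return w_policy + lr * grad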
Example #7
def TD3_train(env,
              logdir='.',
              actor_critic=actor_critic,
              iterations=600000,
              replay_size=int(1e6),
              gamma=0.99,
              polyak=0.995,
              actor_lr=1e-3,
              critic_lr=1e-3,
              batch_size=100,
              start_steps=10000,
              act_noise=0.1,
              target_noise=0.2,
              noise_clip=0.5,
              policy_delay=4):

    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # args = inspect.getargspec(PG_train)[0]
    # params = {k: locals()[k] if k in locals() else None for k in args}
    params = locals()
    print(params)
    logz.save_params(params)

    td3 = TD3Agent(env, actor_critic, gamma, polyak, actor_lr, critic_lr,
                   act_noise)

    td3.build_computation_graph()
    td3.init_tf_sess()
    td3.graph_initialization()

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]

    replay_buffer = ReplayBuffer(ob_dim, ac_dim, replay_size)

    start_time = time.time()
    ob = env.reset()
    ac, rew, done = 0, 0, 0
    actor_loss = []
    critic_loss = []

    for ii in range(iterations):

        if ii < start_steps:
            ac = env.action_space.sample()
        else:
            ac = td3.sample_action(ob)

        ob_next, rew, done, _ = env.step(ac)

        replay_buffer.store(ob, ac, rew, ob_next, done)

        # Advance to the next observation; reset the env at episode boundaries.
        ob = env.reset() if done else ob_next

        # if iteration < start_step, only put steps into buffer
        if ii < start_steps:
            continue

        batch = replay_buffer.sample_batch(batch_size=batch_size)

        # update critic
        c_loss = td3.update_critic(batch['obs1'], batch['obs2'], batch['acts'],
                                   batch['rews'], batch['done'])
        critic_loss.append(c_loss)

        if ii % policy_delay == 0:  # Delayed actor update and target update

            # update actor and target networks
            a_loss = td3.update_actor_and_target(batch['obs1'], batch['obs2'],
                                                 batch['acts'], batch['rews'],
                                                 batch['done'])
            actor_loss.append(a_loss)

        if ii % 10000 == 0:
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", ii)
            logz.log_tabular("AverageActorLoss", np.mean(np.array(actor_loss)))
            logz.log_tabular("AverageCriticLoss",
                             np.mean(np.array(critic_loss)))
            logz.log_tabular("AverageActorStd", np.std(np.array(actor_loss)))
            logz.log_tabular("AverageCriticStd", np.std(np.array(critic_loss)))
            logz.dump_tabular()
            logz.pickle_tf_vars()
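
Inside `update_critic`, TD3 builds its Bellman targets with target-policy smoothing and clipped double-Q learning. A minimal NumPy sketch of that target computation, where `pi_targ`, `q1_targ`, and `q2_targ` are hypothetical callables standing in for the target networks:

import numpy as np

def td3_targets(pi_targ, q1_targ, q2_targ, obs2, rews, done,
                gamma=0.99, target_noise=0.2, noise_clip=0.5, act_limit=1.0):
    # Target-policy smoothing: add clipped Gaussian noise to the target action.
    a2 = pi_targ(obs2)
    eps = np.clip(target_noise * np.random.randn(*a2.shape),
                  -noise_clip, noise_clip)
    a2 = np.clip(a2 + eps, -act_limit, act_limit)
    # Clipped double-Q: bootstrap from the smaller of the two target critics.
    q_targ = np.minimum(q1_targ(obs2, a2), q2_targ(obs2, a2))
    return rews + gamma * (1.0 - done) * q_targ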
Example #8
def train_PG(exp_name, env_name, n_iters, gamma, min_timesteps_per_batch,
             max_path_length, lr, normalize_advantages, nn_baseline, seed,
             n_layers, hidden_size, discrete, logdir):

    start = time.time()

    # env
    env = gym.make(env_name)
    #TODO:
    # env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2", \
    #     timeout=5, realworkercount=8)
    # env.state_size = 1
    # env.action_size = 2

    # set up logger
    setup_logger(logdir, locals())

    # random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    if hasattr(env, 'seed'):
        env.seed(seed)

    # set attributes
    if isinstance(env, gym.Env):
        max_path_length = max_path_length or env.spec.max_episode_steps
        discrete = isinstance(env.action_space, gym.spaces.Discrete)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n if discrete else env.action_space.shape[0]
    else:
        if hasattr(env, 'state_size'):
            state_size = env.state_size
        else:
            raise Exception(
                "Environment must have a state_size attribute or be a gym.Env!")
        if hasattr(env, 'action_size'):
            action_size = env.action_size
        else:
            raise Exception(
                "Environment must have an action_size attribute or be a gym.Env!")

    net_args = {
        "n_layers": n_layers,
        "state_size": state_size,
        "action_size": action_size,
        "discrete": discrete,
        "hidden_size": hidden_size,
        "learing_rate": lr,
        "output_activation": nn.Sigmoid()
    }

    trajectory_args = {
        "max_path_length": max_path_length,
        "min_timesteps_per_batch": min_timesteps_per_batch
    }

    reward_args = {
        "gamma": gamma,
        "nn_baseline": nn_baseline,
        "normalize_advantage": normalize_advantages
    }

    agent = Agent(net_args, trajectory_args, reward_args)

    # create networks
    agent.build_net()

    total_timesteps = 0
    for it in range(n_iters):
        print("=============Iteration {}==============".format(it))
        paths, timesteps_this_batch = agent.sample_trajectories(it, env)
        #TODO:
        # env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2", \
        #     timeout=5, realworkercount=8)
        total_timesteps += timesteps_this_batch

        states = np.concatenate([path["state"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        rewards = [path["reward"] for path in paths]

        states_input = torch.Tensor(states).float()
        actions_input = torch.Tensor(actions).float()
        # q_n, adv = agent.estimate_return(states_input, rewards)
        # agent.train_op(states_input, actions_input, q_n, adv)
        agent.train_op()

        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        # best_idx = np.argmax(returns)
        # best_path = paths[best_idx]
        # best_policy = {}
        # for i in range(5):
        #     best_policy[str(i+1)] = best_path["action"][i].tolist()
        # data = {"best_policy": [best_policy], "best_reward": returns[best_idx]}
        # data = pd.DataFrame(data)
        # if os.path.exists("best_policy_pg.csv"):
        #     policy_df = pd.read_csv("best_policy_pg.csv")
        #     policy_df.loc[len(policy_df)] = [best_policy, returns[best_idx]]
        # else:
        #     policy_df = data
        # policy_df.to_csv("best_policy_pg.csv", index=False)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", it)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
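
The commented-out calls show the intended flow: estimate returns and advantages, then take a policy-gradient step. A minimal PyTorch sketch of such a step, assuming the policy exposes a `log_prob(states, actions)` method (a hypothetical interface, not this Agent's API):

import torch

def pg_step(policy, optimizer, states, actions, advantages):
    # REINFORCE-style surrogate: maximize E[log pi(a|s) * advantage],
    # i.e. minimize its negation.
    log_probs = policy.log_prob(states, actions)
    loss = -(log_probs * advantages).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()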
Example #9
    def train(self):
        """ 
        Algorithm 1 in the DDPG paper. 
        """
        num_episodes = 0
        t_start = time.time()
        obs = self.env.reset()

        for t in range(self.args.n_iter):
            if (t % self.args.log_every_t_iter
                    == 0) and (t > self.args.wait_until_rbuffer):
                print("\n*** DDPG Iteration {} ***".format(t))

            # Sample actions with noise injection and manage buffer.
            act = self.actor.sample_action(obs, train=True)
            new_obs, rew, done, info = self.env.step(act)
            self.rbuffer.add_sample(s=obs, a=act, r=rew, done=done)
            if done:
                obs = self.env.reset()
                num_episodes += 1
            else:
                obs = new_obs

            if (t > self.args.wait_until_rbuffer) and (
                    t % self.args.learning_freq == 0):
                # Sample from the replay buffer.
                states_t_BO, actions_t_BA, rewards_t_B, states_tp1_BO, done_mask_B = \
                        self.rbuffer.sample(num=self.args.batch_size)

                feed = {
                    'obs_t_BO': states_t_BO,
                    'act_t_BA': actions_t_BA,
                    'rew_t_B': rewards_t_B,
                    'obs_tp1_BO': states_tp1_BO,
                    'done_mask_B': done_mask_B
                }

                # Update the critic, get sampled policy gradients, update actor.
                a_grads_BA, l2_error = self.critic.update_weights(feed)
                actor_gradients = self.actor.update_weights(feed, a_grads_BA)

                # Update both target networks.
                self.critic.update_target_net()
                self.actor.update_target_net()

            if (t % self.args.log_every_t_iter
                    == 0) and (t > self.args.wait_until_rbuffer):
                # Do some rollouts here and then record statistics.  Note that
                # some of these stats rely on stuff computed from sampling the
                # replay buffer, so be careful interpreting these. The code
                # probably needs to guard against this case as well.
                stats = self._do_rollouts()
                hours = (time.time() - t_start) / (60 * 60.)
                logz.log_tabular("MeanReward", np.mean(stats['reward']))
                logz.log_tabular("MaxReward", np.max(stats['reward']))
                logz.log_tabular("MinReward", np.min(stats['reward']))
                logz.log_tabular("StdReward", np.std(stats['reward']))
                logz.log_tabular("MeanLength", np.mean(stats['length']))
                logz.log_tabular("NumTrainingEps", num_episodes)
                logz.log_tabular("L2ErrorCritic", l2_error)
                logz.log_tabular("QaGradL2Norm", np.linalg.norm(a_grads_BA))
                logz.log_tabular("TimeHours", hours)
                logz.log_tabular("Iterations", t)
                logz.dump_tabular()
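
`update_target_net` on both networks is the soft (Polyak) target update from Algorithm 1 of the DDPG paper. A minimal sketch, assuming the parameters are held as NumPy arrays and mixed with a small coefficient `tau`; names here are illustrative:

def soft_update(target_params, online_params, tau=0.005):
    # theta_target <- (1 - tau) * theta_target + tau * theta_online.
    # A tau close to 0 means the target networks track the online networks slowly.
    for targ, onl in zip(target_params, online_params):
        targ[...] = (1.0 - tau) * targ + tau * onl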
Example #10
def run_vpg(args, vf_params, logdir, env, sess, continuous_control):
    """ General purpose method to run vanilla policy gradients, for both
    continuous and discrete action environments.

    Parameters
    ----------
    args: [Namespace]
        Contains user-provided (or default) arguments for VPGs.
    vf_params: [dict]
        Dictionary of parameters for the value function.
    logdir: [string]
        Where we store the outputs, can be None to avoid saving.
    env: [OpenAI gym env]
        The environment the agent is in, from OpenAI gym.
    sess: [tf Session]
        Current Tensorflow session, to be passed to (at least) the policy
        function, and the value function as well if it's a neural network.
    continuous_control: [boolean]
        True if continuous control (i.e., continuous actions), False otherwise.
    """
    ob_dim = env.observation_space.shape[0]

    if args.vf_type == 'linear':
        vf = vfuncs.LinearValueFunction(**vf_params)
    elif args.vf_type == 'nn':
        vf = vfuncs.NnValueFunction(session=sess, ob_dim=ob_dim, **vf_params)

    if continuous_control:
        ac_dim = env.action_space.shape[0]
        policyfn = policies.GaussianPolicy(sess, ob_dim, ac_dim)
    else:
        ac_dim = env.action_space.n
        policyfn = policies.GibbsPolicy(sess, ob_dim, ac_dim)

    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101
    total_timesteps = 0
    stepsize = args.initial_stepsize

    for i in range(args.n_iter):
        print("\n********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps.
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 100 == 0)
                                    and args.render)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = policyfn.sample_action(ob)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break
            path = {
                "observation": np.array(obs),
                "terminated": terminated,
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += utils.pathlength(path)
            if timesteps_this_batch > args.min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Estimate advantage function using baseline vf (these are lists!).
        # return_t: list of sum of discounted rewards (to end of episode), one per time
        # vpred_t: list of value function's predictions of components of return_t
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = utils.discount(rew_t, args.gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update and **re-fit the baseline**.
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        std_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update, plus diagnostics stuff. Is there a better way to handle
        # the continuous vs discrete control cases?
        if continuous_control:
            surr_loss, oldmean_na, oldlogstd_a = policyfn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)
            kl, ent = policyfn.kldiv_and_entropy(ob_no, oldmean_na,
                                                 oldlogstd_a)
        else:
            surr_loss, oldlogits_na = policyfn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)
            kl, ent = policyfn.kldiv_and_entropy(ob_no, oldlogits_na)

        # A step size heuristic to ensure that we don't take too large steps.
        if args.use_kl_heuristic:
            if kl > args.desired_kl * 2:
                stepsize /= 1.5
                print('PG stepsize -> %s' % stepsize)
            elif kl < args.desired_kl / 2:
                stepsize *= 1.5
                print('PG stepsize -> %s' % stepsize)
            else:
                print('PG stepsize OK')

        # Log diagnostics
        if i % args.log_every_t_iter == 0:
            logz.log_tabular("EpRewMean",
                             np.mean([path["reward"].sum() for path in paths]))
            logz.log_tabular(
                "EpLenMean",
                np.mean([utils.pathlength(path) for path in paths]))
            logz.log_tabular("KLOldNew", kl)
            logz.log_tabular("Entropy", ent)
            logz.log_tabular("EVBefore",
                             utils.explained_variance_1d(vpred_n, vtarg_n))
            logz.log_tabular(
                "EVAfter",
                utils.explained_variance_1d(vf.predict(ob_no), vtarg_n))
            logz.log_tabular("SurrogateLoss", surr_loss)
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            # If you're overfitting, EVAfter will be way larger than EVBefore.
            # Note that we fit the value function AFTER using it to compute the
            # advantage function to avoid introducing bias
            logz.dump_tabular()
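
`utils.discount` produces the discounted return-to-go used as the baseline target. A minimal sketch of the standard backward recursion; the real utility may be implemented with `scipy.signal.lfilter` instead:

import numpy as np

def discount(rewards, gamma):
    # return_t[t] = rewards[t] + gamma * return_t[t+1], computed right-to-left.
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns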
Example #11
    def train(self):

        for epoch in range(self.epochs):
            # Sample noises from the noise generator.
            epsilons = self.noise.sample(self.pop_size)

            pos_rewards, neg_rewards = [], []
            policy_weights = self.policy.w_policy
            # Generate 2 * pop_size policies and rollouts.
            for epsilon in epsilons:
                self.policy.w_policy = policy_weights + self.noise_std * epsilon
                pos_reward, pos_len = self.evaluate()
                pos_rewards.append(pos_reward)

                self.policy.w_policy = policy_weights - self.noise_std * epsilon
                neg_reward, neg_len = self.evaluate()
                neg_rewards.append(neg_reward)
            self.policy.w_policy = policy_weights

            std_rewards = np.array(pos_rewards + neg_rewards).std()

            # ARS update
            if self.elite_size != 0:
                scores = {
                    k: max(pos_reward, neg_reward)
                    for k, (
                        pos_reward,
                        neg_reward) in enumerate(zip(pos_rewards, neg_rewards))
                }
                sorted_scores = sorted(scores.keys(),
                                       key=lambda x: scores[x],
                                       reverse=True)[:self.elite_size]
                elite_pos_rewards = [pos_rewards[k] for k in sorted_scores]
                elite_neg_rewards = [neg_rewards[k] for k in sorted_scores]
                elite_epsilons = [epsilons[k] for k in sorted_scores]
                self.policy.update(elite_pos_rewards, elite_neg_rewards,
                                   elite_epsilons, std_rewards)
            else:

                self.policy.update(pos_rewards, neg_rewards, epsilons,
                                   std_rewards)

            # Save policy and log the information
            if epoch % self.save_freq == 0:
                train_rewards = np.array(pos_rewards + neg_rewards)
                test_rewards = []
                for _ in range(10):
                    reward, _ = self.evaluate()
                    test_rewards.append(reward)
                test_rewards = np.array(test_rewards)

                np.savez(self.log_dir + '/policy_weights',
                         self.policy.w_policy)
                logz.log_tabular("Epoch", epoch)
                logz.log_tabular("AverageTrainReward", np.mean(train_rewards))
                logz.log_tabular("StdTrainRewards", np.std(train_rewards))
                logz.log_tabular("MaxTrainRewardRollout",
                                 np.max(train_rewards))
                logz.log_tabular("MinTrainRewardRollout",
                                 np.min(train_rewards))
                logz.log_tabular("AverageTestReward", np.mean(test_rewards))
                logz.log_tabular("StdTestRewards", np.std(test_rewards))
                logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
                logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
                logz.dump_tabular()
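
Each perturbed policy is scored by `self.evaluate()`. With `w_policy` stored as an (action_dim x state_dim) matrix, as noted in Example #6, the policy can be a plain linear map. A minimal sketch of such a rollout, assuming a gym-style `env` and an episode-length cap (both assumptions; the real method may also normalize observations):

import numpy as np

def evaluate(env, w_policy, max_ep_len=1000):
    # Roll out the deterministic linear policy a = W s and return (reward, length).
    state, done, ep_reward, ep_len = env.reset(), False, 0.0, 0
    while not done and ep_len < max_ep_len:
        action = np.dot(w_policy, state)
        state, reward, done, _ = env.step(action)
        ep_reward += reward
        ep_len += 1
    return ep_reward, ep_len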