Example no. 1
def init_memory_buffer(self, params: Dict) -> PPOReplayBuffer:
    # Fill in the buffer dimensions and discount factor from the agent's own
    # attributes, then construct the PPO replay buffer from the merged params.
    params["obs_dim"] = self.state_dim
    params["action_dim"] = self.num_actions
    params["discount_factor"] = self.γ
    return PPOReplayBuffer(**params)
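
For context, a minimal usage sketch. The surrounding agent class and its state_dim, num_actions and γ attributes are taken from the excerpt; the call site itself and any extra PPOReplayBuffer constructor keywords are assumptions for illustration only.

from typing import Dict  # needed for the annotation on init_memory_buffer

# Hypothetical call site: `agent` is an instance of the class this method belongs to.
buffer_params = {}  # any additional PPOReplayBuffer constructor kwargs would go here
replay_buffer = agent.init_memory_buffer(buffer_params)
# After the call, buffer_params also carries "obs_dim", "action_dim" and "discount_factor".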
Example no. 2
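The function below is an excerpt from a larger training script, so its imports are not shown. The following is a hedged sketch of what it relies on: the standard-library and NumPy/TensorFlow imports are implied by the code, while the project-local module paths are assumptions, and setup_logger, Agent, PPOReplayBuffer, PointEnv and ObservedPointEnv are assumed to be defined elsewhere in the same project.

import pickle
import random
import time

import numpy as np
import tensorflow as tf   # TF1-style API (tf.set_random_seed, sessions)

import logz               # tabular logger used in the training loop below
# from point_mass import PointEnv, ObservedPointEnv   # hypothetical module paths
# from agent import Agent, PPOReplayBuffer            # hypothetical module paths
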
def train_PG(
        exp_name,
        env_name,
        n_iter,
        gamma,
        min_timesteps_per_batch,
        mini_batch_size,
        max_path_length,
        learning_rate,
        num_ppo_updates,
        num_value_iters,
        animate,
        logdir,
        normalize_advantages,
        nn_critic,
        seed,
        n_layers,
        size,
        gru_size,
        history,
        num_tasks,
        l2reg,
        recurrent,
        generalized,
        granularity
        ):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    envs = {'pm': PointEnv,
            'pm-obs': ObservedPointEnv,
            }
    env = envs[env_name](num_tasks)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    task_dim = len(env._goal)  # peeks at a private attribute of the env; rude, sorry

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'task_dim': task_dim,
        'size': size,
        'gru_size': gru_size,
        'learning_rate': learning_rate,
        'history': history,
        'num_value_iters': num_value_iters,
        'l2reg': l2reg,
        'recurrent': recurrent,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
        'generalized': generalized,
        'granularity': granularity,
    }

    estimate_return_args = {
        'gamma': gamma,
        'nn_critic': nn_critic,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

    # build computation graph
    agent.build_computation_graph()


    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    def unpack_sample(data):
        '''
        unpack a sample from the replay buffer
        '''
        ob = data["observations"]
        ac = data["actions"]
        re = data["rewards"]
        hi = data["hiddens"]
        ma = 1 - data["terminals"]
        return ob, ac, re, hi, ma

    # construct PPO replay buffer, perhaps rude to do outside the agent
    ppo_buffer = PPOReplayBuffer(agent.replay_buffer)

    total_timesteps = 0
    for itr in range(n_iter):
        # for PPO: flush the replay buffer!
        ppo_buffer.flush()

        # sample trajectories to fill agent's replay buffer
        print("********** Iteration %i ************"%itr)
        stats = []
        timesteps_this_itr = 0  # training timesteps collected this iteration (for logging)
        for _ in range(num_tasks):
            s, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch)
            total_timesteps += timesteps_this_batch
            timesteps_this_itr += timesteps_this_batch
            stats += s

        # compute the log probs, advantages, and returns for all data in agent's buffer
        # store in ppo buffer for use in multiple ppo updates
        # TODO: should move inside the agent probably
        data = agent.replay_buffer.all_batch()
        ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
        fixed_log_probs = agent.sess.run(agent.sy_lp_n,
            feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na})
        q_n, adv_n = agent.estimate_return(ob_no, re_n, hidden, masks)

        ppo_buffer.add_samples(fixed_log_probs, adv_n, q_n)

        # update with mini-batches sampled from ppo buffer
        for _ in range(num_ppo_updates):

            data = ppo_buffer.random_batch(mini_batch_size)

            ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
            fixed_log_probs = data["log_probs"]
            adv_n = data["advantages"]
            q_n = data["returns"]

            agent.update_parameters(ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n)

        # compute validation statistics
        print('Validating...')
        val_stats = []
        for _ in range(num_tasks):
            vs, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch // 10, is_evaluation=True)
            val_stats += vs

        # save trajectories for viz
        with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f:
            pickle.dump(agent.val_replay_buffer.all_batch(), f, pickle.HIGHEST_PROTOCOL)
        agent.val_replay_buffer.flush()

        # Log TRAIN diagnostics
        returns = [sum(s["rewards"]) for s in stats]
        final_rewards = [s["rewards"][-1] for s in stats]
        ep_lengths = [s['ep_len'] for s in stats]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("FinalReward", np.mean(final_rewards))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)

        # Log VAL diagnostics
        val_returns = [sum(s["rewards"]) for s in val_stats]
        val_final_rewards = [s["rewards"][-1] for s in val_stats]
        logz.log_tabular("ValAverageReturn", np.mean(val_returns))
        logz.log_tabular("ValFinalReward", np.mean(val_final_rewards))

        logz.dump_tabular()
        logz.pickle_tf_vars()
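
Finally, a hedged example of how train_PG might be invoked. Every argument value below is an illustrative assumption chosen only to match the types implied by the code above (for example, 'pm' selects PointEnv); none of the values come from the source.

if __name__ == "__main__":
    train_PG(
        exp_name='pm_test',
        env_name='pm',                  # selects PointEnv
        n_iter=100,
        gamma=0.99,
        min_timesteps_per_batch=10000,
        mini_batch_size=64,
        max_path_length=100,
        learning_rate=5e-4,
        num_ppo_updates=10,
        num_value_iters=25,
        animate=False,
        logdir='data/pm_test',
        normalize_advantages=True,
        nn_critic=False,
        seed=1,
        n_layers=2,
        size=64,
        gru_size=32,
        history=1,
        num_tasks=4,
        l2reg=False,
        recurrent=True,
        generalized=False,
        granularity=1,
    )

Note that the per-iteration trajectory dump inside train_PG writes to an output/ directory, which is assumed to exist before training starts.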