Example #1
def main(env_name,
         num_iteration,
         gamma,
         lam,
         batch_size,
         max_step,
         kl_targ,
         hid1_mult,
         policy_logvar,
         epochs,
         animate=False):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v2'
        num_iteration: number of total iterations
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        batch_size: number of samples per policy training batch
        max_step: maximum number of time steps per episode
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        epochs: num of mini-batch iterations
        animate: boolean, True uses env.render() method to animate episode
    """
    killer = GracefulKiller()
    gymEnv = GymEnv(env_name)
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname="SingleProc" + env_name, now=now)
    aigym_path = os.path.join('log-files/gym', env_name, now)
    env = gymEnv.wrapper(aigym_path, force=True, video_callable=False)
    scaler = Scaler(gymEnv.obs_dim)

    policy_size = "large"
    if env_name in ['Humanoid-v2', 'HumanoidStandup-v2', 'Ant-v2']:
        policy_size = "small"
    policy = Policy(gymEnv.obs_dim, gymEnv.act_dim, kl_targ, hid1_mult,
                    policy_logvar, policy_size)
    # sampler to generate rollouts by executing the policy on the environment
    sampler = Sampler(env, policy, scaler, max_step, batch_size, animate)
    # PPO agent that alternates between generating rollouts and updating the policy
    ppo = PPO(policy, sampler, logger, killer, num_iteration, epochs, gamma,
              lam, max_step)
    # train the policy for num_iteration iterations
    ppo.train()
    logger.close()
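
As a quick illustration of how this single-process entry point might be invoked, the sketch below uses assumed hyperparameter values; they are placeholders for illustration, not values taken from the example above.

if __name__ == '__main__':
    # hypothetical hyperparameters, for illustration only
    main(env_name='Hopper-v2',
         num_iteration=2000,
         gamma=0.995,
         lam=0.98,
         batch_size=20,
         max_step=1000,
         kl_targ=0.003,
         hid1_mult=10,
         policy_logvar=-1.0,
         epochs=20,
         animate=False)
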
Example #2

def main(n_sampler,
         env_name,
         num_iteration,
         gamma,
         lam,
         batch_size,
         max_step,
         kl_targ,
         hid1_mult,
         policy_logvar,
         epochs,
         animate=False):
    """ Main training loop
    Args:
        n_sampler: num of sampler processes to generate rollouts
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v2'
        num_iteration: number of total iterations
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        batch_size: number of samples per policy training batch
        max_step: maximum number of time steps per episode
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        epochs: num of mini-batch iterations
        animate: boolean, True uses env.render() method to animate episode
    """
    gymEnv = GymEnv(env_name)
    # create unique directories
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    logger = Logger(logname='MultProc' + env_name, now=now)
    policy_size = "large"
    if env_name in ['Humanoid-v2', 'HumanoidStandup-v2', 'Ant-v2']:
        policy_size = "small"
    # MLP policy network
    policy = Policy(gymEnv.obs_dim, gymEnv.act_dim, kl_targ, hid1_mult,
                    policy_logvar, policy_size)

    # queue for rollouts and commands sent to the agent
    agent_tasks = JoinableQueue()
    # queue for communicating policy weights back from the agent
    agent_results = Queue()
    # PPO agent process, runs asynchronously
    agent = PPO(agent_tasks, agent_results, policy, logger, num_iteration,
                epochs, gamma, lam, max_step)
    agent.start()

    # generate rollouts in parallel (by each process)
    pSampler = ParallelSampler(n_sampler, env_name, policy, max_step,
                               batch_size, animate)
    # request the initial policy weights from the agent and wait for the reply
    agent_tasks.put(1)
    agent_tasks.join()
    init_weights = agent_results.get()
    pSampler.set_policy_weights(init_weights)

    total_time = 0.0
    for iteration in range(num_iteration):
        print("-------- Iteration {} ----------".format(iteration))
        agent.set_total_time(total_time)

        # run the sampler processes asynchronously to collect rollouts
        print("-------- Generate rollouts in Parallel ------")
        rollout_start = time.time()
        rollouts = pSampler.gen_rollouts()
        rollout_time = (time.time() - rollout_start) / 60.0

        # the agent receives the rollouts and updates the policy asynchronously
        learn_start = time.time()
        # hand the rollouts generated by the parallel samplers to the agent
        agent_tasks.put(rollouts)
        agent_tasks.join()
        learn_time = (time.time() - learn_start) / 60.0

        # read the updated policy weights from the agent's result queue
        print("-------- Get policy weights from Agent ------")
        new_policy_weights = agent_results.get()
        # TODO: save policy weights, calculate total steps
        print("-------- Update policy weights to Samplers -----\n\n")
        pSampler.set_policy_weights(new_policy_weights)
        total_time += (time.time() - rollout_start) / 60.0
        print("Total time: {}  mins, Rollout time: {}, Learn time: {}".format(
            total_time, rollout_time, learn_time))

    logger.close()
    # exit parallel sampler
    pSampler.exit()
    # TODO: save policy weights
    # exit ppo agent
    agent.exit()
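
The loop above rests on a simple queue handshake: the main process puts either the command 1 (to request the current policy weights) or a rollout batch on agent_tasks, blocks on agent_tasks.join() until the agent has called task_done(), and then reads the updated weights from agent_results. A minimal, self-contained sketch of that handshake follows; AgentSketch, its stand-in weights, and the 'exit' command are illustrative assumptions, not the actual PPO agent class used in the example.

from multiprocessing import JoinableQueue, Process, Queue


class AgentSketch(Process):
    """Hypothetical agent process illustrating the task/result queue handshake."""

    def __init__(self, tasks, results):
        super().__init__()
        self.tasks = tasks           # JoinableQueue: commands or rollout batches from main
        self.results = results       # Queue: policy weights sent back to main
        self.weights = {'step': 0}   # stand-in for real policy weights

    def update(self, rollouts):
        # stand-in for the PPO policy/value update on a rollout batch
        self.weights['step'] += len(rollouts)

    def run(self):
        while True:
            task = self.tasks.get()
            if task == 'exit':
                self.tasks.task_done()
                break
            if task != 1:          # anything other than the "send weights" command
                self.update(task)  # is treated as a rollout batch
            self.results.put(dict(self.weights))
            # task_done() is what lets agent_tasks.join() return in the main process
            self.tasks.task_done()


if __name__ == '__main__':
    tasks, results = JoinableQueue(), Queue()
    agent = AgentSketch(tasks, results)
    agent.start()
    tasks.put(1)                        # request initial weights
    tasks.join()
    print(results.get())                # {'step': 0}
    tasks.put([('obs', 'act', 'rew')])  # send a (fake) rollout batch
    tasks.join()
    print(results.get())                # {'step': 1}
    tasks.put('exit')                   # shut the agent down
    tasks.join()
    agent.join()

The entry code sits under the __main__ guard because multiprocessing may start fresh interpreter processes that re-import the module; the same consideration applies to the real multi-sampler example above.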