Example #1
    def __init__(self, num_envs=1, log_dir="", suffix=""):
        self.resized_dim = 42
        env = make_envs(num_envs=1, resized_dim=self.resized_dim)
        self.obs_shape = env.observation_space.shape
        self.agent = PPOTrainer(env, ppo_config)
        if log_dir:  # log_dir is empty only in testing
            self.agent.load_w(log_dir, suffix)
        self.num_envs = num_envs
        self.frame_stack = FrameStackTensor(self.num_envs, self.obs_shape, 4,
                                            self.agent.device)
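
Note: FrameStackTensor is used here and again in Example #5 but is not defined in these snippets. Below is a minimal sketch of what such a helper typically does, assuming channel-first [num_envs, C, H, W] observations; the class name and constructor arguments are taken from the call sites, while the internals are an assumption:

import torch

class FrameStackTensor:
    """Rolling buffer holding the last `stack` frames, concatenated on dim 1."""
    def __init__(self, num_envs, obs_shape, stack, device):
        c, h, w = obs_shape  # single-frame observation shape
        self.c = c
        self.buffer = torch.zeros(num_envs, c * stack, h, w, device=device)

    def update(self, obs):
        # Shift old frames toward the front and write the newest at the end.
        obs = torch.as_tensor(obs, dtype=self.buffer.dtype,
                              device=self.buffer.device)
        self.buffer = torch.roll(self.buffer, shifts=-self.c, dims=1)
        self.buffer[:, -self.c:] = obs

    def get(self):
        return self.buffer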
Example #2
                        "-N",
                        default=10,
                        type=int,
                        help="Number of episodes to run.")
    args = parser.parse_args()

    agent_names = get_builtin_agent_names() + ["MY_AGENT"]

    print("Agent names: ", agent_names)
    print("Your chosen agents: left - {}, right - {}".format(
        args.left, args.right))

    assert args.left in agent_names, agent_names
    assert args.right in agent_names, agent_names

    env = make_envs("cPongDouble-v0", num_envs=1, asynchronous=False).envs[0]

    if args.left != "MY_AGENT":
        left = get_compute_action_function(args.left)
    else:
        left = student_compute_action_function()
    if args.right != "MY_AGENT":
        right = get_compute_action_function(args.right)
    else:
        right = student_compute_action_function()

    result = evaluate_two_policies(
        left,
        right,
        env=env,
        render=True,
                        "-N",
                        default=10,
                        type=int,
                        help="Number of episodes to run.")
    args = parser.parse_args()

    agent_names = get_builtin_agent_names() + ["MY_AGENT"]

    print("Agent names: ", agent_names)
    print("Your chosen agents: left - {}, right - {}".format(
        args.left, args.right))

    assert args.left in agent_names, agent_names
    assert args.right in agent_names, agent_names

    env = make_envs("CompetitivePongDouble-v0", num_envs=1,
                    asynchronous=False).envs[0]

    if args.left != "MY_AGENT":
        left = get_compute_action_function(args.left)
    else:
        left = student_compute_action_function()
    if args.right != "MY_AGENT":
        right = get_compute_action_function(args.right)
    else:
        right = student_compute_action_function()

    result = evaluate_two_policies(
        left,
        right,
        env=env,
        render=True,
Example #4
    student_function_names = [
        function_name for function_name in dir(this_is_my_agent)
        if function_name.startswith("student")
    ]
    student_functions = {}
    for f in student_function_names:
        student_policy_creator = this_is_my_agent.__dict__[f]
        student_id = f.split("student_")[-1]
        student_functions[student_id] = student_policy_creator(num_envs)
    print("Collected policies: ", student_functions.keys())

    # Merge builtin agent with students' agents
    for name in get_builtin_agent_names():
        student_functions[name] = get_compute_action_function(name, num_envs)

    # ===== Setup environment =====
    envs = make_envs(
        "CompetitivePongDouble-v0", num_envs=num_envs, asynchronous=True)
    print("Environment ready")

    # ===== Run Matches =====
    visited_agent = set()
    result_list = []
    for name, policy in student_functions.items():
        # Remove opponents whose matches have already all been played
        opponent_functions = student_functions.copy()
        for opponent in visited_agent:
            opponent_functions.pop(opponent)

        print("Start match between agent {} with {}.".format(
            name, opponent_functions.keys()
        ))
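
The fragment above is cut off before any match is actually played, but the pattern is a round-robin with deduplication: once an agent has faced every opponent, it enters visited_agent and later iterations skip it. A self-contained sketch of that pattern (play_match stands in for the real match runner, which is not shown):

def round_robin(policies, play_match):
    """Play each pair of policies once; later rounds skip visited agents."""
    visited = set()
    results = []
    for name, policy in policies.items():
        # Drop opponents that have already had their full round of matches.
        opponents = {k: v for k, v in policies.items() if k not in visited}
        for opp_name, opp_policy in opponents.items():
            results.append((name, opp_name, play_match(policy, opp_policy)))
        visited.add(name)  # later agents will not face this one again
    return results

# Usage with dummy policies and a dummy match runner:
print(round_robin(
    {name: (lambda obs: 0) for name in ["RULE_BASED", "RANDOM", "student_A"]},
    lambda a, b: "draw"))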
Example #5
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must be one of [PPO, A2C]")
    config.num_envs = args.num_envs
    assert args.env_id in [
        "CompetitivePong-v0", "CartPole-v0", "CompetitivePongTournament-v0"
    ]

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(env_id=env_id,
                     seed=seed,
                     log_dir=log_dir,
                     num_envs=num_envs,
                     asynchronous=True,
                     resized_dim=config.resized_dim)
    eval_envs = make_envs(env_id=env_id,
                          seed=seed,
                          log_dir=log_dir,
                          num_envs=num_envs,
                          asynchronous=False,
                          resized_dim=config.resized_dim)
    test = env_id == "CartPole-v0"
    tournament = env_id == "CompetitivePongTournament-v0"
    frame_stack = 4 if not test else 1
    if tournament:
        assert algo == "PPO", "Using PPO in tournament is a good idea, " \
                              "because of its efficiency compared to A2C."

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Create a placeholder tensor that stacks frames along the 2nd dimension,
    # i.e. turns observations of shape [num_envs, 1, 84, 84] into
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(num_envs,
                                          envs.observation_space.shape,
                                          frame_stack, config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=float)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(frame_stack_tensor.get())
    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                # Get the action from the current policy (this fills in the
                # original [TODO]). Gradients are disabled while sampling,
                # and the policy acts on the latest stacked observation in
                # the rollout storage. The return order of compute_action is
                # assumed from how the values are used below.
                with torch.no_grad():
                    values, actions, action_log_prob = trainer.compute_action(
                        trainer.rollouts.observations[index])

                cpu_actions = actions.view(-1).cpu().numpy()

                # Step the environment
                # (Check step_envs function, you need to implement it)
                obs, reward, done, info, masks, total_episodes, \
                total_steps, episode_rewards = step_envs(
                    cpu_actions, envs, episode_rewards, frame_stack_tensor,
                    reward_recorder, episode_length_recorder, total_steps,
                    total_episodes, config.device, test)

                rewards = torch.from_numpy(reward.astype(np.float32)).view(
                    -1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(frame_stack_tensor.get(),
                                        actions.view(-1, 1), action_log_prob,
                                        values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Reset opponent if in tournament mode =====
        if tournament and iteration % config.num_steps == 0:
            # Randomly choose one agent in each iteration
            envs.reset_opponent()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths = evaluate(
                trainer, eval_envs, frame_stack, 20)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            evaluate_stat.update(
                dict(win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) /
                    len(evaluate_rewards)),
                     evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    entropy=dist_entropy,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            if tournament:
                stats["opponent"] = envs.current_agent_name

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training once the step budget is exhausted (this fills in
        # the original [TODO]).
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
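
The comment in Example #5 flags step_envs as left for the reader. Below is a minimal sketch consistent with the call site there; the episode bookkeeping details are assumptions, and updates to episode_length_recorder are omitted:

import numpy as np
import torch

def step_envs(cpu_actions, envs, episode_rewards, frame_stack_tensor,
              reward_recorder, episode_length_recorder, total_steps,
              total_episodes, device, test):
    """Step the vectorized envs once and update the bookkeeping (a sketch)."""
    obs, reward, done, info = envs.step(cpu_actions)
    episode_rewards += reward.reshape(episode_rewards.shape)
    total_steps += len(cpu_actions)
    for i, d in enumerate(done):
        if d:  # env i finished an episode
            total_episodes += 1
            reward_recorder.append(float(episode_rewards[i]))
            # episode_length_recorder updates are omitted in this sketch
            episode_rewards[i] = 0.0
    # masks is 0 at terminal states so returns do not bootstrap across episodes
    masks = torch.from_numpy(
        (1.0 - done.astype(np.float32)).reshape(-1, 1)).to(device)
    frame_stack_tensor.update(obs)
    return obs, reward, done, info, masks, total_episodes, \
        total_steps, episode_rewards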