def __init__(self, num_envs=1, log_dir="", suffix=""):
    self.resized_dim = 42
    # Build a single env only to query the observation shape.
    env = make_envs(num_envs=1, resized_dim=self.resized_dim)
    self.obs_shape = env.observation_space.shape
    self.agent = PPOTrainer(env, ppo_config)
    if log_dir:  # log_dir is empty only in testing
        self.agent.load_w(log_dir, suffix)
    self.num_envs = num_envs
    # Stack the last 4 frames so the agent can see short-term motion.
    self.frame_stack = FrameStackTensor(self.num_envs, self.obs_shape, 4,
                                        self.agent.device)
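This constructor only restores the trainer and builds the frame stack; the method that maps raw observations to actions is not shown in the snippet. A hypothetical sketch of such a __call__ method, assuming FrameStackTensor.update/get and PPOTrainer.compute_action behave as they do in the training loops later in this listing (and that torch is imported at module level):

def __call__(self, obs):
    # Hypothetical sketch, not part of the original snippet: push the newest
    # observation into the frame stack, then act deterministically with the
    # restored PPO agent.
    self.frame_stack.update(obs)
    with torch.no_grad():
        _, actions, _ = self.agent.compute_action(
            self.frame_stack.get(), deterministic=True)
    return actions.view(self.num_envs, -1).cpu().numpy()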
Example #2
def test():
    # Run this function to make sure your API is runnable
    # Look up the student policy at module scope; locals() would be empty
    # at this point inside test().
    policy_names = [
        function_name for function_name in globals()
        if function_name.startswith("my_policy")
    ]
    assert len(policy_names) == 1, \
        "Expected exactly one my_policy* function, found: {}".format(
            policy_names)
    policy_name = policy_names[0]
    policy_creator = globals()[policy_name]

    num_envs = 1
    policy = policy_creator(num_envs)
    env = make_envs("cCarRacing-v0", num_envs=num_envs, asynchronous=False)
    o = env.reset()
    for i in range(1000):
        a = policy(o)
        assert np.asarray(a).shape == (num_envs, 2)
        assert env.action_space.contains(a[0])
        o, _, d, _ = env.step(a)
        if np.any(d):  # d is an array with num_envs entries
            o = env.reset()
    env.close()

    num_envs = 3
    policy = policy_creator(num_envs)
    env = make_envs("cCarRacing-v0", num_envs=num_envs, asynchronous=False)
    o = env.reset()
    for i in range(1000):
        a = policy(o)
        assert np.asarray(a).shape == (num_envs, 2)
        o, _, d, _ = env.step(a)
        if np.any(d):  # a plain `if d:` is ambiguous for num_envs > 1
            o = env.reset()
    env.close()

    print("Test passed!")
Example #3
def generate_data(rollouts, data_dir, noise_type):  # pylint: disable=R0914
    """ Generates data """
    # exists/join come from os.path and np is numpy; both are imported at
    # module level in the original file.
    assert exists(data_dir), "The data directory does not exist..."
    from competitive_rl import make_envs

    # env = gym.make("CarRacing-v0")
    env = make_envs(env_id='cCarRacing-v0',
                    seed=100,
                    log_dir='data/dataset/',
                    num_envs=1,
                    asynchronous=False,
                    resized_dim=96,
                    action_repeat=1)

    seq_len = 1000

    for i in range(rollouts):
        env.reset()
        a_rollout = sample_continuous_policy(env.action_space, seq_len,
                                             1. / 50)

        s_rollout = []
        r_rollout = []
        d_rollout = []

        t = 0
        while True:
            action = a_rollout[t]
            t += 1
            # import pdb; pdb.set_trace()
            obs, r, done, _ = env.step(action.reshape(1, -1))
            # for ii in range(4):
            #     plt.figure()
            #     _obs = obs[0,ii,...]
            #     plt.imshow(_obs, cmap='gray', vmin=0, vmax=255)
            #     plt.savefig(f'./vis_env/{t}_{ii}.png')

            s_rollout += [obs[0]]
            r_rollout += [r]
            d_rollout += [done]
            if done:
                print("> End of rollout {}, {} frames...".format(
                    i, len(s_rollout)))
                np.savez(join(data_dir, 'rollout_{}'.format(i)),
                         observations=np.array(s_rollout),
                         rewards=np.array(r_rollout),
                         actions=np.array(a_rollout),
                         terminals=np.array(d_rollout))
                break
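generate_data relies on a sample_continuous_policy helper that is not shown in this snippet (and the noise_type argument is unused in the lines above). A minimal sketch of what the helper might look like, assuming the Brownian-motion action sampling commonly used for world-models-style data collection:

import math
import numpy as np

def sample_continuous_policy(action_space, seq_len, dt):
    """Assumed implementation: a sequence of actions sampled as a discretised
    Brownian motion, clipped to the action-space bounds."""
    actions = [action_space.sample()]
    for _ in range(seq_len):
        daction_dt = np.random.randn(*actions[-1].shape)
        actions.append(
            np.clip(actions[-1] + math.sqrt(dt) * daction_dt,
                    action_space.low, action_space.high))
    return actions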
Example #4
def __init__(self,
             env_id,
             num_envs=1,
             log_dir=None,
             suffix=None,
             _test=False):
    # self.resized_dim = 42
    env = make_envs(env_id=env_id, num_envs=1)
    self.obs_shape = env.observation_space.shape
    self.agent = PPOTrainer(env, ppo_config)
    if log_dir is not None:  # log_dir is None only in testing
        success = self.agent.load_w(log_dir, suffix)
        if not success and not _test:
            raise ValueError("Failed to load agent!")
    self.num_envs = num_envs
Example #5
    # test()

    # Run this function to make sure your API is runnable
    # Look up the student policy at module scope.
    policy_names = [
        function_name for function_name in globals()
        if function_name.startswith("my_policy")
    ]
    assert len(policy_names) == 1, \
        "Expected exactly one my_policy* function, found: {}".format(
            policy_names)
    policy_name = policy_names[0]
    policy_creator = globals()[policy_name]

    num_envs = 1
    policy = policy_creator(num_envs)
    env = make_envs("cCarRacing-v0", num_envs=num_envs, asynchronous=False)
    o = env.reset()
    for i in range(1000):
        a = [policy(o)]
        o, _, d, _ = env.step(a)
        if d:
            o = env.reset()
    env.close()

    num_envs = 3
    policy = policy_creator(num_envs)
    env = make_envs("cCarRacing-v0", num_envs=num_envs, asynchronous=False)
    o = env.reset()
    for i in range(1000):
        a = policy(o)
        o, _, d, _ = env.step(a)
Example #6
from competitive_rl import make_envs

if __name__ == '__main__':
    envs = make_envs(
        env_id="cCarRacing-v0",
        seed=0,
        log_dir="demo",  # this will create a "demo" directory
        num_envs=5,
        asynchronous=True,
        resized_dim=42)
    obs = envs.reset()
    print(obs.shape)
    envs.close()
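Example #7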
                        type=int,
                        help="The number of episodes to run. Default: 100")
    args = parser.parse_args()

    num_episodes = args.num_episodes
    num_envs = args.num_envs

    agents = {
        l: get_compute_action_function(l, num_envs)
        for l in get_builtin_agent_names()
    }
    agents["MY_AGENT"] = student_compute_action_function(num_envs)

    print("All agents ready: ", agents.keys())

    envs = make_envs("cPongDouble-v0", num_envs=num_envs, asynchronous=True)
    print("Environment ready")

    result = launch("MY_AGENT", student_compute_action_function(num_envs),
                    agents, envs, num_episodes)

    winning_rate_matrix, reward_matrix = build_matrix(result, single_line=True)
    print("\n===== Winning Rate Matrix (row vs column) =====")
    print(winning_rate_matrix)
    print("\n===== Reward Matrix (row vs column) =====")
    print(reward_matrix)

    with open("data/evaluate_result.md", "w") as f:
        f.write("winning rate matrix:\n\n")
        f.write(
            tabulate.tabulate(winning_rate_matrix,
Example #8
                        default=3,
                        type=int,
                        help="Number of episodes to run.")
    args = parser.parse_args()

    # collect builtin agents
    agent_names = get_builtin_agent_names() + ["MY_AGENT"]
    print("Agent names: ", agent_names)
    print("Your chosen agents: left - {}, right - {}".format(
        args.left, args.right))
    assert args.left in agent_names, agent_names
    assert args.right in agent_names, agent_names

    # create env and setup policies
    env = make_envs("cPongDouble-v0",
                    num_envs=1,
                    asynchronous=False,
                    log_dir="tmp_vis").envs[0]
    left = get_compute_action_function(args.left)
    right = get_compute_action_function(args.right)

    # evaluate
    result = evaluate_two_policies(
        left,
        right,
        env=env,
        render=True,
        num_episode=args.num_episodes,
        render_interval=0.05  # 20 FPS rendering
    )
    print(result)
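Example #9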
def __init__(self, crop=True, grass_penalty=0, action_repeat=1):
    comp_envs = make_envs("cCarRacing-v0", num_envs=1, action_repeat=1)
    self.env = comp_envs.envs[0]
    super().__init__(crop, grass_penalty, action_repeat)
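Example #10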
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must in [PPO, A2C]")
    config.num_envs = args.num_envs
    assert args.env_id in ["cPong-v0", "CartPole-v0",
                           "cPongTournament-v0"]

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        resized_dim=config.resized_dim
    )
    eval_envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=False,
        resized_dim=config.resized_dim
    )
    test = env_id == "CartPole-v0"
    tournament = env_id == "cPongTournament-v0"
    frame_stack = 4 if not test else 1
    if tournament:
        assert algo == "PPO", "Using PPO in tournament is a good idea, " \
                              "because of its efficiency compared to A2C."

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Create a placeholder tensor to help stack frames in 2nd dimension
    # That is turn the observation from shape [num_envs, 1, 84, 84] to
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, frame_stack, config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=float)  # np.float was removed from NumPy
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(frame_stack_tensor.get())
    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                # Get action
                # [TODO] Get the action
                # Hint:
                #   1. Remember to disable gradient computing
                #   2. trainer.rollouts is a storage containing all data
                #   3. What observation is needed for trainer.compute_action?
                with torch.no_grad():
                    values, actions, action_log_prob = trainer.compute_action(
                        trainer.rollouts.observations[index])
                cpu_actions = actions.view(-1).cpu().numpy()

                # Step the environment
                # (Check step_envs function, you need to implement it)
                obs, reward, done, info, masks, total_episodes, \
                total_steps, episode_rewards = step_envs(
                    cpu_actions, envs, episode_rewards, frame_stack_tensor,
                    reward_recorder, episode_length_recorder, total_steps,
                    total_episodes, config.device, test)

                rewards = torch.from_numpy(
                    reward.astype(np.float32)).view(-1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    frame_stack_tensor.get(), actions.view(-1, 1),
                    action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Reset opponent if in tournament mode =====
        if tournament and iteration % config.num_steps == 0:
            # Randomly choose one agent in each iteration
            envs.reset_opponent()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths = evaluate(
                trainer, eval_envs, frame_stack, 20)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            evaluate_stat.update(dict(
                win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) / len(
                        evaluate_rewards)),
                evaluate_time=eval_timer.now,
                evaluate_iteration=iteration
            ))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(
                    policy_loss=policy_loss,
                    entropy=dist_entropy,
                    value_loss=value_loss,
                    total_loss=total_loss
                ),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(
                    sample_time=sample_timer.avg,
                    process_time=process_timer.avg,
                    update_time=update_timer.avg,
                    total_time=total_timer.now,
                    episode_time=sample_timer.avg + process_timer.avg +
                                 update_timer.avg
                ),
                iteration=iteration
            )

            if tournament:
                stats["opponent"] = envs.current_agent_name

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print("Saved trainer state at <{}>. Saved progress at <{}>.".format(
                trainer_path, progress_path
            ))

        # [TODO] Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
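The sampling loop above calls a step_envs helper that the assignment asks the student to implement. A minimal sketch consistent with the call site, under the assumptions that masks are 0.0 for finished episodes, that episode lengths come from a Monitor-style info["episode"] entry, and that FrameStackTensor.update is enough to refresh the stacked frames (the real reset-on-done behaviour is not shown in this listing):

import numpy as np
import torch

def step_envs(cpu_actions, envs, episode_rewards, frame_stack_tensor,
              reward_recorder, episode_length_recorder, total_steps,
              total_episodes, device, test):
    """Sketch: step the vectorized envs once and update the bookkeeping."""
    obs, reward, done, info = envs.step(cpu_actions)
    episode_rewards += reward.reshape(episode_rewards.shape)
    total_steps += len(cpu_actions)
    for env_id, d in enumerate(done):
        if d:
            total_episodes += 1
            reward_recorder.append(float(episode_rewards[env_id]))
            if "episode" in info[env_id]:  # assumption: Monitor wrapper
                episode_length_recorder.append(info[env_id]["episode"]["l"])
            episode_rewards[env_id] = 0.0
    # masks are 1.0 for running episodes and 0.0 for finished ones
    masks = torch.from_numpy(
        1.0 - np.asarray(done, dtype=np.float32)).view(-1, 1).to(device)
    frame_stack_tensor.update(obs)
    return obs, reward, done, info, masks, total_episodes, total_steps, \
        episode_rewards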
Example #11
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must in [PPO]")
    config.num_envs = args.num_envs
    config.lr = args.lr
    config.entropy_loss_weight = args.entropy
    assert args.env_id in ["cPong-v0", "cCarRacing-v0"], args.env_id

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id if not args.opponent else "cCarRacingDouble-v0"

    # Clean log directory
    log_dir = verify_log_dir(
        args.log_dir,
        "{}_{}_{}".format(env_id, algo,
                          datetime.datetime.now().strftime("%m-%d_%H-%M")))

    if args.opponent:
        assert args.num_eval_envs == 0

        from competitive_rl.car_racing import make_competitive_car_racing
        from load_agents import PolicyAPI

        restore_log_dir = os.path.dirname(args.restore)
        restore_suffix = os.path.basename(
            args.restore).split("checkpoint-")[1].split(".pkl")[0]
        opponent_policy = PolicyAPI("cCarRacing-v0",
                                    num_envs=1,
                                    log_dir=restore_log_dir,
                                    suffix=restore_suffix)
        envs = make_competitive_car_racing(opponent_policy=opponent_policy,
                                           num_envs=num_envs,
                                           asynchronous=not args.test)
    else:
        envs = make_envs(env_id=env_id,
                         seed=seed,
                         log_dir=log_dir,
                         num_envs=num_envs,
                         asynchronous=not args.test,
                         resized_dim=config.resized_dim,
                         action_repeat=args.action_repeat)

    if args.num_eval_envs > 0:
        eval_envs = make_envs(env_id=env_id,
                              seed=seed,
                              log_dir=log_dir,
                              num_envs=args.num_eval_envs,
                              asynchronous=not args.test,
                              resized_dim=config.resized_dim,
                              action_repeat=args.action_repeat)
    else:
        eval_envs = None

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config)
    else:
        raise ValueError("Unknown algorithm {}".format(algo))

    if args.restore:
        restore_log_dir = os.path.dirname(args.restore)
        restore_suffix = os.path.basename(
            args.restore).split("checkpoint-")[1].split(".pkl")[0]
        success = trainer.load_w(restore_log_dir, restore_suffix)
        if not success:
            raise ValueError(
                "Failed to restore the agent from log_dir {} with suffix {}"
                .format(restore_log_dir, restore_suffix))

    # Start training
    print("Start training!")
    obs = envs.reset()
    # frame_stack_tensor.update(obs)
    raw_obs = trainer.process_obs(obs)
    processed_obs = trainer.model.world_model(raw_obs)
    trainer.rollouts.before_update(obs, processed_obs)

    try:
        _train(trainer, envs, eval_envs, config, num_envs, algo, log_dir,
               False, False)
    except KeyboardInterrupt:
        print(
            "Training stopped by the user. The log directory is {}. Finishing up."
            .format(log_dir))

    trainer.save_w(log_dir, "final")
    envs.close()
Example #12
def test_base_trainer():
    from competitive_rl import make_envs

    class FakeConfig:
        def __init__(self):
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            self.num_envs = 1
            self.num_steps = 200
            self.gamma = 0.99
            self.lr = 5e-4

    class FakeTrainer(BaseTrainer):
        def setup_optimizer(self):
            pass

        def setup_rollouts(self):
            pass

    # ===== Discrete case =====
    env = make_envs("cPong-v0", asynchronous=False, num_envs=3)
    trainer = FakeTrainer(env, FakeConfig())
    obs = env.reset()
    # Input single observation
    values, actions, action_log_probs = trainer.compute_action(
        obs[0], deterministic=True)
    new_values, new_action_log_probs, dist_entropy = trainer.evaluate_actions(
        obs[0], actions)
    assert actions.shape == (1, 1), actions.shape
    assert values.shape == (1, 1), values.shape
    assert action_log_probs.shape == (1, 1), action_log_probs.shape
    assert dist_entropy.shape == ()
    assert (values == new_values).all()
    assert (action_log_probs == new_action_log_probs).all()
    assert dist_entropy.shape == ()

    # Input multiple observations
    values, actions, action_log_probs = trainer.compute_action(
        obs, deterministic=False)
    new_values, new_action_log_probs, dist_entropy = trainer.evaluate_actions(
        obs, actions)
    assert actions.shape == (3, 1), actions.shape
    assert values.shape == (3, 1), values.shape
    assert action_log_probs.shape == (3, 1), action_log_probs.shape
    assert dist_entropy.shape == ()
    assert (values == new_values).all()
    assert (action_log_probs == new_action_log_probs).all()
    assert dist_entropy.shape == ()

    print("Base trainer discrete case test passed!")
    env.close()

    # ===== Continuous case =====
    env = make_envs("cCarRacing-v0", asynchronous=False, num_envs=3)
    trainer = FakeTrainer(env, FakeConfig())
    obs = env.reset()
    # Input single observation
    values, actions, action_log_probs = trainer.compute_action(
        obs[0], deterministic=True)
    new_values, new_action_log_probs, dist_entropy = trainer.evaluate_actions(
        obs[0], actions)
    assert actions.shape == (1, 2), actions.shape
    assert values.shape == (1, 1), values.shape
    assert action_log_probs.shape == (1, 1), action_log_probs.shape
    assert dist_entropy.shape == ()
    assert (values == new_values).all()
    assert (action_log_probs == new_action_log_probs).all()
    assert dist_entropy.shape == ()

    # Input multiple observations
    values, actions, action_log_probs = trainer.compute_action(
        obs, deterministic=False)
    new_values, new_action_log_probs, dist_entropy = trainer.evaluate_actions(
        obs, actions)
    assert actions.shape == (3, 2), actions.shape
    assert values.shape == (3, 1), values.shape
    assert action_log_probs.shape == (3, 1), action_log_probs.shape
    assert dist_entropy.shape == ()
    assert (values == new_values).all()
    assert (action_log_probs == new_action_log_probs).all()
    assert dist_entropy.shape == ()

    print("Base trainer continuous case test passed!")
    env.close()
Example #13
    # ===== Load student policies =====
    student_function_names = [
        function_name for function_name in dir(my_policy) if function_name.startswith("my_policy")
    ]
    assert student_function_names
    student_functions = {}
    for f in student_function_names:
        student_policy_creator = my_policy.__dict__[f]
        student_id = f.split("my_policy_")[-1]
        student_functions[student_id] = student_policy_creator(num_envs)
    print("Collected policies: ", student_functions.keys())

    # ===== Setup environment =====
    # envs = make_envs("CompetitivePongDouble-v0", num_envs=num_envs, asynchronous=True)
    seed = np.random.randint(10000)
    envs = make_envs("cCarRacingDouble-v0", num_envs=num_envs, asynchronous=True, seed=seed)
    print("Environment ready")

    # ===== Run Matches =====
    visited_agent = set()
    result_list = []
    for name, policy in student_functions.items():
        # Skip opponents that have already been matched
        opponent_functions = student_functions.copy()
        for opponent in visited_agent:
            opponent_functions.pop(opponent)

        print("Start match between agent {} with {}.".format(name, opponent_functions.keys()))

        result = launch(name, policy, opponent_functions, envs, num_episodes)
        result_list.append(result)
Example #14
from competitive_rl import make_envs

if __name__ == '__main__':
    envs = make_envs(
        env_id="cPong-v0",
        seed=0,
        log_dir="demo",  # this will create a "demo" directory
        num_envs=1,
        asynchronous=False,
        resized_dim=42
    )

    env = envs.envs[0]

    obs = envs.reset()
    env.close()
    print(obs.shape)
    # envs.close()
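Example #15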
                        "-N",
                        default=10,
                        type=int,
                        help="Number of episodes to run.")
    args = parser.parse_args()

    agent_names = get_builtin_agent_names() + ["MY_AGENT"]

    print("Agent names: ", agent_names)
    print("Your chosen agents: left - {}, right - {}".format(
        args.left, args.right))

    assert args.left in agent_names, agent_names
    assert args.right in agent_names, agent_names

    env = make_envs("cPongDouble-v0", num_envs=1, asynchronous=False).envs[0]

    if args.left != "MY_AGENT":
        left = get_compute_action_function(args.left)
    else:
        left = student_compute_action_function()
    if args.right != "MY_AGENT":
        right = get_compute_action_function(args.right)
    else:
        right = student_compute_action_function()

    result = evaluate_two_policies(
        left,
        right,
        env=env,
        render=False,
Example #16
def __init__(self):
    # self.env = gym.make('CarRacing-v0')
    comp_envs = make_envs('cCarRacing-v0', num_envs=1, action_repeat=1)
    self.env = comp_envs.envs[0]
    self.env.seed(args.seed)
    self.reward_threshold = 910