Example #1
def __init__(self, num_envs=1, log_dir="", suffix=""):
    self.resized_dim = 42
    env = make_envs(num_envs=1, resized_dim=self.resized_dim)
    self.obs_shape = env.observation_space.shape
    self.agent = PPOTrainer(env, ppo_config)
    if log_dir:  # log_dir is None only in testing
        self.agent.load_w(log_dir, suffix)
    self.num_envs = num_envs
    self.frame_stack = FrameStackTensor(self.num_envs, self.obs_shape, 4,
                                        self.agent.device)
Example #2
class PolicyAPI:
    """
    This class wrap an agent into a callable function that return action given
    an raw observation or a batch of raw observations from environment.

    This function maintain a frame stacker so that the user can securely use it.
    A reset function is provided so user can refresh the frame stacker when
    an episode is ended.

    Note that if you have implement other arbitrary custom agent, you are
    welcomed to implement a function-like API by yourself. You can write
    another API function or class and replace this one used in evaluation or
    even training.

    Your custom agent may have different network structure and different
    preprocess techniques. Remember that the API take the raw observation with
    shape (num_envs, 1, 42, 42) as input and return an single or a batch of
    integer(s) as action in [0, 1, 2]. Custom agent worth plenty of extra
    credits!
    """
    def __init__(self, num_envs=1, log_dir="", suffix=""):
        self.resized_dim = 42
        env = make_envs(num_envs=1, resized_dim=self.resized_dim)
        self.obs_shape = env.observation_space.shape
        self.agent = PPOTrainer(env, ppo_config)
        if log_dir:  # log_dir is None only in testing
            self.agent.load_w(log_dir, suffix)
        self.num_envs = num_envs
        self.frame_stack = FrameStackTensor(self.num_envs, self.obs_shape, 4,
                                            self.agent.device)

    def reset(self):
        # A potential bug is that the frame stack is not properly reset in
        # a vectorized environment. We assume this will not significantly
        # impact performance.
        self.frame_stack.reset()

    def __call__(self, obs):
        self.frame_stack.update(obs)
        action = self.agent.compute_action(self.frame_stack.get(), True)[1]
        if self.num_envs == 1:
            action = action.item()
        else:
            action = action.cpu().numpy()
        return action
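A minimal usage sketch of the PolicyAPI above, assuming a single environment created with make_envs as in __init__; the checkpoint path, suffix, and episode loop are illustrative placeholders, not part of the original code.

# Hypothetical evaluation loop; log_dir and suffix values are placeholders.
env = make_envs(num_envs=1, resized_dim=42)
policy = PolicyAPI(num_envs=1, log_dir="work_dirs/ppo", suffix="final")

obs = env.reset()          # raw observation, shape (1, 1, 42, 42)
policy.reset()             # refresh the frame stacker at episode start
episode_reward, done = 0.0, False
while not done:
    action = policy(obs)                      # integer action in [0, 1, 2]
    obs, reward, done, info = env.step([action])
    done = bool(done[0])
    episode_reward += float(reward[0])
print("episode reward:", episode_reward)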
Example #3
def train(args):
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs
    if args.envopt is not None:
        with open(args.envopt) as f:
            env_options = json.load(f)
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        options=env_options,
    )

    if env_id == "Walker2d-v3":
        healthy_z_range = (0.8, 2.0)
    elif env_id == 'Humanoid-v3':
        healthy_z_range = (1.0, 2.0)
    if 'healthy_z_range' in env_options:
        healthy_z_range = env_options['healthy_z_range']
    eval_env = gym.make(env_id,
                        healthy_z_range=healthy_z_range,
                        healthy_reward=0)
    if env_id == "Walker2d-v3":
        eval_env = Walker2d_wrapper(eval_env, env_options)

    obs_dim = envs.observation_space.shape[0]
    act_dim = envs.action_space.shape[0]
    real_obs_dim = obs_dim
    real_act_dim = act_dim
    if 'real_obs_dim' in trainer_options:
        real_obs_dim = trainer_options['real_obs_dim']
    if 'real_act_dim' in trainer_options:
        real_act_dim = trainer_options['real_act_dim']
    dim_dict = dict(obs_dim=obs_dim,
                    act_dim=act_dim,
                    real_obs_dim=real_obs_dim,
                    real_act_dim=real_act_dim)

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, trainer_options)
    else:
        raise NotImplementedError

    # Create a placeholder tensor to help stack frames in the 2nd dimension.
    # That is, turn the observation from shape [num_envs, 1, 84, 84] to
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(num_envs,
                                          envs.observation_space.shape,
                                          config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(
        reduce_shape(frame_stack_tensor.get(), real_obs_dim))
    while True:  # Break when total_steps exceeds maximum value
        with sample_timer:
            for index in range(config.num_steps):

                trainer.model.eval()
                values, actions, action_log_prob = trainer.model.step(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim))

                cpu_actions = actions.cpu().numpy()
                cpu_actions = enlarge_shape(cpu_actions, act_dim)

                obs, reward, done, info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards, frame_stack_tensor,
                        reward_recorder, episode_length_recorder, total_steps,
                        total_episodes, config.device)

                rewards = torch.from_numpy(reward.astype(np.float32)).view(
                    -1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim),
                    actions, action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        trainer.model.train()
        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, total_loss = trainer.update(
                trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # [TODO] Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
Example #4
def train(args):
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs
    config.activation = nn.ReLU
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id

    main_envs = make_envs(
        env_id='Humanoid-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )

    aux_envs = make_envs(
        env_id='Walker2d-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )

    envs = [main_envs, aux_envs]

    # eval_env is main_env
    healthy_z_range = (1.0, 2.0)
    eval_env = gym.make(env_id,
                        healthy_z_range=healthy_z_range,
                        healthy_reward=0)

    main_obs_dim = 376
    main_act_dim = 17
    main_reduce_obs_dim = 46
    main_reduce_act_dim = 11
    aux_obs_dim = 17
    aux_act_dim = 6

    obs_dims = [main_reduce_obs_dim, aux_obs_dim]
    act_dims = [main_act_dim, aux_act_dim]

    dim_dict = dict(obs_a=main_reduce_obs_dim,
                    act_a=main_reduce_act_dim,
                    obs_b=aux_obs_dim,
                    act_b=aux_act_dim,
                    coeff_a=0.4,
                    coeff_b=1)
    dim_dict['act_dim'] = 17
    dim_dict['real_obs_dim'] = 46

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainerMTMT(config, dim_dict)
    else:
        raise NotImplementedError

    frame_stack_tensors = [
        FrameStackTensor(num_envs, main_envs.observation_space.shape,
                         config.device),
        FrameStackTensor(num_envs, aux_envs.observation_space.shape,
                         config.device)
    ]

    # Setup some stats helpers
    episode_rewards = [
        np.zeros([num_envs, 1], dtype=np.float64),
        np.zeros([num_envs, 1], dtype=np.float64)
    ]

    total_episodes = total_steps = iteration = 0

    reward_recorders = [deque(maxlen=100), deque(maxlen=100)]
    episode_length_recorders = [deque(maxlen=100), deque(maxlen=100)]

    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = [envs[i].reset() for i in range(2)]
    _ = [frame_stack_tensors[i].update(obs[i]) for i in range(2)]

    # first update
    for i in range(2):
        trainer.rollouts[i].observations[0].copy_(
            reduce_shape(frame_stack_tensors[i].get(), obs_dims[i]))

    branch_names = ['a', 'b']

    while True:  # Break when total_steps exceeds maximum value
        with sample_timer:
            # prepare rollout a
            for ind in range(2):
                for index in range(config.num_steps):
                    trainer.model.eval()
                    values, actions, action_log_prob = trainer.model.step(
                        reduce_shape(frame_stack_tensors[ind].get(),
                                     obs_dims[ind]),
                        deterministic=False,
                        branch=branch_names[ind])
                    cpu_actions = actions.cpu().numpy()
                    cpu_actions = enlarge_shape(cpu_actions, act_dims[ind])

                    # obs, done, info not needed, we have masks & obs in frame_stack_tensors
                    _, reward, _, _, masks, new_total_episodes, new_total_steps, episode_rewards[ind] = \
                        step_envs(cpu_actions, envs[ind], episode_rewards[ind], frame_stack_tensors[ind],
                                  reward_recorders[ind], episode_length_recorders[ind],
                                  total_steps, total_episodes, config.device)

                    if ind == 0:
                        total_episodes = new_total_episodes
                        total_steps = new_total_steps

                    rewards = torch.from_numpy(reward.astype(np.float32)).view(
                        -1, 1).to(config.device)

                    trainer.rollouts[ind].insert(
                        reduce_shape(frame_stack_tensors[ind].get(),
                                     obs_dims[ind]), actions, action_log_prob,
                        values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                for i in range(2):
                    next_value = trainer.compute_values(
                        trainer.rollouts[i].observations[-1], branch_names[i])
                    trainer.rollouts[i].compute_returns(
                        next_value, config.GAMMA)

        trainer.model.train()
        # ===== Update Policy =====
        with update_timer:
            losses = trainer.update(trainer.rollouts[0], trainer.rollouts[1])
            policy_loss, value_loss, total_loss = list(zip(*losses))
            trainer.rollouts[0].after_update()
            trainer.rollouts[1].after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            # seems ok, by default model is dealing with task1
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward_a=summary(reward_recorders[0],
                                                  "episode_reward"),
                training_episode_length_a=summary(episode_length_recorders[0],
                                                  "episode_length"),
                training_episode_reward_b=summary(reward_recorders[1],
                                                  "episode_reward"),
                training_episode_length_b=summary(episode_length_recorders[1],
                                                  "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats_a=dict(policy_loss=policy_loss[0],
                                      value_loss=value_loss[0],
                                      total_loss=total_loss[0]),
                learning_stats_b=dict(policy_loss=policy_loss[1],
                                      value_loss=value_loss[1],
                                      total_loss=total_loss[1]),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # [TODO] Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    for env in envs:
        env.close()
Example #5
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must be in [PPO, A2C]")
    config.num_envs = args.num_envs

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments
    num_envs = args.num_envs
    env_name = args.env_name

    # Prepare tensorboard file
    args.save_log = 'Pairtrding-{}'.format(time.strftime("%Y%m%d-%H%M%S"))
    generate_date = str(datetime.now().date())

    writer = SummaryWriter(args.log_dir + '/runs/' + generate_date + '/' +
                           args.save_log)

    # download stock price data from yahoo finance
    stocklist = [
        '0700.hk', '2318.hk', '3988.hk', '0998.hk', '1398.hk', '3968.hk',
        '0981.hk', '0005.hk'
    ]
    # Tencent, Ping An, Bank of China, CITIC Bank, ICBC, China Merchants Bank, SMIC, HSBC
    stocktickers = ' '.join(stocklist)

    data = yf.download(tickers=stocktickers,
                       start="2010-01-01",
                       end="2019-12-31")
    data = data['Close']
    columnchange = []
    for stock in data.columns:
        name = stock + 'change'
        columnchange.append(name)
        data[name] = data[stock] - data[stock].shift(1)

    CorrDict = {}
    for i in columnchange:
        for j in columnchange:
            if i != j and (i, j) not in CorrDict:
                CorrDict[(i, j)] = data[i].corr(data[j])
    pair = list(max(CorrDict, key=CorrDict.get))
    pair.append(pair[0][:7])
    pair.append(pair[1][:7])
    dataremain = data[pair]

    from sklearn import linear_model
    import numpy as np
    model = linear_model.LinearRegression()
    model.fit(dataremain[pair[0]][1:-250].to_numpy().reshape(-1, 1),
              y=dataremain[pair[1]][1:-250])
    beta = model.coef_[0]

    dataremain['Spread'] = beta * data[pair[0]] - data[pair[1]]
    Spreadmean = dataremain['Spread'].mean()
    Spreadstd = dataremain['Spread'].std()
    dataremain['Z-score'] = (dataremain['Spread'] - Spreadmean) / Spreadstd

    envs = PairtradingEnv(stock1=dataremain[pair[2]][:-250],
                          stock2=dataremain[pair[3]][:-250])
    eval_envs = PairtradingEnv(stock1=dataremain[pair[2]][-250:],
                               stock2=dataremain[pair[3]][-250:])

    baseline_config = baselineConfig(mean=Spreadmean, std=Spreadstd, beta=beta)
    baseline_trainer = baseline(env=envs, config=baseline_config)

    baseline_eval = baseline(env=eval_envs, config=baseline_config)

    test = env_name == "CartPole-v0"
    frame_stack = args.input_length if not test else 1

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Create a placeholder tensor to help stack frames in the 2nd dimension.
    # That is, turn the observation from shape [num_envs, 1, 42, 42] to
    # [num_envs, frame_stack, 42, 42]. (A standalone sketch of such a frame
    # stacker is given after this function.)
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, frame_stack,
        config.device)  # envs.observation_space.shape: 1,42,42

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    episode_values = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        # episode_values = []
        episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
        for env_id in range(num_envs):
            obs = envs.reset()  # obs.shape: 15,1,42,42
            frame_stack_tensor.update(obs, env_id)
            trainer.rollouts.observations[0, env_id].copy_(
                frame_stack_tensor.get(env_id)
            )  #trainer.rollouts.observations.shape: torch.Size([201, 15, 4, 42, 42])

            with sample_timer:
                for index in range(config.num_steps):
                    # Get action
                    # [TODO] Get the action
                    # Hint:
                    #   1. Remember to disable gradient computing
                    #   2. trainer.rollouts is a storage containing all data
                    #   3. What observation is needed for trainer.compute_action?
                    with torch.no_grad():
                        values, actions_cash, action_log_prob_cash, actions_beta, action_log_prob_beta = trainer.compute_action(
                            trainer.rollouts.observations[index, env_id])

                    act = baseline_trainer.compute_action(
                        actions_cash.view(-1), actions_beta.view(-1))

                    cpu_actions = act

                    # Step the environment
                    # (Check step_envs function, you need to implement it)
                    obs, reward, done, masks, total_episodes, \
                    total_steps, episode_rewards, episode_values = step_envs(
                        cpu_actions, envs, env_id, episode_rewards, episode_values, frame_stack_tensor,
                        reward_recorder, episode_length_recorder, total_steps,
                        total_episodes, config.device, test)

                    rewards = torch.from_numpy(
                        np.array(reward).astype(np.float32)).view(-1).to(
                            config.device)
                    # Store samples
                    trainer.rollouts.insert(frame_stack_tensor.get(env_id),
                                            actions_cash.view(-1),
                                            action_log_prob_cash.view(-1),
                                            actions_beta.view(-1),
                                            action_log_prob_beta.view(-1),
                                            values.view(-1), rewards,
                                            masks.view(-1), env_id)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

            # Add training statistics to tensorboard log file
            writer.add_scalar('train_policy_loss', policy_loss, iteration)
            writer.add_scalar('train_value_loss', value_loss, iteration)
            writer.add_scalar('train_dist_entropy', dist_entropy, iteration)
            writer.add_scalar('train_total_loss', total_loss, iteration)
            writer.add_scalar('train_episode_rewards',
                              np.mean(episode_rewards), iteration)
            writer.add_scalar('train_episode_values',
                              np.array(episode_values).mean(), iteration)

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths, evaluate_values = evaluate(
                trainer, eval_envs, baseline_eval, frame_stack, 5)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            evaluate_stat.update(
                dict(win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) /
                    len(evaluate_rewards)),
                     evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration,
                     evaluate_values=float(np.array(evaluate_values).mean())))

            # Add evaluation statistics to tensorboard log file
            writer.add_scalar('eval_episode_rewards',
                              np.array(evaluate_rewards).mean(),
                              iteration // config.eval_freq)
            writer.add_scalar('eval_episode_values',
                              np.array(evaluate_values).mean(),
                              iteration // config.eval_freq)

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_values=summary(episode_values,
                                                "episode_value"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    entropy=dist_entropy,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        if iteration >= args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
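The "placeholder tensor" comment in the training loop above describes a frame stacker that turns single-channel observations of shape [num_envs, 1, H, W] into stacked observations of shape [num_envs, frame_stack, H, W]. Below is a minimal standalone sketch of such a buffer; the class name, method signatures, and update semantics are illustrative assumptions and may differ from the FrameStackTensor actually used in these examples.

import numpy as np
import torch


class NaiveFrameStack:
    """Keeps the last `stack_size` frames per environment in one tensor."""

    def __init__(self, num_envs, obs_shape, stack_size, device):
        c, h, w = obs_shape  # per-env observation shape, e.g. (1, 42, 42)
        self.channels = c
        self.buffer = torch.zeros(num_envs, stack_size * c, h, w, device=device)

    def update(self, obs):
        # Drop the oldest frame and append the newest one at the end.
        obs = torch.as_tensor(np.asarray(obs), dtype=torch.float32,
                              device=self.buffer.device)
        self.buffer = torch.cat([self.buffer[:, self.channels:], obs], dim=1)

    def reset(self):
        # Clear all stacked frames, e.g. at the start of a new episode.
        self.buffer.zero_()

    def get(self):
        return self.buffer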
Example #6
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must be in [PPO, A2C]")
    config.num_envs = args.num_envs
    assert args.env_id in ["cPong-v0", "CartPole-v0",
                           "cPongTournament-v0"]

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        resized_dim=config.resized_dim
    )
    eval_envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=False,
        resized_dim=config.resized_dim
    )
    test = env_id == "CartPole-v0"
    tournament = env_id == "cPongTournament-v0"
    frame_stack = 4 if not test else 1
    if tournament:
        assert algo == "PPO", "Using PPO in tournament is a good idea, " \
                              "because of its efficiency compared to A2C."

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Create a placeholder tensor to help stack frames in the 2nd dimension.
    # That is, turn the observation from shape [num_envs, 1, 84, 84] to
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, frame_stack, config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(frame_stack_tensor.get())
    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                # Get action
                # [TODO] Get the action
                # Hint:
                #   1. Remember to disable gradient computing
                #   2. trainer.rollouts is a storage containing all data
                #   3. What observation is needed for trainer.compute_action?
                with torch.no_grad():
                    values, actions, action_log_prob = trainer.compute_action(trainer.rollouts.observations[index])
                cpu_actions = actions.view(-1).cpu().numpy()

                # Step the environment
                # (Check step_envs function, you need to implement it)
                obs, reward, done, info, masks, total_episodes, \
                total_steps, episode_rewards = step_envs(
                    cpu_actions, envs, episode_rewards, frame_stack_tensor,
                    reward_recorder, episode_length_recorder, total_steps,
                    total_episodes, config.device, test)

                rewards = torch.from_numpy(
                    reward.astype(np.float32)).view(-1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    frame_stack_tensor.get(), actions.view(-1, 1),
                    action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Reset opponent if in tournament mode =====
        if tournament and iteration % config.num_steps == 0:
            # Randomly choose one agent in each iteration
            envs.reset_opponent()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths = evaluate(
                trainer, eval_envs, frame_stack, 20)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            evaluate_stat.update(dict(
                win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) / len(
                        evaluate_rewards)),
                evaluate_time=eval_timer.now,
                evaluate_iteration=iteration
            ))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(
                    policy_loss=policy_loss,
                    entropy=dist_entropy,
                    value_loss=value_loss,
                    total_loss=total_loss
                ),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(
                    sample_time=sample_timer.avg,
                    process_time=process_timer.avg,
                    update_time=update_timer.avg,
                    total_time=total_timer.now,
                    episode_time=sample_timer.avg + process_timer.avg +
                                 update_timer.avg
                ),
                iteration=iteration
            )

            if tournament:
                stats["opponent"] = envs.current_agent_name

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print("Saved trainer state at <{}>. Saved progress at <{}>.".format(
                trainer_path, progress_path
            ))

        # [TODO] Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()