Example #1
0
def train(args):
    """Run the PPO training loop configured by ``args``.

    Reads ``args.algo``, ``args.num_envs``, ``args.envopt``,
    ``args.trainopt``, ``args.opt``, ``args.seed``, ``args.log_dir``,
    ``args.env_id`` and ``args.max_steps``.  The module-level
    ``env_options`` and ``trainer_options`` are rebound when the matching
    JSON option files are supplied.

    Each iteration collects ``config.num_steps`` steps from the vectorized
    environments, computes returns, updates the trainer and, depending on
    ``config.eval_freq`` / ``config.log_freq`` / ``config.save_freq``,
    evaluates, logs and checkpoints.  The loop stops once ``total_steps``
    exceeds ``args.max_steps``; a final checkpoint is then saved and the
    training environments are closed.

    Raises:
        ValueError: if ``args.algo`` is not ``"PPO"``, or if no
            ``healthy_z_range`` is known for ``args.env_id`` and none is
            given in the environment options.
    """
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must in [PPO]")
    config.num_envs = args.num_envs

    # Optional JSON overrides.  ``args.opt`` is applied last, so its 'env'
    # and 'trainer' sections win over ``args.envopt`` / ``args.trainopt``.
    if args.envopt is not None:
        with open(args.envopt) as f:
            env_options = json.load(f)
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        options=env_options,
    )

    # Per-environment default for healthy_z_range; an explicit entry in
    # env_options takes precedence over the default.
    healthy_z_range = None
    if env_id == "Walker2d-v3":
        healthy_z_range = (0.8, 2.0)
    elif env_id == 'Humanoid-v3':
        healthy_z_range = (1.0, 2.0)
    if 'healthy_z_range' in env_options:
        healthy_z_range = env_options['healthy_z_range']
    if healthy_z_range is None:
        # Previously this surfaced as an UnboundLocalError at gym.make().
        raise ValueError(
            "No healthy_z_range known for env_id {!r}; set "
            "'healthy_z_range' in the env options.".format(env_id))
    eval_env = gym.make(env_id,
                        healthy_z_range=healthy_z_range,
                        healthy_reward=0)
    if env_id == "Walker2d-v3":
        eval_env = Walker2d_wrapper(eval_env, env_options)

    # Full dimensions come from the environment spaces; the "real"
    # dimensions default to them and can be overridden via trainer_options.
    obs_dim = envs.observation_space.shape[0]
    act_dim = envs.action_space.shape[0]
    real_obs_dim = trainer_options.get('real_obs_dim', obs_dim) \
        if 'real_obs_dim' in trainer_options else obs_dim
    real_act_dim = trainer_options.get('real_act_dim', act_dim) \
        if 'real_act_dim' in trainer_options else act_dim
    dim_dict = dict(obs_dim=obs_dim,
                    act_dim=act_dim,
                    real_obs_dim=real_obs_dim,
                    real_act_dim=real_act_dim)

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, trainer_options)
    else:
        raise NotImplementedError

    # Buffer holding the current observations on config.device; it is
    # refreshed through update()/step_envs() and read back with get().
    frame_stack_tensor = FrameStackTensor(num_envs,
                                          envs.observation_space.shape,
                                          config.device)

    # Setup some stats helpers
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(
        reduce_shape(frame_stack_tensor.get(), real_obs_dim))
    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):

                trainer.model.eval()
                # The model acts on the reduced observation ...
                values, actions, action_log_prob = trainer.model.step(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim))

                # ... and its action is enlarged back to the environment's
                # action dimension before stepping.
                cpu_actions = actions.cpu().numpy()
                cpu_actions = enlarge_shape(cpu_actions, act_dim)

                obs, reward, done, info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards, frame_stack_tensor,
                        reward_recorder, episode_length_recorder, total_steps,
                        total_episodes, config.device)

                rewards = torch.from_numpy(reward.astype(np.float32)).view(
                    -1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim),
                    actions, action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        trainer.model.train()
        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, total_loss = trainer.update(
                trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
def train(args):
    """Run the two-branch PPO training loop configured by ``args``.

    Two sets of vectorized environments are created with hard-coded ids:
    ``'Humanoid-v3'`` (branch ``'a'``, index 0) and ``'Walker2d-v3'``
    (branch ``'b'``, index 1).  Every iteration collects
    ``config.num_steps`` steps for each branch, computes returns for both
    rollouts and performs one joint ``trainer.update``.

    Reads ``args.algo``, ``args.num_envs``, ``args.trainopt``,
    ``args.opt``, ``args.seed``, ``args.log_dir``, ``args.env_id`` and
    ``args.max_steps``.  The module-level ``env_options`` and
    ``trainer_options`` are rebound when the matching option files are
    supplied.  ``total_steps`` / ``total_episodes`` follow branch ``'a'``
    only; the loop stops once ``total_steps`` exceeds ``args.max_steps``.

    Raises:
        ValueError: if ``args.algo`` is not ``"PPO"``.
    """
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must in [PPO]")
    config.num_envs = args.num_envs
    config.activation = nn.ReLU
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        # Applied last, so it overrides args.trainopt.
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id

    main_envs = make_envs(
        env_id='Humanoid-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )

    aux_envs = make_envs(
        env_id='Walker2d-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )

    # Index 0 is the main task (branch 'a'), index 1 the auxiliary task
    # (branch 'b'); every per-branch list below uses the same ordering.
    envs = [main_envs, aux_envs]

    # NOTE(review): the evaluation env is built from args.env_id while the
    # main training env id is hard-coded above - confirm callers pass
    # 'Humanoid-v3' so both agree.
    healthy_z_range = (1.0, 2.0)
    eval_env = gym.make(env_id,
                        healthy_z_range=healthy_z_range,
                        healthy_reward=0)

    main_obs_dim = 376
    main_act_dim = 17
    main_reduce_obs_dim = 46
    main_reduce_act_dim = 11
    aux_obs_dim = 17
    aux_act_dim = 6

    # Observations are reduced to obs_dims[i] before reaching the model;
    # model actions are enlarged to act_dims[i] before stepping the env.
    obs_dims = [main_reduce_obs_dim, aux_obs_dim]
    act_dims = [main_act_dim, aux_act_dim]

    dim_dict = dict(obs_a=main_reduce_obs_dim,
                    act_a=main_reduce_act_dim,
                    obs_b=aux_obs_dim,
                    act_b=aux_act_dim,
                    coeff_a=0.4,
                    coeff_b=1)
    dim_dict['act_dim'] = 17
    dim_dict['real_obs_dim'] = 46

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainerMTMT(config, dim_dict)
    else:
        raise NotImplementedError

    frame_stack_tensors = [
        FrameStackTensor(num_envs, main_envs.observation_space.shape,
                         config.device),
        FrameStackTensor(num_envs, aux_envs.observation_space.shape,
                         config.device)
    ]

    # Setup some stats helpers
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    episode_rewards = [
        np.zeros([num_envs, 1], dtype=np.float64),
        np.zeros([num_envs, 1], dtype=np.float64)
    ]

    total_episodes = total_steps = iteration = 0

    reward_recorders = [deque(maxlen=100), deque(maxlen=100)]
    episode_length_recorders = [deque(maxlen=100), deque(maxlen=100)]

    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = [env.reset() for env in envs]
    for stack, first_obs in zip(frame_stack_tensors, obs):
        stack.update(first_obs)

    # Seed slot 0 of each rollout storage with the initial observation.
    for i in range(2):
        trainer.rollouts[i].observations[0].copy_(
            reduce_shape(frame_stack_tensors[i].get(), obs_dims[i]))

    branch_names = ['a', 'b']

    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            # Collect one rollout per branch, main task first.
            for ind in range(2):
                for index in range(config.num_steps):
                    trainer.model.eval()
                    values, actions, action_log_prob = trainer.model.step(
                        reduce_shape(frame_stack_tensors[ind].get(),
                                     obs_dims[ind]),
                        deterministic=False,
                        branch=branch_names[ind])
                    cpu_actions = actions.cpu().numpy()
                    cpu_actions = enlarge_shape(cpu_actions, act_dims[ind])

                    # obs, done, info not needed, we have masks & obs in
                    # frame_stack_tensors
                    (_, reward, _, _, masks, new_total_episodes,
                     new_total_steps, episode_rewards[ind]) = step_envs(
                         cpu_actions, envs[ind], episode_rewards[ind],
                         frame_stack_tensors[ind], reward_recorders[ind],
                         episode_length_recorders[ind], total_steps,
                         total_episodes, config.device)

                    # Only the main branch advances the global counters.
                    if ind == 0:
                        total_episodes = new_total_episodes
                        total_steps = new_total_steps

                    rewards = torch.from_numpy(reward.astype(np.float32)).view(
                        -1, 1).to(config.device)

                    trainer.rollouts[ind].insert(
                        reduce_shape(frame_stack_tensors[ind].get(),
                                     obs_dims[ind]), actions, action_log_prob,
                        values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                for i in range(2):
                    next_value = trainer.compute_values(
                        trainer.rollouts[i].observations[-1], branch_names[i])
                    trainer.rollouts[i].compute_returns(
                        next_value, config.GAMMA)

        trainer.model.train()
        # ===== Update Policy =====
        with update_timer:
            losses = trainer.update(trainer.rollouts[0], trainer.rollouts[1])
            # Transpose per-branch loss tuples into per-loss tuples that
            # are indexed by branch (0 -> 'a', 1 -> 'b') below.
            policy_loss, value_loss, total_loss = list(zip(*losses))
            trainer.rollouts[0].after_update()
            trainer.rollouts[1].after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            # seems ok, by default model is dealing with task1
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward_a=summary(reward_recorders[0],
                                                  "episode_reward"),
                training_episode_length_a=summary(episode_length_recorders[0],
                                                  "episode_length"),
                training_episode_reward_b=summary(reward_recorders[1],
                                                  "episode_reward"),
                training_episode_length_b=summary(episode_length_recorders[1],
                                                  "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats_a=dict(policy_loss=policy_loss[0],
                                      value_loss=value_loss[0],
                                      total_loss=total_loss[0]),
                learning_stats_b=dict(policy_loss=policy_loss[1],
                                      value_loss=value_loss[1],
                                      total_loss=total_loss[1]),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    # ``envs`` is a plain list, so each vectorized env is closed in turn
    # (calling .close() on the list itself raised AttributeError).
    for env in envs:
        env.close()
def train(args):
    """Run the PPO/A2C training loop configured by ``args``.

    Reads ``args.algo`` (``"PPO"`` or ``"A2C"``), ``args.num_envs``,
    ``args.env_id`` (one of ``"cPong-v0"``, ``"CartPole-v0"``,
    ``"cPongTournament-v0"``), ``args.seed``, ``args.log_dir`` and
    ``args.max_steps``.

    ``"CartPole-v0"`` switches the trainer into its ``_test`` mode with a
    frame stack of 1 (4 otherwise).  ``"cPongTournament-v0"`` enables the
    tournament path, which requires PPO and periodically calls
    ``envs.reset_opponent()``.  The loop stops once ``total_steps``
    exceeds ``args.max_steps``; a final checkpoint is then saved and the
    training environments are closed.

    Raises:
        ValueError: if ``args.algo`` is neither ``"PPO"`` nor ``"A2C"``.
        AssertionError: if ``args.env_id`` is not supported, or if the
            tournament environment is combined with an algorithm other
            than PPO.
    """
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must in [PPO, A2C]")
    config.num_envs = args.num_envs
    assert args.env_id in ["cPong-v0", "CartPole-v0",
                           "cPongTournament-v0"]

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments: asynchronous ones for training and
    # synchronous ones for evaluation.
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        resized_dim=config.resized_dim
    )
    eval_envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=False,
        resized_dim=config.resized_dim
    )
    test = env_id == "CartPole-v0"
    tournament = env_id == "cPongTournament-v0"
    frame_stack = 4 if not test else 1
    if tournament:
        assert algo == "PPO", "Using PPO in tournament is a good idea, " \
                              "because of its efficiency compared to A2C."

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Create a placeholder tensor to help stack frames in 2nd dimension
    # That is turn the observation from shape [num_envs, 1, 84, 84] to
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, frame_stack, config.device)

    # Setup some stats helpers
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(frame_stack_tensor.get())
    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                # Choose actions from the observation stored at this step
                # of the rollout; no gradients are needed while sampling.
                with torch.no_grad():
                    values, actions, action_log_prob = trainer.compute_action(
                        trainer.rollouts.observations[index])
                cpu_actions = actions.view(-1).cpu().numpy()

                # Step the environment
                obs, reward, done, info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards, frame_stack_tensor,
                        reward_recorder, episode_length_recorder, total_steps,
                        total_episodes, config.device, test)

                rewards = torch.from_numpy(
                    reward.astype(np.float32)).view(-1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    frame_stack_tensor.get(), actions.view(-1, 1),
                    action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Reset opponent if in tournament mode =====
        # NOTE(review): this is gated on config.num_steps, so the opponent
        # changes every ``num_steps`` iterations rather than on each
        # iteration - confirm that period is intended.
        if tournament and iteration % config.num_steps == 0:
            envs.reset_opponent()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths = evaluate(
                trainer, eval_envs, frame_stack, 20)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            # win_rate: fraction of evaluation episodes with reward >= 0.
            evaluate_stat.update(dict(
                win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) / len(
                        evaluate_rewards)),
                evaluate_time=eval_timer.now,
                evaluate_iteration=iteration
            ))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(
                    policy_loss=policy_loss,
                    entropy=dist_entropy,
                    value_loss=value_loss,
                    total_loss=total_loss
                ),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(
                    sample_time=sample_timer.avg,
                    process_time=process_timer.avg,
                    update_time=update_timer.avg,
                    total_time=total_timer.now,
                    episode_time=sample_timer.avg + process_timer.avg +
                                 update_timer.avg
                ),
                iteration=iteration
            )

            if tournament:
                stats["opponent"] = envs.current_agent_name

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print("Saved trainer state at <{}>. Saved progress at <{}>.".format(
                trainer_path, progress_path
            ))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
Example #4
0
def train(args):
    """Train a PPO/A2C agent on a pair-trading environment.

    Steps performed:

    1. Download daily prices for a fixed list of Hong Kong tickers with
       ``yf.download`` and keep the ``'Close'`` columns.
    2. Add a day-over-day change column per ticker and select the pair of
       tickers whose change series have the highest correlation.
    3. Fit a linear regression between the two change series (excluding
       the last 250 rows) to obtain ``beta``, then derive the spread and
       its mean / standard deviation for the baseline configuration.
    4. Build the training environment from all but the last 250 rows and
       the evaluation environment from the last 250 rows.
    5. Run the sample / process / update / evaluate / log / save loop,
       writing scalars to a TensorBoard ``SummaryWriter``.

    Reads ``args.algo``, ``args.num_envs``, ``args.seed``,
    ``args.log_dir``, ``args.env_name``, ``args.input_length`` and
    ``args.max_steps``; sets ``args.save_log``.

    Raises:
        ValueError: if ``args.algo`` is neither ``"PPO"`` nor ``"A2C"``.
    """
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must in [PPO, A2C]")
    config.num_envs = args.num_envs

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments
    num_envs = args.num_envs
    env_name = args.env_name

    # Prepare tensorboard file
    args.save_log = 'Pairtrding-{}'.format(time.strftime("%Y%m%d-%H%M%S"))
    generate_date = str(datetime.now().date())

    writer = SummaryWriter(args.log_dir + '/runs/' + generate_date + '/' +
                           args.save_log)

    # download stock price data from yahoo finance
    stocklist = [
        '0700.hk', '2318.hk', '3988.hk', '0998.hk', '1398.hk', '3968.hk',
        '0981.hk', '0005.hk'
    ]
    # Tencent, Ping An, Bank of China, China CITIC Bank, ICBC,
    # China Merchants Bank, SMIC, HSBC
    stocktickers = ' '.join(stocklist)

    data = yf.download(tickers=stocktickers,
                       start="2010-01-01",
                       end="2019-12-31")
    # Work on an independent copy so the column assignments below never
    # write into a view of the downloaded frame.
    data = data['Close'].copy()

    # One "<ticker>change" column per ticker: day-over-day price change.
    change_suffix = 'change'
    columnchange = []
    for stock in data.columns:
        name = stock + change_suffix
        columnchange.append(name)
        data[name] = data[stock] - data[stock].shift(1)

    # Correlation of every ordered pair of distinct change columns.
    CorrDict = {}
    for i in columnchange:
        for j in columnchange:
            if i != j:
                CorrDict[(i, j)] = data[i].corr(data[j])

    # Pick the pair with the highest correlation.  A bare max(CorrDict)
    # would compare the key tuples (column names) lexicographically and
    # ignore the correlation values altogether.
    pair = list(max(CorrDict, key=CorrDict.get))
    # pair[0], pair[1]: change columns; pair[2], pair[3]: the matching
    # price columns, recovered by stripping the suffix added above.
    pair.append(pair[0][:-len(change_suffix)])
    pair.append(pair[1][:-len(change_suffix)])
    # Copy so that adding 'Spread' / 'Z-score' does not hit a slice of data.
    dataremain = data[pair].copy()

    from sklearn import linear_model
    import numpy as np
    model = linear_model.LinearRegression()
    # Row 0 is skipped because its change value is NaN (no previous day);
    # the last 250 rows are held out for evaluation.
    model.fit(dataremain[pair[0]][1:-250].to_numpy().reshape(-1, 1),
              y=dataremain[pair[1]][1:-250])
    beta = model.coef_[0]

    dataremain['Spread'] = beta * data[pair[0]] - data[pair[1]]
    Spreadmean = dataremain['Spread'].mean()
    Spreadstd = dataremain['Spread'].std()
    dataremain['Z-score'] = (dataremain['Spread'] - Spreadmean) / Spreadstd

    envs = PairtradingEnv(stock1=dataremain[pair[2]][:-250],
                          stock2=dataremain[pair[3]][:-250])
    eval_envs = PairtradingEnv(stock1=dataremain[pair[2]][-250:],
                               stock2=dataremain[pair[3]][-250:])

    baseline_config = baselineConfig(mean=Spreadmean, std=Spreadstd, beta=beta)
    baseline_trainer = baseline(env=envs, config=baseline_config)

    baseline_eval = baseline(env=eval_envs, config=baseline_config)

    test = env_name == "CartPole-v0"
    frame_stack = args.input_length if not test else 1

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Per-env observation buffer; updated and read back by env index.
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, frame_stack,
        config.device)

    # Setup some stats helpers
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    episode_values = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    while True:  # Break when iteration reaches args.max_steps
        # ===== Sample Data =====
        episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
        # The single environment is reset and rolled out once per slot, so
        # each ``env_id`` column of the rollout storage is filled in turn.
        for env_id in range(num_envs):
            obs = envs.reset()
            frame_stack_tensor.update(obs, env_id)
            trainer.rollouts.observations[0, env_id].copy_(
                frame_stack_tensor.get(env_id))

            with sample_timer:
                for index in range(config.num_steps):
                    # Choose actions from the stored observation; no
                    # gradients are needed while sampling.
                    with torch.no_grad():
                        (values, actions_cash, action_log_prob_cash,
                         actions_beta, action_log_prob_beta) = \
                            trainer.compute_action(
                                trainer.rollouts.observations[index, env_id])

                    # The baseline turns the two policy outputs into the
                    # action that is sent to the environment.
                    act = baseline_trainer.compute_action(
                        actions_cash.view(-1), actions_beta.view(-1))

                    cpu_actions = act

                    # Step the environment
                    obs, reward, done, masks, total_episodes, \
                        total_steps, episode_rewards, episode_values = \
                        step_envs(
                            cpu_actions, envs, env_id, episode_rewards,
                            episode_values, frame_stack_tensor,
                            reward_recorder, episode_length_recorder,
                            total_steps, total_episodes, config.device, test)

                    rewards = torch.from_numpy(
                        np.array(reward).astype(np.float32)).view(-1).to(
                            config.device)
                    # Store samples
                    trainer.rollouts.insert(frame_stack_tensor.get(env_id),
                                            actions_cash.view(-1),
                                            action_log_prob_cash.view(-1),
                                            actions_beta.view(-1),
                                            action_log_prob_beta.view(-1),
                                            values.view(-1), rewards,
                                            masks.view(-1), env_id)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

            # Add training statistics to tensorboard log file
            writer.add_scalar('train_policy_loss', policy_loss, iteration)
            writer.add_scalar('train_value_loss', value_loss, iteration)
            writer.add_scalar('train_dist_entropy', dist_entropy, iteration)
            writer.add_scalar('train_total_loss', total_loss, iteration)
            writer.add_scalar('train_episode_rewards',
                              np.mean(episode_rewards), iteration)
            writer.add_scalar('train_episode_values',
                              np.array(episode_values).mean(), iteration)

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths, evaluate_values = evaluate(
                trainer, eval_envs, baseline_eval, frame_stack, 5)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            # win_rate: fraction of evaluation episodes with reward >= 0.
            evaluate_stat.update(
                dict(win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) /
                    len(evaluate_rewards)),
                     evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration,
                     evaluate_values=float(np.array(evaluate_values).mean())))

            # Add evaluation statistics to tensorboard log file
            writer.add_scalar('eval_episode_rewards',
                              np.array(evaluate_rewards).mean(),
                              iteration // config.eval_freq)
            writer.add_scalar('eval_episode_values',
                              np.array(evaluate_values).mean(),
                              iteration // config.eval_freq)

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_values=summary(episode_values,
                                                "episode_value"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    entropy=dist_entropy,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # NOTE(review): here args.max_steps bounds the number of
        # iterations, not the number of environment steps - confirm.
        if iteration >= args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    # Flush pending scalars and release the event file.
    writer.close()
    envs.close()
Example #5
0
def _train(trainer, envs, eval_envs, config, num_envs, algo, log_dir,
           tournament, test):
    """Run the sample / process / update loop until the step budget is spent.

    Every iteration collects ``config.num_steps`` transitions from ``envs``
    into ``trainer.rollouts``, computes returns, calls ``trainer.update`` and
    then, depending on the iteration counter, evaluates, logs and checkpoints.

    Args:
        trainer: Object exposing ``compute_action``, ``compute_values``,
            ``process_obs``, ``update`` and ``save_w``, together with the
            ``model``, ``rollouts`` and ``discrete`` attributes used below.
        envs: Training environments handed to ``step_envs``. In tournament
            mode it must also provide ``reset_opponent()`` and
            ``current_agent_name``.
        eval_envs: Environments handed to ``evaluate``; ``None`` disables
            evaluation.
        config: Settings object; this function reads ``num_steps``,
            ``device``, ``gamma``, ``eval_freq``, ``log_freq`` and
            ``save_freq``.
        num_envs: Number of rows in the per-environment reward accumulator.
        algo: Algorithm name, only used in the log banner.
        log_dir: Directory passed to ``trainer.save_w`` and ``save_progress``.
        tournament: When truthy, the opponent is periodically reset and its
            name is added to the logged stats.
        test: Not read anywhere in this part of the function.

    Note:
        The stop condition reads ``args.max_steps``. ``args`` is not a
        parameter, so it has to be resolvable from module scope.
    """
    # Clearing the notebook cell is purely cosmetic, so resolve the helper
    # once and keep training usable when IPython is not installed.
    try:
        from IPython.display import clear_output
    except ImportError:
        clear_output = None

    # Setup some stats helpers
    # np.float64 is the dtype the removed ``np.float`` alias used to select.
    episode_rewards = np.zeros([num_envs, 1], dtype=np.float64)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}  # Keeps the latest evaluation result between evals
    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                # Get action
                if hasattr(trainer.model, 'reset_state'):
                    trainer.model.reset_state()
                with torch.no_grad():
                    values, actions, action_log_prob = trainer.compute_action(
                        trainer.rollouts.processed_observations[index])
                    trainer.model.update_hidden(actions)

                if trainer.discrete:
                    # Flatten the action tensor before handing it to the envs
                    cpu_actions = actions.view(-1).cpu().numpy()
                else:
                    cpu_actions = actions.cpu().numpy()

                # Step the environment; step_envs also returns the updated
                # running counters and per-env reward accumulator.
                (obs, reward, done, info, masks, total_episodes, total_steps,
                 episode_rewards) = step_envs(
                     cpu_actions, envs, episode_rewards, reward_recorder,
                     episode_length_recorder, total_steps, total_episodes,
                     config.device)

                rewards = torch.from_numpy(reward.astype(np.float32)).view(
                    -1, 1).to(config.device)

                # Store samples
                if trainer.discrete:
                    actions = actions.view(-1, 1)

                with torch.no_grad():
                    raw_obs = trainer.process_obs(obs)
                    processed_obs = trainer.model.world_model(raw_obs).detach()
                trainer.rollouts.insert(obs, actions, action_log_prob, values,
                                        rewards, masks, processed_obs)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                # Value of the last stored observation, passed on to
                # compute_returns together with gamma.
                next_value = trainer.compute_values(
                    trainer.rollouts.processed_observations[-1])
            trainer.rollouts.compute_returns(next_value, config.gamma)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = trainer.update(
                trainer.rollouts)
            trainer.model.reset_state()
            trainer.rollouts.after_update()

        # ===== Reset opponent if in tournament mode =====
        # NOTE(review): the period here is config.num_steps (the rollout
        # length), so the opponent changes every num_steps iterations rather
        # than every iteration -- confirm this is intended.
        if tournament and iteration % config.num_steps == 0:
            envs.reset_opponent()

        # ===== Evaluate Current Policy =====
        if eval_envs is not None and iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths = evaluate(
                trainer, eval_envs, 20)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            # win_rate is the fraction of evaluation rewards that are >= 0
            evaluate_stat.update(
                dict(win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) /
                    len(evaluate_rewards)),
                     evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(
                    policy_loss=policy_loss,
                    entropy=dist_entropy,
                    value_loss=value_loss,
                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            if tournament:
                stats["opponent"] = envs.current_agent_name

            progress.append(stats)

            if clear_output is not None:
                clear_output()

            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })
            progress_path = save_progress(log_dir, progress)

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # NOTE(review): ``args`` is not a parameter of this function; it must
        # exist at module level or this raises NameError.
        if total_steps > int(args.max_steps):
            break

        iteration += 1