def test_ppo(args=get_args()):
    torch.set_num_threads(1)  # for poor CPU
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    # orthogonal initialization
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(actor,
                       critic,
                       optim,
                       dist,
                       args.gamma,
                       max_grad_norm=args.max_grad_norm,
                       eps_clip=args.eps_clip,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       action_range=None,
                       gae_lambda=args.gae_lambda,
                       reward_normalization=args.rew_norm,
                       dual_clip=args.dual_clip,
                       value_clip=args.value_clip)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              save_fn=save_fn,
                              writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Esempio n. 2
0
def test_ppo(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape,
              hidden_sizes=args.hidden_sizes,
              device=args.device)
    actor = Actor(net, args.action_shape, device=args.device).to(args.device)
    critic = Critic(net, device=args.device).to(args.device)
    # orthogonal initialization
    for m in set(actor.modules()).union(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(set(actor.parameters()).union(
        critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(actor,
                       critic,
                       optim,
                       dist,
                       discount_factor=args.gamma,
                       max_grad_norm=args.max_grad_norm,
                       eps_clip=args.eps_clip,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       gae_lambda=args.gae_lambda,
                       reward_normalization=args.rew_norm,
                       dual_clip=args.dual_clip,
                       value_clip=args.value_clip,
                       action_space=env.action_space,
                       deterministic_eval=True,
                       advantage_normalization=args.norm_adv,
                       recompute_advantage=args.recompute_adv)
    # collector
    train_collector = Collector(
        policy, train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              step_per_collect=args.step_per_collect,
                              stop_fn=stop_fn,
                              save_fn=save_fn,
                              logger=logger)
    assert stop_fn(result['best_reward'])

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
Esempio n. 3
0
def build_policy(no, args):
    if no == 0:
        # server policy
        net = Net(args.layer_num, args.state_shape, device=args.device)
        actor = ServerActor(net, (10, )).to(args.device)
        critic = Critic(net).to(args.device)
        # orthogonal initialization
        for m in list(actor.modules()) + list(critic.modules()):
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.orthogonal_(m.weight)
                torch.nn.init.zeros_(m.bias)
        optim = torch.optim.Adam(list(actor.parameters()) +
                                 list(critic.parameters()),
                                 lr=args.lr)
        dist = torch.distributions.Categorical
        policy = A2CPolicy(actor,
                           critic,
                           optim,
                           dist,
                           discount_factor=args.gamma,
                           gae_lambda=args.gae_lambda,
                           vf_coef=args.vf_coef,
                           ent_coef=args.ent_coef,
                           max_grad_norm=args.max_grad_norm,
                           reward_normalization=args.rew_norm)
    elif no == 1:
        # ND policy
        net = Net(args.layer_num, (4, ), device=args.device)
        actor = NFActor(net, (10, )).to(args.device)
        critic = Critic(net).to(args.device)
        # orthogonal initialization
        for m in list(actor.modules()) + list(critic.modules()):
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.orthogonal_(m.weight)
                torch.nn.init.zeros_(m.bias)
        optim = torch.optim.Adam(list(actor.parameters()) +
                                 list(critic.parameters()),
                                 lr=args.lr)
        dist = torch.distributions.Categorical
        policy = A2CPolicy(actor,
                           critic,
                           optim,
                           dist,
                           discount_factor=args.gamma,
                           gae_lambda=args.gae_lambda,
                           vf_coef=args.vf_coef,
                           ent_coef=args.ent_coef,
                           max_grad_norm=args.max_grad_norm,
                           reward_normalization=args.rew_norm)
    elif no == 2:
        net = Net(args.layer_num, (4, ), device=args.device)
        actor = RelayActor(net, (10, )).to(args.device)
        critic = Critic(net).to(args.device)
        # orthogonal initialization
        for m in list(actor.modules()) + list(critic.modules()):
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.orthogonal_(m.weight)
                torch.nn.init.zeros_(m.bias)
        optim = torch.optim.Adam(list(actor.parameters()) +
                                 list(critic.parameters()),
                                 lr=args.lr)
        dist = torch.distributions.Categorical
        policy = A2CPolicy(actor,
                           critic,
                           optim,
                           dist,
                           discount_factor=args.gamma,
                           gae_lambda=args.gae_lambda,
                           vf_coef=args.vf_coef,
                           ent_coef=args.ent_coef,
                           max_grad_norm=args.max_grad_norm,
                           reward_normalization=args.rew_norm)
    else:
        net = Net(args.layer_num, (4, ), device=args.device)
        actor = NFActor(net, (10, )).to(args.device)
        critic = Critic(net).to(args.device)
        # orthogonal initialization
        for m in list(actor.modules()) + list(critic.modules()):
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.orthogonal_(m.weight)
                torch.nn.init.zeros_(m.bias)
        optim = torch.optim.Adam(list(actor.parameters()) +
                                 list(critic.parameters()),
                                 lr=args.lr)
        dist = torch.distributions.Categorical
        policy = A2CPolicy(actor,
                           critic,
                           optim,
                           dist,
                           discount_factor=args.gamma,
                           gae_lambda=args.gae_lambda,
                           vf_coef=args.vf_coef,
                           ent_coef=args.ent_coef,
                           max_grad_norm=args.max_grad_norm,
                           reward_normalization=args.rew_norm)
    return policy
Esempio n. 4
0
# config random seed
np.random.seed(config['seed'])
torch.manual_seed(config['seed'])
train_envs.seed(config['seed'])

state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n

net = Net(training_config['layer_num'], state_shape,
          device=config['device']).to(config['device'])
actor = Actor(net, action_shape).to(config['device'])
critic = Critic(net).to(config['device'])

# orthogonal initialization
for m in list(actor.modules()) + list(critic.modules()):
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.orthogonal_(m.weight)
        torch.nn.init.zeros_(m.bias)

optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()),
                         lr=training_config['learning_rate'])
dist = torch.distributions.Categorical

policy = PPOPolicy(actor,
                   critic,
                   optim,
                   dist,
                   training_config['gamma'],
                   eps_clip=training_config["eps_clip"],
                   vf_coef=training_config["vf_coef"],