Exemple #1
0
def test_dqn(args=get_args()):
    """Train DQN on ``args.task``, log to TensorBoard, save the best policy,
    and assert the agent reaches the environment's reward threshold.

    NOTE(review): the ``args=get_args()`` default is evaluated once at import
    time (CLI parsing happens on module load) — intentional for this example
    script, but a pitfall elsewhere.
    """
    env = gym.make(args.task)
    # Discrete spaces expose ``n`` instead of ``shape``; support both.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed everything for reproducibility
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # build the Q-network and its optimizer
    net = Net(args.layer_num, args.state_shape, args.action_shape, args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(
        net, optim, args.gamma, args.n_step,
        # a target network is used only when a positive update freq is given
        use_target_network=args.target_update_freq > 0,
        target_update_freq=args.target_update_freq)
    # collectors drive rollouts for training and evaluation
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # pre-fill the replay buffer so the first gradient step has data
    train_collector.collect(n_step=args.batch_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'dqn')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # Persist the best policy found so far.
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # ``x`` is the best mean test reward; stop once the env is solved.
        return x >= env.spec.reward_threshold

    def train_fn(x):
        # High exploration rate during training.
        policy.set_eps(args.eps_train)

    def test_fn(x):
        # Low (near-greedy) exploration rate during evaluation.
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, test_fn=test_fn,
        stop_fn=stop_fn, save_fn=save_fn, writer=writer)

    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    pprint.pprint(result)
def watch(
    args: argparse.Namespace = get_args(),
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
) -> None:
    """Play one rendered Tic-Tac-Toe episode with the given agents."""
    board_env = TicTacToeEnv(args.board_size, args.win_size)
    # The optimizer returned by get_agents is not needed for evaluation.
    policy, _ = get_agents(args,
                           agent_learn=agent_learn,
                           agent_opponent=agent_opponent)
    eval_collector = Collector(policy, board_env)
    stats = eval_collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {stats["rew"]}, length: {stats["len"]}')
    eval_collector.close()
Exemple #3
0
def test_ppo(args=get_args()):
    """Train PPO on an Atari task and optionally render an evaluation rollout.

    NOTE(review): ``args=get_args()`` is evaluated once at import time —
    acceptable for an example script.
    """
    env = create_atari_environment(args.task)
    # Discrete spaces expose ``n`` instead of ``shape``; support both.
    # NOTE(review): this env exposes ``action_space`` as a callable —
    # presumed part of the atari-wrapper API; confirm against
    # create_atari_environment.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space().shape or env.action_space().n
    # vectorized training / evaluation environments
    train_envs = SubprocVectorEnv([
        lambda: create_atari_environment(args.task)
        for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv([
        lambda: create_atari_environment(args.task)
        for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: actor and critic share the feature extractor ``net``
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(
        actor, critic, optim, dist, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        action_range=None)
    # collector (preprocess_fn converts raw frames before storage)
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size),
        preprocess_fn=preprocess_fn)
    test_collector = Collector(policy, test_envs, preprocess_fn=preprocess_fn)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    # BUG FIX: the original stop_fn tested ``env.env.spec.reward_threshold``
    # but compared against ``env.spec.reward_threshold`` — two different
    # attribute paths; whichever the wrapper lacks would raise.  Resolve the
    # spec once, preferring the outer env and falling back to the inner one.
    spec = getattr(env, 'spec', None) or \
        getattr(getattr(env, 'env', None), 'spec', None)

    def stop_fn(x):
        # Stop early only when the env defines a reward threshold.
        if spec is not None and spec.reward_threshold:
            return x >= spec.reward_threshold
        return False

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env, preprocess_fn=preprocess_fn)
        result = collector.collect(n_step=2000, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Exemple #4
0
def test_ddpg(args=get_args()):
    """Train DDPG on a continuous-control task and assert it gets solved."""
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # Pendulum ships without a threshold; use -250 as the target.
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # vectorized rollout / evaluation environments
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # make runs reproducible
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # actor / critic networks with their own optimizers
    actor_net = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_opt = torch.optim.Adam(actor_net.parameters(), lr=args.actor_lr)
    critic_net = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic_opt = torch.optim.Adam(critic_net.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor_net, actor_opt, critic_net, critic_opt,
        args.tau, args.gamma, args.exploration_noise,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collectors feed the replay buffer / evaluate the policy
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # TensorBoard logging
    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        # ``x`` is the best mean test reward seen so far.
        return x >= env.spec.reward_threshold

    # run the off-policy training loop
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Exemple #5
0
def test_dqn(args=get_args()):
    """Train DQN on ``args.task`` and assert the reward threshold is met."""
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # vectorized envs for rollout and evaluation
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seeding for reproducibility
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # Q-network and optimizer
    q_net = Net(args.layer_num, args.state_shape, args.action_shape,
                args.device).to(args.device)
    optimizer = torch.optim.Adam(q_net.parameters(), lr=args.lr)
    policy = DQNPolicy(
        q_net, optimizer, args.gamma, args.n_step,
        use_target_network=args.target_update_freq > 0,
        target_update_freq=args.target_update_freq)
    # collectors
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # warm up the replay buffer before the first update
    train_collector.collect(n_step=args.batch_size)
    # TensorBoard writer
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    def stop_fn(x):
        # ``x`` is the best mean test reward.
        return x >= env.spec.reward_threshold

    def train_fn(x):
        # exploratory epsilon while training
        policy.set_eps(args.eps_train)

    def test_fn(x):
        # near-greedy epsilon while evaluating
        policy.set_eps(args.eps_test)

    # off-policy training loop
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, test_fn=test_fn,
        stop_fn=stop_fn, writer=writer, task=args.task)

    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Exemple #6
0
def test_pg(args=get_args()):
    """Train vanilla policy gradient on ``args.task`` until solved."""
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # vectorized rollout / evaluation environments
    # (tianshou.env.SubprocVectorEnv works here as well)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seeding
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # softmax policy network and optimizer
    policy_net = Net(args.layer_num,
                     args.state_shape,
                     args.action_shape,
                     device=args.device,
                     softmax=True).to(args.device)
    optimizer = torch.optim.Adam(policy_net.parameters(), lr=args.lr)
    policy = PGPolicy(policy_net, optimizer,
                      torch.distributions.Categorical, args.gamma)
    # collectors
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # TensorBoard logging
    writer = SummaryWriter(args.logdir + '/' + 'pg')

    def stop_fn(x):
        # ``x`` is the best mean test reward.
        return x >= env.spec.reward_threshold

    # on-policy training loop
    result = onpolicy_trainer(policy, train_collector, test_collector,
                              args.epoch, args.step_per_epoch,
                              args.collect_per_step, args.repeat_per_collect,
                              args.test_num, args.batch_size,
                              stop_fn=stop_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Exemple #7
0
def test_a2c(args=get_args()):
    """Train an A2C agent on ``args.task`` and assert the task gets solved.

    NOTE(review): ``args=get_args()`` is evaluated once at import time —
    acceptable for an example script.
    """
    torch.set_num_threads(1)  # for poor CPU
    env = gym.make(args.task)
    # Discrete spaces expose ``n`` instead of ``shape``; support both.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: actor and critic heads share the feature extractor ``net``
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    # single optimizer over both actor and critic parameters
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda,
        vf_coef=args.vf_coef, ent_coef=args.ent_coef,
        max_grad_norm=args.max_grad_norm)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'a2c')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # Persist the best policy seen so far.
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # ``x`` is the best mean test reward; stop once the env is solved.
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn,
        writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test(args=get_args()):
    """Load a saved DQN ('dqn.pth') and watch it play MiniGrid-FourRooms."""
    # Let's watch its performance!
    env = LimitWrapper(
        StateBonus(ImgObsWrapper(gym.make('MiniGrid-FourRooms-v0'))))
    args.state_shape = env.observation_space.shape
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # rebuild the network with the same architecture it was trained with
    q_net = DQN(args.state_shape[0], args.state_shape[1], args.action_shape,
                args.device).to(args.device)
    optimizer = torch.optim.Adam(q_net.parameters(), lr=args.lr)
    policy = DQNPolicy(q_net,
                       optimizer,
                       args.gamma,
                       args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # restore the trained weights from disk
    policy.load_state_dict(torch.load('dqn.pth'))
    # roll out a single rendered episode
    watch_collector = Collector(policy, env)
    stats = watch_collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {stats["rew"]}, length: {stats["len"]}')
    watch_collector.close()
Exemple #9
0
def train(hyper: dict):
    """Train an A2C agent on CartPole-v1 with settings from ``hyper``,
    print the training result, and play one rendered evaluation episode.
    """
    env_id = 'CartPole-v1'
    env = gym.make(env_id)
    # CartPole-v1: 4-dim observation, 2 discrete actions.
    hyper['state_dim'] = 4
    hyper['action_dim'] = 2

    train_envs = VectorEnv([lambda: gym.make(env_id) for _ in range(hyper['training_num'])])
    test_envs = SubprocVectorEnv([lambda: gym.make(env_id) for _ in range(hyper['test_num'])])

    if hyper['seed']:
        np.random.seed(hyper['random_seed'])
        torch.manual_seed(hyper['random_seed'])
        train_envs.seed(hyper['random_seed'])
        test_envs.seed(hyper['random_seed'])

    device = Pytorch.device()

    # actor and critic heads share the feature extractor ``net``
    net = Net(hyper['layer_num'], hyper['state_dim'], device=device)
    actor = Actor(net, hyper['action_dim']).to(device)
    critic = Critic(net).to(device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=hyper['learning_rate'])
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, hyper['gamma'], vf_coef=hyper['vf_coef'],
        ent_coef=hyper['ent_coef'], max_grad_norm=hyper['max_grad_norm'])
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(hyper['capacity']))
    test_collector = Collector(policy, test_envs)

    writer = SummaryWriter('./a2c')

    def stop_fn(x):
        # BUG FIX: the original read ``env.env.spec.reward_threshold`` for
        # the truthiness test but ``env.spec.reward_threshold`` for the
        # comparison; use one consistent attribute path.
        threshold = env.spec.reward_threshold
        return x >= threshold if threshold else False

    result = onpolicy_trainer(
        policy, train_collector, test_collector, hyper['epoch'],
        hyper['step_per_epoch'], hyper['collect_per_step'], hyper['repeat_per_collect'],
        hyper['test_num'], hyper['batch_size'], stop_fn=stop_fn, writer=writer,
        task=env_id)
    train_collector.close()
    test_collector.close()
    pprint.pprint(result)
    # evaluation: watch one rendered episode with the trained policy
    env = gym.make(env_id)
    collector = Collector(policy, env)
    result = collector.collect(n_episode=1, render=hyper['render'])
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()
Exemple #10
0
def test_ppo(args=get_args()):
    """Train PPO on the custom ``gym_make()`` environment, checkpointing the
    best policy to ``<logdir>/<task>/ppo/policy.pth``.

    NOTE(review): ``args=get_args()`` is evaluated once at import time —
    acceptable for an example script.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym_make()
    # Discrete spaces expose ``n`` instead of ``shape``; support both.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # BUG FIX: was ``env.action_shape.low[0]`` — the env has no
    # ``action_shape`` attribute (AttributeError at runtime); the action
    # bounds live on ``env.action_space``.
    args.action_range = [env.action_space.low[0], env.action_space.high[0]]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym_make()
    train_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.training_num)])
    # test_envs = gym_make()
    test_envs = VectorEnv([lambda: gym_make() for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    policy = init_policy(args, env)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # Persist the best policy seen so far.
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              save_fn=save_fn,
                              writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Exemple #11
0
    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn,
        writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()


if __name__ == '__main__':
    # Run the policy-gradient example when this file is executed directly.
    test_pg()
Exemple #12
0
def test_td3(args=get_args()):
    """Train TD3 on ``args.task``, checkpoint the best policy, and optionally
    watch a rendered evaluation episode.

    NOTE(review): ``args=get_args()`` is evaluated once at import time —
    acceptable for an example script.
    """
    # initialize environment
    env = gym.make(args.task)
    # Discrete spaces expose ``n`` instead of ``shape``; support both.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: one actor plus twin critics, each with its own optimizer
    actor = Actor(args.layer_num,
                  args.state_shape,
                  args.action_shape,
                  args.max_action,
                  args.device,
                  hidden_layer_size=args.hidden_size).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num,
                     args.state_shape,
                     args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num,
                     args.state_shape,
                     args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    policy = TD3Policy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        args.tau,
        args.gamma,
        GaussianNoise(sigma=args.exploration_noise),
        args.policy_noise,
        args.update_actor_freq,
        args.noise_clip,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=args.rew_norm,
        ignore_done=False)
    # collector
    # NOTE(review): when ``training_num == 0`` this reads
    # ``_max_episode_steps`` directly off ``train_envs`` even though it was
    # built as a VectorEnv above — verify that branch is reachable/correct.
    if args.training_num == 0:
        max_episode_steps = train_envs._max_episode_steps
    else:
        max_episode_steps = train_envs.envs[0]._max_episode_steps
    train_collector = Collector(
        policy, train_envs,
        ReplayBuffer(args.buffer_size, max_ep_len=max_episode_steps))
    test_collector = Collector(policy, test_envs, mode='test')
    # log
    log_path = os.path.join(args.logdir, args.task, 'td3', str(args.seed))
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # Persist the best policy seen so far.
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    # Threshold is forced very high so early stopping never triggers and
    # training runs the full budget.
    # NOTE(review): the ``assert stop_fn(...)`` below then demands
    # best_reward >= 100000 — confirm this is intended.
    env.spec.reward_threshold = 100000

    def stop_fn(x):
        # ``x`` is the best mean test reward.
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_exact_trainer(policy,
                                     train_collector,
                                     test_collector,
                                     args.epoch,
                                     args.step_per_epoch,
                                     args.collect_per_step,
                                     args.test_num,
                                     args.batch_size,
                                     stop_fn=stop_fn,
                                     save_fn=save_fn,
                                     writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Exemple #13
0
def test_ppo(args=get_args()):
    """Train PPO with a categorical actor-critic on ``args.task`` until the
    environment's reward threshold is reached.

    NOTE(review): ``args=get_args()`` is evaluated once at import time —
    acceptable for an example script.
    """
    torch.set_num_threads(1)  # for poor CPU
    env = gym.make(args.task)
    # Discrete spaces expose ``n`` instead of ``shape``; support both.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: actor and critic heads share the feature extractor ``net``
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    # single optimizer over both actor and critic parameters
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(actor,
                       critic,
                       optim,
                       dist,
                       args.gamma,
                       max_grad_norm=args.max_grad_norm,
                       eps_clip=args.eps_clip,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       # no action clamping for a discrete action space
                       action_range=None,
                       gae_lambda=args.gae_lambda,
                       reward_normalization=args.rew_norm,
                       dual_clip=args.dual_clip,
                       value_clip=args.value_clip)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # Persist the best policy seen so far.
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # ``x`` is the best mean test reward; stop once the env is solved.
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              save_fn=save_fn,
                              writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    pprint.pprint(result)
Exemple #14
0
def run_pg(args=get_args()):
    """Train a policy-gradient agent on ``args.task`` and checkpoint the best
    policy to ``<logdir>/<task>/pg/policy.pth``.

    NOTE(review): ``args=get_args()`` is evaluated once at import time —
    acceptable for an example script.
    """
    env = gym.make(args.task)
    # Discrete spaces expose ``n`` instead of ``shape``; support both.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n

    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])

    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    net = Net(args.layer_num,
              args.state_shape,
              args.action_shape,
              device=args.device,
              softmax=True)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PGPolicy(net,
                      optim,
                      dist,
                      args.gamma,
                      reward_normalization=args.rew_norm)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log — ``makedirs`` replaces the original chain of three racy
    # isdir/mkdir checks and creates intermediate directories in one call.
    log_path = os.path.join(args.logdir, args.task, 'pg')
    os.makedirs(log_path, exist_ok=True)

    def save_fn(policy):
        # Persist the best policy seen so far.
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # ``x`` is the best mean test reward.
        return x >= env.spec.reward_threshold

    # trainer (the returned result dict is intentionally unused here)
    onpolicy_trainer(policy,
                     train_collector,
                     test_collector,
                     args.epoch,
                     args.step_per_epoch,
                     args.collect_per_step,
                     args.repeat_per_collect,
                     args.test_num,
                     args.batch_size,
                     stop_fn=stop_fn,
                     save_fn=save_fn)

    train_collector.close()
    test_collector.close()

    if __name__ == '__main__':

        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Exemple #15
0
def test_drqn(args=get_args()):
    """Train a recurrent DQN (DRQN) on ``args.task`` until solved.

    NOTE(review): ``args=get_args()`` is evaluated once at import time —
    acceptable for an example script.
    """
    env = gym.make(args.task)
    # Discrete spaces expose ``n`` instead of ``shape``; support both.
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: recurrent Q-network
    net = Recurrent(args.layer_num, args.state_shape, args.action_shape,
                    args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net,
                       optim,
                       args.gamma,
                       args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(
        policy, train_envs,
        ReplayBuffer(args.buffer_size,
                     stack_num=args.stack_num,
                     ignore_obs_next=True))
    # the stack_num is for RNN training: sample framestack obs
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # pre-fill the replay buffer so the first gradient step has data
    train_collector.collect(n_step=args.batch_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'drqn')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # Persist the best policy seen so far.
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # ``x`` is the best mean test reward; stop once the env is solved.
        return x >= env.spec.reward_threshold

    def train_fn(x):
        # exploratory epsilon during training
        policy.set_eps(args.eps_train)

    def test_fn(x):
        # near-greedy epsilon during evaluation
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)

    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Exemple #16
0
def _test_ppo(args=get_args()):
    """Train PPO with a Gaussian policy on ``args.task`` and assert that the
    best test reward reaches the environment's reward threshold.

    NOTE(review): ``get_args()`` is evaluated once at definition time, so
    every call without an explicit ``args`` shares the same namespace object.
    """
    # just a demo, I have not made it work :(
    env = gym.make(args.task)
    # Pendulum-v0 registers no reward threshold; -250 is used as a proxy.
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    # spaces expose either a shape (Box) or n (Discrete)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: stochastic actor and value critic share a single Adam optimizer
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    critic = Critic(args.layer_num, args.state_shape,
                    device=args.device).to(args.device)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Normal
    policy = PPOPolicy(
        actor,
        critic,
        optim,
        dist,
        args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        action_range=[env.action_space.low[0], env.action_space.high[0]])
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # warm up the buffer before the trainer starts
    train_collector.collect(n_step=args.step_per_epoch)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        # stop as soon as the reward statistic reaches the threshold
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              writer=writer,
                              task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #17
0
class View(object):
    """A SAC agent that observes the environment through a binary state mask.

    ``mask`` controls which observation dimensions this view can see:

    * ``'even'`` -- hide every even-indexed dimension,
    * ``'odd'``  -- hide every odd-indexed dimension,
    * ``int``    -- hide that single dimension,
    * ``None``   -- keep the full observation (default, name ``'full'``).
    """

    def __init__(self, args, mask=None, name='full'):
        env = gym.make(args.task)
        if args.task == 'Pendulum-v0':
            # Pendulum-v0 registers no reward threshold; -250 is a proxy.
            env.spec.reward_threshold = -250
        self.state_shape = env.observation_space.shape or env.observation_space.n
        self.action_shape = env.action_space.shape or env.action_space.n
        self.max_action = env.action_space.high[0]

        self.stop_fn = lambda x: x >= env.spec.reward_threshold

        # env
        self.train_envs = VectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.training_num)])
        self.test_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.test_num)])

        # mask: 1 keeps a dimension, 0 hides it from this view
        state_dim = int(np.prod(self.state_shape))
        self._view_mask = torch.ones(state_dim)
        if mask == 'even':
            self._view_mask[0::2] = 0
        elif mask == 'odd':
            self._view_mask[1::2] = 0
        elif isinstance(mask, int) and not isinstance(mask, bool):
            # isinstance instead of ``type(mask) == int``; bool is excluded
            # because True/False would silently mask dimension 1/0.
            self._view_mask[mask] = 0

        # policy: twin-critic SAC; every network sees the masked state
        self.actor = ActorProbWithView(
            args.layer_num, self.state_shape, self.action_shape,
            self.max_action, self._view_mask, args.device
        ).to(args.device)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=args.actor_lr)
        self.critic1 = CriticWithView(
            args.layer_num, self.state_shape, self._view_mask, self.action_shape, args.device
        ).to(args.device)
        self.critic1_optim = torch.optim.Adam(self.critic1.parameters(), lr=args.critic_lr)
        self.critic2 = CriticWithView(
            args.layer_num, self.state_shape, self._view_mask, self.action_shape, args.device
        ).to(args.device)
        self.critic2_optim = torch.optim.Adam(self.critic2.parameters(), lr=args.critic_lr)
        self.policy = SACPolicy(
            self.actor, self.actor_optim, self.critic1, self.critic1_optim, self.critic2,
            self.critic2_optim, args.tau, args.gamma, args.alpha,
            [env.action_space.low[0], env.action_space.high[0]],
            reward_normalization=True, ignore_done=True)

        # collector
        self.train_collector = Collector(self.policy, self.train_envs,
                                         ReplayBuffer(args.buffer_size))
        self.test_collector = Collector(self.policy, self.test_envs)

        # log
        self.writer = SummaryWriter(f"{args.logdir}/{args.task}/sac/{args.note}/{name}")

    def seed(self, _seed):
        """Seed both vectorized environments."""
        self.train_envs.seed(_seed)
        self.test_envs.seed(_seed)

    def close(self):
        """Release both collectors (and the environments they own)."""
        self.train_collector.close()
        self.test_collector.close()

    def train(self):
        """Put actor and both critics into training mode."""
        self.actor.train()
        self.critic1.train()
        self.critic2.train()

    def learn_from_demos(self, batch, demo, peer=0):
        """One behavior-cloning step towards the actions in ``demo``.

        When ``peer != 0``, the MSE against a randomly permuted demo batch
        is subtracted (a "peer" penalty term), scaled by ``peer``.
        """
        acts = self.policy(batch).act
        demo = demo.act.detach()
        loss = F.mse_loss(acts, demo)
        if peer != 0:
            peer_demo = demo[torch.randperm(len(demo))]
            loss -= peer * F.mse_loss(acts, peer_demo)
        self.policy.actor_optim.zero_grad()
        loss.backward()
        self.policy.actor_optim.step()
def test_sac(args=get_args()):
    """Train a SAC variant with continuous-transition construction on
    ``args.task`` and assert the reward threshold is reached.
    """
    # initialize environment
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: stochastic actor and twin critics, each with its own optimizer
    actor = ActorProb(args.layer_num,
                      args.state_shape,
                      args.action_shape,
                      args.max_action,
                      args.device,
                      hidden_layer_size=args.hidden_size).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num,
                     args.state_shape,
                     args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num,
                     args.state_shape,
                     args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    # energy-based discriminator; takes flattened state+action as input
    disc = Critic(
        args.layer_num,
        np.prod(args.state_shape) + np.prod(args.action_shape),
        0,
        args.device,
        hidden_layer_size=args.hidden_size,
        output_dim=np.prod(args.state_shape) + 1,
    ).to(args.device)
    disc_optim = torch.optim.Adam(disc.parameters(), lr=args.critic_lr)
    # tunable temperature
    beta = torch.ones(1, requires_grad=True, device=args.device)
    beta_optim = torch.optim.Adam([beta], lr=args.critic_lr)

    # optional automatic entropy-temperature tuning: pass (target, log_alpha,
    # optimizer) instead of a fixed scalar alpha
    if args.auto_alpha:
        target_entropy = -np.prod(env.action_space.shape)
        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        alpha = (target_entropy, log_alpha, alpha_optim)
    else:
        alpha = args.alpha

    # dedicated RNG so transition interpolation is reproducible per seed
    rng = np.random.RandomState(seed=args.seed)

    policy = SACMUTRIRB2BPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        args.tau,
        args.gamma,
        alpha,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=False,
        ignore_done=False,
        norm_diff=False,
        use_diff=False,
        process_tri=(lambda x, beta: process_tri(x, rng=rng, beta=beta)
                     ),  # continuous transition construction
        beta=(beta, beta_optim),  # the tunable temperature
        discriminator=(disc, disc_optim),  # the energy-based discriminator
        tor_diff=args.tor_diff  # the tolerance of distance
    )
    # collector
    # NOTE(review): when training_num == 0 this reads _max_episode_steps off
    # the vector env object itself rather than a wrapped env -- confirm the
    # vector env actually exposes that attribute in that configuration.
    if args.training_num == 0:
        max_episode_steps = train_envs._max_episode_steps
    else:
        max_episode_steps = train_envs.envs[0]._max_episode_steps
    train_collector = Collector(
        policy, train_envs,
        ReplayBufferTriple(args.buffer_size, max_ep_len=max_episode_steps))
    test_collector = Collector(policy, test_envs, mode='test')
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac_ct', str(args.seed))
    writer = SummaryWriter(log_path)

    def save_fn(policy, name='policy.pth'):
        # the trainer also saves per-epoch checkpoints via epochs_to_save
        torch.save(policy.state_dict(), os.path.join(log_path, name))

    # NOTE(review): the threshold is raised so high that stop_fn is unlikely
    # to ever trigger (forcing a full training run) -- but then the assert
    # below on best_reward would fail; confirm this is intentional.
    env.spec.reward_threshold = 100000

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_exact_trainer(policy,
                                     train_collector,
                                     test_collector,
                                     args.epoch,
                                     args.step_per_epoch,
                                     args.collect_per_step,
                                     args.test_num,
                                     args.batch_size,
                                     stop_fn=stop_fn,
                                     save_fn=save_fn,
                                     writer=writer,
                                     epochs_to_save=[1, 50, 100, 150, 200])
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #19
0
class View(object):
    """A discrete-action PPO agent that observes the environment through a
    binary state mask.

    ``mask`` controls which observation dimensions this view can see:

    * ``'even'`` -- hide every even-indexed dimension,
    * ``'odd'``  -- hide every odd-indexed dimension,
    * ``int``    -- hide that single dimension,
    * ``None``   -- keep the full observation (default, name ``'full'``).
    """

    def __init__(self, args, mask=None, name='full'):
        env = gym.make(args.task)
        self.stop_fn = lambda x: x >= env.spec.reward_threshold
        self.state_shape = env.observation_space.shape or env.observation_space.n
        self.action_shape = env.action_space.shape or env.action_space.n
        # NOTE(review): this buffer is never used inside the class -- it is
        # presumably filled/read by external code; confirm before removing.
        self.buffer = ReplayBuffer(400000)

        # Env
        # train_envs = gym.make(args.task)
        self.train_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.training_num)])
        # test_envs = gym.make(args.task)
        self.test_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.test_num)])

        # Mask: 1 keeps a dimension, 0 hides it from this view
        state_dim = int(np.prod(self.state_shape))
        self._view_mask = torch.ones(state_dim)
        if mask == 'even':
            self._view_mask[0::2] = 0
        elif mask == 'odd':
            self._view_mask[1::2] = 0
        elif isinstance(mask, int) and not isinstance(mask, bool):
            # isinstance instead of ``type(mask) == int``; bool is excluded
            # because True/False would silently mask dimension 1/0.
            self._view_mask[mask] = 0

        # Model: actor and critic share the same masked feature network
        net = NetWithView(args.layer_num, self.state_shape, device=args.device,
                          mask=self._view_mask)
        self.actor = Actor(net, self.action_shape).to(args.device)
        self.critic = Critic(net).to(args.device)
        optim = torch.optim.Adam(list(
            self.actor.parameters()) + list(self.critic.parameters()), lr=args.lr)
        dist = torch.distributions.Categorical
        self.policy = PPOPolicy(
            self.actor, self.critic, optim, dist, args.gamma,
            max_grad_norm=args.max_grad_norm,
            eps_clip=args.eps_clip,
            vf_coef=args.vf_coef,
            ent_coef=args.ent_coef,
            action_range=None)

        # Collector
        self.train_collector = Collector(
            self.policy, self.train_envs, ReplayBuffer(args.buffer_size))
        self.test_collector = Collector(self.policy, self.test_envs)

        # Log
        self.writer = SummaryWriter(f'{args.logdir}/{args.task}/ppo/{args.note}/{name}')

    def seed(self, _seed):
        """Seed both vectorized environments."""
        self.train_envs.seed(_seed)
        self.test_envs.seed(_seed)

    def close(self):
        """Release both collectors (and the environments they own)."""
        self.train_collector.close()
        self.test_collector.close()

    def train(self):
        """Put actor and critic into training mode."""
        self.actor.train()
        self.critic.train()

    def learn_from_demos(self, batch, demo, peer=0):
        """One behavior-cloning step towards the actions in ``demo``.

        Discrete case: cross-entropy between policy logits and demo actions.
        When ``peer != 0``, the cross-entropy against a randomly permuted
        demo batch is subtracted (a "peer" penalty), scaled by ``peer``.
        """
        logits = self.policy(batch).logits
        demo = demo.act.detach()
        loss = F.cross_entropy(logits, demo)
        if peer != 0:
            peer_demo = demo[torch.randperm(len(demo))]
            loss -= peer * F.cross_entropy(logits, peer_demo)
        self.policy.optim.zero_grad()
        loss.backward()
        self.policy.optim.step()
def test_dqn(args=get_args()):
    """Train DQN on a wrapped grid-world task (image observations, state
    bonus, step limit) for the full ``args.epoch`` epochs.
    """
    env = LimitWrapper(StateBonus(ImgObsWrapper(gym.make(args.task))))
    # print(env.observation_space.spaces['image'])
    # args.state_shape = env.observation_space.spaces['image'].shape
    args.state_shape = env.observation_space.shape

    args.action_shape = env.env.action_space.shape or env.env.action_space.n

    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: LimitWrapper(StateBonus(ImgObsWrapper(gym.make(args.task))))
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv([
        lambda: LimitWrapper(StateBonus(ImgObsWrapper(gym.make(args.task))))
        for _ in range(args.test_num)
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: convolutional Q-network over the image observation
    net = DQN(args.state_shape[0], args.state_shape[1], args.action_shape,
              args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net,
                       optim,
                       args.gamma,
                       args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # warm up the replay buffer before training starts
    train_collector.collect(n_step=args.batch_size * 4)
    print(len(train_collector.buffer))
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    def stop_fn(x):
        # This task exposes no usable reward threshold, so never stop early:
        # train for the full number of epochs.
        return False

    def train_fn(x):
        # exploration schedule: fixed epsilon during training ...
        policy.set_eps(args.eps_train)

    def test_fn(x):
        # ... and a (typically smaller) epsilon during evaluation
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               writer=writer,
                               task=args.task)

    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = LimitWrapper(StateBonus(ImgObsWrapper(gym.make(args.task))))
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()

        torch.save(policy.state_dict(), 'dqn.pth')
Example #21
0
    tb.configure(host="0.0.0.0", logdir=data_path)
    url = tb.launch()
    print(f"Started Tensorboard {url} at {data_path}...")
# NOTE(review): data_path, policy, train_collector, test_collector,
# trainer_config, TARGET_EPISODE_STEPS and env_creator are defined earlier
# in this script, outside this excerpt.
writer = SummaryWriter(data_path)

### Configure export
def save_fn(policy):
    # called by the trainer whenever a new best policy is found
    torch.save(policy.state_dict(), os.path.join(data_path, 'policy.pth'))

### Configure early stopping of training
def stop_fn(x):
    # NOTE(review): compares a reward statistic against a constant named
    # after episode *steps* -- confirm the two are meant to be comparable.
    return x >= TARGET_EPISODE_STEPS

### Run the learning process
result = onpolicy_trainer(
    policy, train_collector, test_collector,
    **trainer_config, stop_fn=stop_fn, save_fn=save_fn,
    writer=writer, verbose=True)
print(f'Finished training! Use {result["duration"]}')

### Stop the data collectors
train_collector.close()
test_collector.close()

# Enjoy a trained agent !
env = env_creator()
collector = Collector(policy, env)
# render using the environment's own timestep (env.dt) for real-time playback
result = collector.collect(n_episode=1, render=env.dt)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
Example #22
0
def train(args=get_args()):
    """Train (or resume training) a DDPG agent, checkpointing to
    ``<logdir>/<task>/ddpg/policy.pth``.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym_make()
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.action_range = [env.action_space.low, env.action_space.high]

    train_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv([lambda: gym_make() for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: deterministic actor plus a single critic, separate optimizers
    actor = Actor(args.layer, args.state_shape, args.action_shape,
                  args.action_range, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(args.layer, args.state_shape, args.action_shape,
                    args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(actor,
                        actor_optim,
                        critic,
                        critic_optim,
                        args.tau,
                        args.gamma,
                        args.exploration_noise,
                        args.action_range,
                        reward_normalization=args.rew_norm,
                        ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)

    # if a model exist, continue to train it
    model_path = os.path.join(log_path, 'policy.pth')
    if os.path.exists(model_path):
        policy.load_state_dict(torch.load(model_path))

    def save_fn(policy):
        # checkpoint the best policy in place; also the resume point above
        torch.save(policy.state_dict(), model_path)

    def stop_fn(x):
        # fixed target reward for this task
        return x >= 100

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               # FIX: stop_fn was defined but never passed,
                               # silently disabling early stopping
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def train_agent(
    args: argparse.Namespace = get_args(),
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[dict, BasePolicy]:
    """Train a DQN learner at tic-tac-toe against an (optional) opponent.

    Returns the trainer's result dict together with the trained learner
    policy extracted from the multi-agent policy manager.
    """
    def make_env():
        return TicTacToeEnv(args.board_size, args.win_size)

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed everything for reproducibility
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    policy, optim = get_agents(
        args, agent_learn=agent_learn,
        agent_opponent=agent_opponent, optim=optim)

    # collector: training uses a replay buffer, evaluation does not
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    train_collector.collect(n_step=args.batch_size)

    # reuse a SummaryWriter stashed on args across repeated calls
    if hasattr(args, 'writer'):
        writer = args.writer
    else:
        log_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn')
        writer = SummaryWriter(log_path)
        args.writer = writer

    def save_best(policy):
        # persist only the learner's parameters, not the whole manager
        if hasattr(args, 'model_save_path'):
            target = args.model_save_path
        else:
            target = os.path.join(args.logdir, 'tic_tac_toe', 'dqn',
                                  'policy.pth')
        torch.save(policy.policies[args.agent_id - 1].state_dict(), target)

    def reached_win_rate(score):
        return score >= args.win_rate

    def set_train_eps(epoch):
        policy.policies[args.agent_id - 1].set_eps(args.eps_train)

    def set_test_eps(epoch):
        policy.policies[args.agent_id - 1].set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size,
        train_fn=set_train_eps, test_fn=set_test_eps,
        stop_fn=reached_win_rate, save_fn=save_best,
        writer=writer, test_in_train=False)

    train_collector.close()
    test_collector.close()

    return result, policy.policies[args.agent_id - 1]
Example #24
0
def test_sac(args=get_args()):
    """Imitation learning: clone a pre-trained SAC expert on ``args.task``
    with an optional "peer" penalty, and assert the reward threshold.
    """
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: twin-critic SAC learner (same architecture as the expert)
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor,
                       actor_optim,
                       critic1,
                       critic1_optim,
                       critic2,
                       critic2_optim,
                       args.tau,
                       args.gamma,
                       args.alpha,
                       [env.action_space.low[0], env.action_space.high[0]],
                       reward_normalization=True,
                       ignore_done=True)

    # Load expert model.
    assert args.load is not None, 'args.load should not be None'
    expert = deepcopy(policy)
    expert.load_state_dict(
        torch.load(f'{args.logdir}/{args.task}/sac/{args.load}/policy.pth'))
    expert.eval()

    # collector: the EXPERT generates the demonstration data, the learner
    # is only evaluated on the test environments
    expert_collector = Collector(expert, train_envs,
                                 ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(f'{args.logdir}/{args.task}/imitation/{args.note}')

    def stop_fn(x):
        return x >= (args.reward_threshold or env.spec.reward_threshold)

    def learner(pol, batch, batch_size, repeat, peer=0.):
        """Behavior-cloning update: MSE towards expert actions, minus an
        optional peer term (MSE against a shuffled demo batch).
        """
        # ent_losses is kept for interface parity with other learners but is
        # never filled here
        losses, peer_terms, ent_losses = [], [], []
        for _ in range(repeat):
            for b in batch.split(batch_size):
                acts = pol(b).act
                demo = torch.tensor(b.act, dtype=torch.float)
                loss = F.mse_loss(acts, demo)
                if peer != 0:
                    peer_demo = demo[torch.randperm(len(demo))]
                    peer_term = peer * F.mse_loss(acts, peer_demo)
                    loss -= peer_term
                    # FIX: was ``.cpu.numpy()`` -- missing call parentheses
                    # on Tensor.cpu, which raised AttributeError at runtime
                    peer_terms.append(peer_term.detach().cpu().numpy())
                pol.actor_optim.zero_grad()
                loss.backward()
                pol.actor_optim.step()
                losses.append(loss.detach().cpu().numpy())
        return {
            'loss': losses,
            'loss/ent': ent_losses,
            'loss/peer': peer_terms if peer else None,
            'peer': peer,
        }

    # trainer
    result = imitation_trainer(policy,
                               learner,
                               expert_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               1,
                               args.test_num,
                               args.batch_size,
                               stop_fn=stop_fn,
                               writer=writer,
                               task=args.task,
                               peer=args.peer,
                               peer_decay_steps=args.peer_decay_steps)
    assert stop_fn(result['best_reward'])
    expert_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #25
0
def test_ppo(args=get_args()):
    """Train PPO with a Gaussian policy on ``args.task``, checkpoint the best
    policy, and assert the reward threshold is reached.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    # Pendulum-v0 registers no reward threshold; -250 is used as a proxy.
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    # spaces expose either a shape (Box) or n (Discrete)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: stochastic actor and value critic share a single Adam optimizer
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    critic = Critic(args.layer_num, args.state_shape,
                    device=args.device).to(args.device)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()),
                             lr=args.lr)
    dist = torch.distributions.Normal
    policy = PPOPolicy(
        actor,
        critic,
        optim,
        dist,
        args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        dual_clip=args.dual_clip,
        value_clip=args.value_clip,
        # action_range=[env.action_space.low[0], env.action_space.high[0]],)
        # if clip the action, ppo would not converge :)
        gae_lambda=args.gae_lambda)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # called by the trainer whenever a new best policy is found
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # stop as soon as the reward statistic reaches the threshold
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.collect_per_step,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              stop_fn=stop_fn,
                              save_fn=save_fn,
                              writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #26
0
def test_dqn(args=get_args()):
    """Train DQN on an Atari task; when run as a script, render one episode.

    Observations are Atari-preprocessed (grayscale, resized, pixel values
    scaled to [0, 1]) and stacked 4 frames deep; the same wrapping is used
    for the reference env, the vectorized train/test envs, and the demo env.
    """
    # Reference env, used only to read the observation/action spaces and
    # the reward threshold for early stopping.
    env = FrameStack(AtariPreprocessing(gym.make(args.task), scale_obs=True),
                     4)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    # Read the action space through the outer wrapper, same as the
    # observation space above (gym.Wrapper delegates attribute access);
    # the original mixed `env.env.action_space` with `env.observation_space`.
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: FrameStack(
            AtariPreprocessing(gym.make(args.task), scale_obs=True), 4)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = ShmVecEnv([
        lambda: FrameStack(
            AtariPreprocessing(gym.make(args.task), scale_obs=True), 4)
        for _ in range(args.test_num)
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: convolutional DQN over the (height, width) of the stacked frames
    net = DQN(args.state_shape[1], args.state_shape[2], args.action_shape,
              args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net,
                       optim,
                       args.gamma,
                       args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(policy,
                                train_envs,
                                ReplayBuffer(args.buffer_size),
                                episodic=True)
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # pre-fill the replay buffer with 10k sampled transitions before training
    train_collector.collect(n_step=10000, sampling=True)
    print(len(train_collector.buffer))
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    def stop_fn(x):
        # Some Atari tasks register no reward threshold; never stop early
        # then.  Use `env.spec` consistently (the original tested
        # `env.env.spec` but compared against `env.spec`).
        if env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        return False

    def train_fn(x):
        policy.set_eps(args.eps_train)

    def test_fn(x):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               writer=writer,
                               task=args.task)

    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = FrameStack(
            AtariPreprocessing(gym.make(args.task), scale_obs=True), 4)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #27
0
def test_sac():
    """Train SAC on a continuous-control task; in ``--mode test`` load a
    saved policy, render 10 episodes, and exit instead of training.

    Training envs have their rewards scaled by 5x via TransformReward;
    the test envs are left unscaled so evaluation uses the native reward.
    """
    args, log_path, writer = get_args()
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # Pendulum registers no threshold; patch one in so stop_fn works.
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = ShmPipeVecEnv([
        lambda: TransformReward(BipedalWrapper(gym.make(args.task)), lambda
                                reward: 5 * reward)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = ShmPipeVecEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed; test envs get a different seed so evaluation does not replay
    # the training trajectories
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed + 1)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = DQCritic(args.layer_num, args.state_shape, args.action_shape,
                      args.device).to(args.device)
    critic_target = DQCritic(args.layer_num, args.state_shape,
                             args.action_shape, args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor,
                       actor_optim,
                       critic,
                       critic_optim,
                       critic_target,
                       env.action_space,
                       args.device,
                       args.tau,
                       args.gamma,
                       args.alpha,
                       reward_normalization=args.rew_norm,
                       ignore_done=False)

    if args.mode == 'test':
        # Load a checkpoint and watch it instead of training.  Build the
        # path with os.path.join for consistency with save_fn below (the
        # original hand-formatted "{}/{}/{}/policy.pth").
        policy.load_state_dict(
            torch.load(os.path.join(args.logdir, args.task, args.comment,
                                    'policy.pth'),
                       map_location=args.device))
        env = gym.make(args.task)
        collector = Collector(policy, env
                              # Monitor(env, 'video', force=True)
                              )
        result = collector.collect(n_episode=10, render=args.render)
        print(
            f'Final reward: {result["ep/reward"]}, length: {result["ep/len"]}')
        collector.close()
        # exit() is provided by the site module and may be unavailable
        # under `python -S`; raise SystemExit directly instead.
        raise SystemExit
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    # warm up the replay buffer with 10k randomly sampled steps
    train_collector.collect(10000, sampling=True)
    test_collector = Collector(policy, test_envs)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # NOTE(review): the +5 margin presumably compensates for the
        # reward scaling on the training envs -- confirm with the author.
        return x >= env.spec.reward_threshold + 5

    # trainer
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_episode,
                               args.batch_size,
                               stop_fn=stop_fn,
                               save_fn=save_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])

    pprint.pprint(result)
Example #28
0
def test_sac(args=get_args()):
    """End-to-end SAC smoke test: build vectorized envs, train until the
    task's reward threshold is met, then (as a script) render one episode.
    """
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]

    def make_env():
        return gym.make(args.task)

    # vectorized training / evaluation environments
    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])
    # seeding
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # networks: one stochastic actor plus twin critics
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critics, critic_optims = [], []
    for _ in range(2):
        c = Critic(args.layer_num, args.state_shape, args.action_shape,
                   args.device).to(args.device)
        critics.append(c)
        critic_optims.append(
            torch.optim.Adam(c.parameters(), lr=args.critic_lr))
    critic1, critic2 = critics
    critic1_optim, critic2_optim = critic_optims

    # entropy temperature: either a fixed scalar or auto-tuned
    if args.auto_alpha:
        target_entropy = -np.prod(env.action_space.shape)
        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        alpha = (target_entropy, log_alpha, alpha_optim)
    else:
        alpha = args.alpha

    action_range = [env.action_space.low[0], env.action_space.high[0]]
    policy = SACPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, alpha, action_range,
        reward_normalization=args.rew_norm, ignore_done=True,
        exploration_noise=OUNoise(0.0, args.noise_std))
    # collectors
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # logging
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(pol):
        torch.save(pol.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_reward):
        return mean_reward >= env.spec.reward_threshold

    # training loop
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
Example #29
0
def test_sac(args=get_args()):
    """SAC training smoke test on a single torch thread; trains until the
    reward threshold is reached, then (as a script) renders one episode.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    # Pendulum-v0 registers no reward threshold; patch one in for stop_fn.
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed everything
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model: a stochastic actor plus twin critics, each with its own Adam
    def build_critic():
        module = Critic(args.layer_num, args.state_shape, args.action_shape,
                        args.device).to(args.device)
        return module, torch.optim.Adam(module.parameters(),
                                        lr=args.critic_lr)

    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1, critic1_optim = build_critic()
    critic2, critic2_optim = build_critic()
    policy = SACPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.alpha,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collectors
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(pol):
        torch.save(pol.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_reward):
        return mean_reward >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_dqn(args=get_args()):
    """Train DQN for ROS-based path planning, save the network, and run a
    final rendered evaluation episode with the reloaded weights.
    """
    # env
    task_env = EnvRegister(args.task)
    env = gym.make(task_env)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    rospy.loginfo(args.state_shape)
    rospy.loginfo(args.action_shape)
    train_envs = VectorEnv(
        [lambda: gym.make(task_env) for _ in range(args.training_num)])
    test_envs = VectorEnv(
        [lambda: gym.make(task_env) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, args.action_shape, args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net,
                       optim,
                       args.gamma,
                       args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # collector
    rospy.loginfo("init collector")
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    train_collector.collect(n_step=args.batch_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    # episodes that reached the goal, accumulated across stop_fn calls
    rew_record = []

    def stop_fn(x):
        # NOTE(review): `x` appears to be an iterable of per-episode info
        # dicts here (not the usual scalar mean reward) -- confirm against
        # this fork's trainer.  Stop once more than 1000 goal-reaching
        # episodes have been recorded.
        for s in x:
            if s.get('reach_goal'):
                # append() the record itself; the original used extend(),
                # which would add the dict's *keys* one by one.
                rew_record.append(s)
                rospy.loginfo("reach goal times = " + str(len(rew_record)))
                return len(rew_record) > 1000
        # explicit False instead of the original's implicit None
        return False

    def train_fn(x):
        policy.set_eps(args.eps_train, args.eps_decay, args.eps_min)

    def test_fn(x):
        policy.set_eps(args.eps_test, args.eps_decay, args.eps_min)

    # trainer
    rospy.loginfo("start training")
    result = offpolicy_trainer(policy,
                               train_collector,
                               test_collector,
                               args.epoch,
                               args.step_per_epoch,
                               args.collect_per_step,
                               args.test_num,
                               args.batch_size,
                               train_fn=train_fn,
                               test_fn=test_fn,
                               stop_fn=stop_fn,
                               writer=writer)

    # The original asserted `stop_fn(result['best_reward'])`, but stop_fn
    # iterates its argument and best_reward is a scalar, which would raise
    # TypeError; assert on the accumulated goal count instead.
    assert len(rew_record) > 1000
    pprint.pprint(result)
    train_collector.close()
    test_collector.close()
    # save network
    torch.save(net, 'ginger_dqn_pathplanning.pkl')

    rospy.loginfo("training finish, testing...")
    # Let's watch its performance!
    env_test = gym.make(task_env)
    net_test = torch.load('ginger_dqn_pathplanning.pkl')
    # Give the reloaded network its own optimizer; the original reused
    # `optim`, which still references the old net's parameters.
    optim_test = torch.optim.Adam(net_test.parameters(), lr=args.lr)
    policy_test = DQNPolicy(net_test,
                            optim_test,
                            args.gamma,
                            args.n_step,
                            use_target_network=args.target_update_freq > 0,
                            target_update_freq=args.target_update_freq)
    collector = Collector(policy_test, env_test)
    result = collector.collect(n_episode=1, render=args.render)
    rospy.loginfo(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()