from math import log2

import stable_baselines3 as SBL3


def agent_PPO(prompt, num_legal_actions, num_possible_obs):
    dummy_env.set_meta(num_legal_actions, num_possible_obs)
    meta = (num_legal_actions, num_possible_obs)

    # The prompt is a flat [reward, obs, action, ..., reward, obs] history.
    # Train only on the prefix holding the first 2**floor(log2(n)) reward/obs
    # pairs, so the same cached model is reused while the prompt grows.
    num_observs = (len(prompt) + 1) // 3
    train_on_len = 3 * pow(2, int(log2(num_observs))) - 1
    train_on = prompt[:train_on_len]

    if (train_on, meta) not in cache_PPO:
        rewards = [train_on[i] for i in range(0, train_on_len, 3)]
        observs = [train_on[i + 1] for i in range(0, train_on_len, 3)]
        dummy_env.set_rewards_and_observs(rewards, observs)
        n_steps = len(rewards) - 1

        # SB3's PPO requires batch_size > 1, so fall back to action 0 until
        # the recorded history is long enough to train on.
        if n_steps < 2:
            return 0

        A = SBL3.PPO('MlpPolicy',
                     dummy_env,
                     n_steps=n_steps,
                     batch_size=n_steps,
                     seed=0)

        A.learn(n_steps)
        cache_PPO[(train_on, meta)] = A
    else:
        A = cache_PPO[(train_on, meta)]

    # Act on the most recent observation in the prompt.
    action, _ = A.predict(prompt[-1])
    return action
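
# A minimal sketch of the module-level scaffolding agent_PPO assumes; in the
# original this would sit above the function. The replay-style environment
# below is an illustrative guess (old Gym API, discrete spaces), not the
# original dummy_env implementation.
import gym

cache_PPO = {}  # maps (training prefix, meta) -> trained PPO model


class DummyReplayEnv(gym.Env):
    """Feeds a recorded (reward, observation) history back to PPO."""

    def set_meta(self, num_legal_actions, num_possible_obs):
        self.action_space = gym.spaces.Discrete(num_legal_actions)
        self.observation_space = gym.spaces.Discrete(num_possible_obs)

    def set_rewards_and_observs(self, rewards, observs):
        self.rewards, self.observs, self.i = rewards, observs, 0

    def reset(self):
        self.i = 0
        return self.observs[0]

    def step(self, action):
        # Replay the next recorded observation and reward regardless of action.
        self.i = (self.i + 1) % len(self.observs)
        return self.observs[self.i], self.rewards[self.i], False, {}


dummy_env = DummyReplayEnv()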

def test_stable_PPO(env_name, request):
    env = request.getfixturevalue(env_name)
    model = stable_baselines3.PPO('MlpPolicy',
                                  env,
                                  verbose=1,
                                  learning_rate=.0003,
                                  n_steps=2048,
                                  batch_size=64,
                                  n_epochs=10,
                                  gamma=.99,
                                  gae_lambda=.95,
                                  clip_range=.2,
                                  ent_coef=0,
                                  vf_coef=.5,
                                  max_grad_norm=.5)

    model.learn(total_timesteps=TIMESTEPS)

    # Check model state
    assert model.action_space == env.action_space
    assert model.env.action_space == env.action_space

    assert isinstance(model.policy,
                      stable_baselines3.common.policies.ActorCriticPolicy)

    # Check model works

    obs = env.reset()
    a, _ = model.predict(obs)
    obs, reward, done, info = env.step(a)

    assert reward is not None and reward < 0
    assert a is not None
    assert isinstance(done, bool)
    assert info['timestep'] == 1

    env.close()
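
# For context: a self-contained sketch of the fixture-by-name pattern the test
# above relies on (pytest's request.getfixturevalue). The fixture, environment
# and TIMESTEPS value here are stand-ins, not the original suite's.
import gym
import pytest

TIMESTEPS = 4096  # assumed training budget for model.learn above


@pytest.fixture
def cartpole_env():
    return gym.make('CartPole-v1')


@pytest.mark.parametrize('env_name', ['cartpole_env'])
def test_env_fixture_lookup(env_name, request):
    # Resolve the fixture whose name was passed as a parameter.
    env = request.getfixturevalue(env_name)
    assert env.reset() is not None
    env.close()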
Example #3
# Train BC on expert data.
# BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates over
# dictionaries containing observations and actions.
logger.configure(tempdir_path / "BC/")
bc_trainer = bc.BC(venv.observation_space,
                   venv.action_space,
                   expert_data=transitions)
bc_trainer.train(n_epochs=1)

# Train GAIL on expert data.
# GAIL and AIRL also accept as `expert_data` any PyTorch-style DataLoader that
# iterates over dictionaries containing observations, actions, and next_observations.
logger.configure(tempdir_path / "GAIL/")
gail_trainer = adversarial.GAIL(
    venv,
    expert_data=transitions,
    expert_batch_size=32,
    gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
)
gail_trainer.train(total_timesteps=2048)

# Train AIRL on expert data.
logger.configure(tempdir_path / "AIRL/")
airl_trainer = adversarial.AIRL(
    venv,
    expert_data=transitions,
    expert_batch_size=32,
    gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
)
airl_trainer.train(total_timesteps=2048)
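
# For context: a hedged sketch of how the `venv`, `transitions` and
# `tempdir_path` names used above are typically prepared; it mirrors the helper
# function below. The pickle path and environment name are placeholders, and
# the import paths assume the older imitation release that still accepts
# `expert_data=`.
import pathlib
import pickle
import tempfile

import stable_baselines3 as sb3
from imitation.algorithms import adversarial, bc
from imitation.data import rollout
from imitation.util import logger, util

with open("expert_trajectories.pkl", "rb") as f:   # placeholder path
    trajectories = pickle.load(f)                  # List[types.Trajectory]
transitions = rollout.flatten_trajectories(trajectories)

venv = util.make_vec_env("CartPole-v1", n_envs=2)

tempdir = tempfile.TemporaryDirectory(prefix="quickstart")
tempdir_path = pathlib.Path(tempdir.name)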
def imitation_learning(expert_traj_path, imitation_algo_name, rl_algo_name,
                       env_name):
    # Load pickled expert demonstrations.
    with open(expert_traj_path, "rb") as f:
        # This is a list of `imitation.data.types.Trajectory`, where
        # every instance contains observations and actions for a single expert
        # demonstration.
        trajectories = pickle.load(f)
    # Convert List[types.Trajectory] to an instance of `imitation.data.types.Transitions`.
    # This is a more general dataclass containing unordered
    # (observation, action, next_observation) transitions.
    transitions = rollout.flatten_trajectories(trajectories)

    venv = util.make_vec_env(env_name, n_envs=2)

    # tempdir = tempfile.TemporaryDirectory(prefix="il_results/{}_{}".format(rl_algo_name, env_name))
    # tempdir_path = pathlib.Path(tempdir.name)
    # print(f"All Tensorboards and logging are being written inside {tempdir_path}/.")
    log_path = "il_results/{}_{}/{}/".format(rl_algo_name, env_name,
                                             imitation_algo_name)

    if imitation_algo_name == 'BC':
        # Train BC on expert data.
        # BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates over
        # dictionaries containing observations and actions.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        trainer = bc.BC(venv.observation_space,
                        venv.action_space,
                        expert_data=transitions)
        trainer.train(n_epochs=100, log_interval=1)

    elif imitation_algo_name == 'GAIL':
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        gail_trainer = adversarial.GAIL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
            discrim_kwargs={
                'discrim_net':
                ActObsMLP(
                    action_space=venv.action_space,
                    observation_space=venv.observation_space,
                    hid_sizes=(32, 32),
                )
            })
        gail_trainer.train(total_timesteps=2048)
        trainer = gail_trainer.gen_algo
    elif imitation_algo_name == 'AIRL':
        # Train AIRL on expert data.
        logger.configure(log_path)
        airl_trainer = adversarial.AIRL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
        )
        airl_trainer.train(total_timesteps=2048)
        trainer = airl_trainer.gen_algo

    sample_until = rollout.min_episodes(15)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # trainer.save_policy("{}/bc_policy.pth.tar".format(log_path))
    th.save(trainer.policy,
            "{}/{}_policy.pth.tar".format(log_path, imitation_algo_name))

    return trained_ret_mean
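
# A hedged usage sketch for the helper above; the trajectory pickle, algorithm
# choice and environment name are placeholders.
mean_return = imitation_learning(
    expert_traj_path="expert_trajectories.pkl",  # placeholder path
    imitation_algo_name="GAIL",                  # one of 'BC', 'GAIL', 'AIRL'
    rl_algo_name="PPO",
    env_name="CartPole-v1",
)
print(f"Mean return over 15 evaluation episodes: {mean_return:.2f}")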