Example 1
def trainer(request, session, venv):
    # Parametrized pytest fixture: optionally wrap the flattened expert
    # transitions in a dict-style dataset before constructing the BC trainer.
    convert_dataset = request.param
    rollouts = types.load(ROLLOUT_PATH)
    data = rollout.flatten_trajectories(rollouts)
    if convert_dataset:
        data = datasets.TransitionsDictDatasetAdaptor(
            data, datasets.EpochOrderDictDataset)
    return bc.BC(venv.observation_space, venv.action_space, expert_data=data)
Example 2
def test_weight_decay_init_error(venv):
    with pytest.raises(ValueError, match=".*weight_decay.*"):
        bc.BC(
            venv.observation_space,
            venv.action_space,
            expert_data=None,
            optimizer_kwargs=dict(weight_decay=1e-4),
        )
Example 3
def test_train_from_random_dict_dataset(venv):
    # Check that we can construct a BC instance and train it from a RandomDictDataset.
    rollouts = types.load(ROLLOUT_PATH)
    data = rollout.flatten_trajectories(rollouts)
    data = datasets.TransitionsDictDatasetAdaptor(data,
                                                  datasets.RandomDictDataset)
    trainer = bc.BC(venv.observation_space,
                    venv.action_space,
                    expert_data=data)
    trainer.train(n_epochs=1)
Example 4
def train_bc(env, n=0):
    venv = util.make_vec_env(env, n_envs=8)
    # Use a wider policy network for continuous action spaces.
    if isinstance(venv.action_space, Discrete):
        w = 64
    else:
        w = 256
    for i in range(n):
        mean_rewards = []
        std_rewards = []
        # Sweep over the number of expert trajectories (0 = untrained baseline).
        for num_trajs in range(0, 26, 5):
            if num_trajs == 0:
                expert_data = make_sa_dataloader(env, normalize=False)
            else:
                expert_data = make_sa_dataloader(env,
                                                 max_trajs=num_trajs,
                                                 normalize=False)
            bc_trainer = bc.BC(venv.observation_space,
                               venv.action_space,
                               expert_data=expert_data,
                               policy_class=policies.ActorCriticPolicy,
                               ent_weight=0.,
                               l2_weight=0.,
                               policy_kwargs=dict(net_arch=[w, w]))
            if num_trajs > 0:
                bc_trainer.train(n_batches=int(5e5))

            # Wrap the BC-trained policy in a PPO model so it can be saved and
            # evaluated with the stable-baselines3 utilities.
            def get_policy(*args, **kwargs):
                return bc_trainer.policy

            model = PPO(get_policy, env, verbose=1)
            model.save(
                os.path.join("learners", env,
                             "bc_{0}_{1}".format(i, num_trajs)))
            mean_reward, std_reward = evaluate_policy(model,
                                                      model.get_env(),
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Trajs: {1}".format(num_trajs, mean_reward))
            np.savez(os.path.join("learners", env, "bc_rewards_{0}".format(i)),
                     means=mean_rewards,
                     stds=std_rewards)
Example 5
    def __init__(
        self,
        env: gym.Env,
        scratch_dir: str,
        beta_schedule: Callable[[int], float] = None,
        batch_size: int = 32,
        **bc_kwargs,
    ):
        """Trainer constructor.

        Args:
            env: environment to train in.
            scratch_dir: directory to use to store intermediate training
                information (e.g. for resuming training).
            beta_schedule: provides a value of `beta` (the probability of taking
                expert action in any given state) at each round of training. If
                `None`, then a default `LinearBetaSchedule` is used instead.
            batch_size: Number of samples in each batch during BC training.
            **bc_kwargs: additional arguments for constructing the `BC` that
                will be used to train the underlying policy.
        """
        # for pickling
        self._init_args = locals()
        self._init_args.update(bc_kwargs)
        del self._init_args["self"]
        del self._init_args["bc_kwargs"]

        if beta_schedule is None:
            beta_schedule = LinearBetaSchedule(15)
        self.batch_size = batch_size
        self.beta_schedule = beta_schedule
        self.scratch_dir = scratch_dir
        self.env = env
        self.round_num = 0
        self.bc_kwargs = bc_kwargs
        self._last_loaded_round = -1
        self._all_demos = []

        self.bc_trainer = bc.BC(self.env.observation_space,
                                self.env.action_space, **self.bc_kwargs)
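
For reference, the `beta_schedule` documented above is just a callable mapping the training round number to the probability of querying the expert. Below is a minimal sketch of such a linearly decaying schedule, assuming the same calling convention as the `LinearBetaSchedule(15)` default used in the constructor; the library's own decay rule may differ in detail.

class LinearBetaScheduleSketch:
    """Minimal sketch: beta decays linearly from 1 to 0 over `rampdown_rounds`."""

    def __init__(self, rampdown_rounds: int):
        self.rampdown_rounds = rampdown_rounds

    def __call__(self, round_num: int) -> float:
        # Probability of taking the expert action at this round of training.
        remaining = self.rampdown_rounds - round_num
        return min(1.0, max(0.0, remaining / self.rampdown_rounds))
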
Example 6
def trainer(batch_size, venv, expert_data_type):
    # Build a BC trainer from one of several accepted `expert_data` formats:
    # a PyTorch DataLoader, a duck-typed data loader, or raw transitions.
    rollouts = types.load(ROLLOUT_PATH)
    trans = rollout.flatten_trajectories(rollouts)
    if expert_data_type == "data_loader":
        expert_data = th_data.DataLoader(
            trans,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=types.transitions_collate_fn,
        )
    elif expert_data_type == "ducktyped_data_loader":
        expert_data = DucktypedDataset(trans, batch_size)
    elif expert_data_type == "transitions":
        expert_data = trans
    else:  # pragma: no cover
        raise ValueError(expert_data_type)

    return bc.BC(
        venv.observation_space,
        venv.action_space,
        expert_data=expert_data,
    )
Example 7
# Convert the trajectories into a `Transitions` dataclass of unordered
# (observation, actions, next_observation) transitions.
transitions = rollout.flatten_trajectories(trajectories)

venv = util.make_vec_env("CartPole-v1", n_envs=2)

tempdir = tempfile.TemporaryDirectory(prefix="quickstart")
tempdir_path = pathlib.Path(tempdir.name)
print(
    f"All Tensorboards and logging are being written inside {tempdir_path}/.")

# Train BC on expert data.
# BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates over
# dictionaries containing observations and actions.
logger.configure(tempdir_path / "BC/")
bc_trainer = bc.BC(venv.observation_space,
                   venv.action_space,
                   expert_data=transitions)
bc_trainer.train(n_epochs=1)

# Train GAIL on expert data.
# GAIL and AIRL also accept as `expert_data` any PyTorch-style DataLoader that
# iterates over dictionaries containing observations, actions, and next_observations.
logger.configure(tempdir_path / "GAIL/")
gail_trainer = adversarial.GAIL(
    venv,
    expert_data=transitions,
    expert_batch_size=32,
    gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
)
gail_trainer.train(total_timesteps=2048)
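
The comment above notes that BC accepts any PyTorch-style DataLoader that yields dictionaries of observations and actions. Below is a rough sketch of such a duck-typed loader; the "obs"/"acts" key names and the shuffling strategy are assumptions here, and the `DucktypedDataset` in Example 6 plays the same role in the test suite.

import numpy as np

class DictBatchLoader:
    """Hypothetical loader that yields dicts of observation/action batches."""

    def __init__(self, observations, actions, batch_size=32):
        assert len(observations) == len(actions)
        self.observations = np.asarray(observations)
        self.actions = np.asarray(actions)
        self.batch_size = batch_size

    def __iter__(self):
        # Shuffle once per pass, then yield fixed-size mini-batches.
        indices = np.random.permutation(len(self.observations))
        for start in range(0, len(indices), self.batch_size):
            batch = indices[start:start + self.batch_size]
            yield {"obs": self.observations[batch], "acts": self.actions[batch]}
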
Example 8
def imitation_learning(expert_traj_path, imitation_algo_name, rl_algo_name,
                       env_name):
    # Load pickled expert demonstrations.
    with open(expert_traj_path, "rb") as f:
        # This is a list of `imitation.data.types.Trajectory`, where
        # every instance contains observations and actions for a single expert
        # demonstration.
        trajectories = pickle.load(f)
    # Convert List[types.Trajectory] to an instance of `imitation.data.types.Transitions`.
    # This is a more general dataclass containing unordered
    # (observation, actions, next_observation) transitions.
    transitions = rollout.flatten_trajectories(trajectories)

    venv = util.make_vec_env(env_name, n_envs=2)

    # tempdir = tempfile.TemporaryDirectory(prefix="il_results/{}_{}".format(rl_algo_name, env_name))
    # tempdir_path = pathlib.Path(tempdir.name)
    # print(f"All Tensorboards and logging are being written inside {tempdir_path}/.")
    log_path = "il_results/{}_{}/{}/".format(rl_algo_name, env_name,
                                             imitation_algo_name)

    if imitation_algo_name == 'BC':
        # Train BC on expert data.
        # BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates over
        # dictionaries containing observations and actions.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        trainer = bc.BC(venv.observation_space,
                        venv.action_space,
                        expert_data=transitions)
        trainer.train(n_epochs=100, log_interval=1)

    elif imitation_algo_name == 'GAIL':
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        gail_trainer = adversarial.GAIL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
            discrim_kwargs={
                'discrim_net':
                ActObsMLP(
                    action_space=venv.action_space,
                    observation_space=venv.observation_space,
                    hid_sizes=(32, 32),
                )
            })
        gail_trainer.train(total_timesteps=2048)
        trainer = gail_trainer.gen_algo
    elif imitation_algo_name == 'AIRL':
        # Train AIRL on expert data.
        logger.configure(log_path)
        airl_trainer = adversarial.AIRL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
        )
        airl_trainer.train(total_timesteps=2048)
        trainer = airl_trainer.gen_algo

    sample_until = rollout.min_episodes(15)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    # trainer.save_policy("{}/bc_policy.pth.tar".format(log_path))
    th.save(trainer.policy,
            "{}/{}_policy.pth.tar".format(log_path, imitation_algo_name))

    return trained_ret_mean
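
To make the comments about the expert-data containers concrete: `rollout.flatten_trajectories` turns the list of `Trajectory` objects into a single `Transitions` dataclass whose fields are flat arrays indexed by timestep. A small sketch of inspecting it, continuing from the variables in the example above (the exact attribute names are assumptions based on `imitation.data.types.Transitions`):

# Hypothetical inspection of the converted expert data.
print(type(transitions))           # imitation.data.types.Transitions
print(transitions.obs.shape)       # (N, *observation_shape)
print(transitions.acts.shape)      # (N, *action_shape)
print(transitions.next_obs.shape)  # (N, *observation_shape)
print(transitions.dones.shape)     # (N,)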