def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/reacher/"
        env = make_mujoco_env(env_id, workerseed)
        model = TRPO(MlpPolicy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                     entcoeff=0.0, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3, tensorboard_log=tblog)
        model.learn(total_timesteps=num_timesteps)
        env.close()
def load(self, path, env):
    if self.trpo():
        return TRPO.load(path, env=env)
    elif self.ppo():
        return PPO2.load(path, env=env)
    else:
        return SAC.load(path, env=env)
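# Illustrative sketch of the save/load round-trip the dispatcher above relies on.
# The environment id and file path are placeholders; on older stable-baselines
# versions the saved archive is a .pkl instead of a .zip.
import gym
from stable_baselines import TRPO

env = gym.make("Pendulum-v0")
model = TRPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=1000)
model.save("./pendulum_trpo")  # writes ./pendulum_trpo.zip

loaded = TRPO.load("./pendulum_trpo", env=env)
obs = env.reset()
action, _states = loaded.predict(obs, deterministic=True)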
def __init__(self, policy, env, expert_dataset=None, hidden_size_adversary=100, adversary_entcoeff=1e-3,
             g_step=3, d_step=1, d_stepsize=3e-4, verbose=0, _init_setup_model=True, **kwargs):
    super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False,
                     _init_setup_model=_init_setup_model)

    self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)
    self.trpo.using_gail = True
    self.trpo.expert_dataset = expert_dataset
    self.trpo.g_step = g_step
    self.trpo.d_step = d_step
    self.trpo.d_stepsize = d_stepsize
    self.trpo.hidden_size_adversary = hidden_size_adversary
    self.trpo.adversary_entcoeff = adversary_entcoeff
    self.env = self.trpo.env

    if _init_setup_model:
        self.setup_model()
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):  # pylint: disable=W0613
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders)

    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy, env, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
                 entcoeff=0.0, gamma=0.98, lam=1, vf_iters=3, vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
def __init__(self, policy, env, pretrained_weight=False, hidden_size_adversary=100, adversary_entcoeff=1e-3,
             expert_dataset=None, save_per_iter=1, checkpoint_dir="/tmp/gail/ckpt/", g_step=1, d_step=1,
             task_name="task_name", d_stepsize=3e-4, verbose=0, _init_setup_model=True, **kwargs):
    super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False,
                     _init_setup_model=_init_setup_model)

    self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)
    self.trpo.using_gail = True
    self.trpo.pretrained_weight = pretrained_weight
    self.trpo.expert_dataset = expert_dataset
    self.trpo.save_per_iter = save_per_iter
    self.trpo.checkpoint_dir = checkpoint_dir
    self.trpo.g_step = g_step
    self.trpo.d_step = d_step
    self.trpo.task_name = task_name
    self.trpo.d_stepsize = d_stepsize
    self.trpo.hidden_size_adversary = hidden_size_adversary
    self.trpo.adversary_entcoeff = adversary_entcoeff

    if _init_setup_model:
        self.setup_model()
def create_learner(self, env, parameters):
    if (self.trpo() or self.ppo()) and not issubclass(type(env), VecEnv):
        env = DummyVecEnv([lambda: env])

    if self.trpo():
        model = TRPO(MlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = TRPOInterface(model, env.observation_space.shape[0])
    elif self.ppo():
        model = PPO2(MlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = PPOInterface(model, env.observation_space.shape[0])
    else:
        model = SAC(SACMlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = SACInterface(model, env.observation_space.shape[0])

    if "pretrain_data_path" in parameters:
        data_path = parameters["pretrain_data_path"]
        model.pretrain(ExpertDataset(expert_path=data_path, verbose=0), n_epochs=25)
    return model, interface
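# The "pretrain_data_path" entry above is expected to point at an
# ExpertDataset-compatible .npz archive. A sketch of one way such a file can be
# produced with stable-baselines' generate_expert_traj; the environment id,
# timestep budget and output name are illustrative placeholders.
from stable_baselines import TRPO
from stable_baselines.gail import generate_expert_traj

expert = TRPO('MlpPolicy', 'Pendulum-v0', verbose=1)
# Train the expert briefly, then record 10 of its episodes to expert_pendulum.npz
generate_expert_traj(expert, 'expert_pendulum', n_timesteps=10000, n_episodes=10)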
def train(params):
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # setup config
    if params.get("policy") == 'mlp':
        policy = MlpPolicy
        env = gym.make(params.get("environment"))
        env.configure(envConfig)
        env.reset()
    else:
        policy = CnnPolicy
        env = gym.make(params.get("environment"))
        env.configure(CnnNet)
        env.reset()

    exp_name = "{0}_{1}_{2}".format(params.get("model_name"), params.get("policy"), params.get("environment"))
    log_dir = './logs/' + exp_name

    if params.get("seed") > 0:
        workerseed = params.get("seed") + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)

    # create model
    model = TRPO(policy, env, verbose=1,
                 tensorboard_log=log_dir,
                 timesteps_per_batch=params.get("timesteps_per_batch"),
                 max_kl=params.get("max_kl"),
                 cg_iters=params.get("cg_iters"),
                 cg_damping=params.get("cg_damping"),
                 entcoeff=params.get("entcoeff"),
                 gamma=params.get("gamma"),
                 lam=params.get("lam"),
                 vf_iters=params.get("vf_iters"),
                 vf_stepsize=params.get("vf_stepsize")
                 # ,policy_kwargs=policy_kwargs
                 )

    model.learn(total_timesteps=params.get("train_steps"))
    model.save(exp_name)
    env.close()
    del env
def main():
    """
    Runs the test
    """
    # Leftover argparse helper from run_mujoco.py, kept commented out for reference:
    # Create an argparse.ArgumentParser for run_mujoco.py.
    # :return: (ArgumentParser) parser
    # Defaults: {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False}
    # parser = arg_parser()
    # parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    # parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    # parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    # parser.add_argument('--play', default=False, action='store_true')
    # return parser

    env_id = 'UR5Gripper-v0'
    model_path = '/tmp/gym/trpo_mpi/'
    # args = mujoco_arg_parser().parse_args()
    # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
    # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path)

    env = gym.make(env_id)
    env = Monitor(env, model_path, allow_early_resets=True)
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path)
    # TRPO.load is a classmethod: keep the returned model and re-attach the environment
    model = TRPO.load(model_path + "trpo.pkl", env=env)
    model.learn(total_timesteps=int(1e5), callback=callback)
    model.save(model_path + "trpo.pkl")
    # tf_util.save_state(model_path)

    # Enjoy trained agent
    obs = env.reset()
    for _ in range(100):
        obs = env.reset()
        env.render()
        for _ in range(200):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
def train(env_id, num_timesteps, seed, algorithm, model_save_file=None, log_dir=None):
    with tf_util.single_threaded_session():
        logger.configure(folder=log_dir, format_strs=['stdout', 'log', 'csv'])
        workerseed = seed + MPI.COMM_WORLD.Get_rank()
        env = make_mujoco_env(env_id, workerseed)
        if algorithm == "TRPO":
            model = TRPO(MlpPolicy, env, seed=workerseed, verbose=1)
        else:
            # Algorithm is PPO
            model = PPO1(MlpPolicy, env, seed=workerseed, verbose=1)
        model.learn(total_timesteps=num_timesteps)
        if model_save_file is not None:
            model.save(model_save_file)
        env.close()
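# Hypothetical invocation of the helper above; the environment id, timestep
# budget and output paths are placeholders.
if __name__ == "__main__":
    train("HalfCheetah-v2", num_timesteps=1000000, seed=0, algorithm="TRPO",
          model_save_file="./models/halfcheetah_trpo.zip", log_dir="./logs/halfcheetah_trpo")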
             verbose=1, seed=seed, avec_coef=1., vf_coef=0., tensorboard_log=log_dir)
model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-PPO")
# model.learn(total_timesteps=1000000, tb_log_name="tb/PPO")

######################## TRPO ###########################
log_dir = "./logs/%s/AVEC-TRPO_%s" % (env_id, seed)
# log_dir = "./logs/%s/TRPO_%s" % (env_id, seed)
os.makedirs(log_dir, exist_ok=True)
env = make_vec_env(env_id, 1, seed, monitor_dir=log_dir)
model = TRPO('MlpPolicy', env, verbose=1, avec_coef=1., vf_coef=0., tensorboard_log=log_dir)
model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-TRPO")
# model.learn(total_timesteps=1000000, tb_log_name="tb/TRPO")

######################### SAC #############################
log_dir = "./logs/%s/AVEC-SAC_%s" % (env_id, seed)
# log_dir = "./logs/%s/SAC_%s" % (env_id, seed)
os.makedirs(log_dir, exist_ok=True)
env = make_vec_env(env_id, 1, seed, monitor_dir=log_dir)
model = SAC('CustomSACPolicy', env, verbose=1, avec_coef=1., value_coef=0.,
class GAIL(ActorCriticRLModel):
    """
    Generative Adversarial Imitation Learning (GAIL)

    .. warning::

        Images are not yet handled properly by the current implementation

    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param expert_dataset: (ExpertDataset) the dataset manager
    :param gamma: (float) the discount value
    :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
    :param max_kl: (float) the Kullback-Leibler loss threshold
    :param cg_iters: (int) the number of iterations for the conjugate gradient calculation
    :param lam: (float) GAE factor
    :param entcoeff: (float) the weight for the entropy loss
    :param cg_damping: (float) the compute gradient dampening factor
    :param vf_stepsize: (float) the value function stepsize
    :param vf_iters: (int) the value function's number iterations for learning
    :param hidden_size: ([int]) the hidden dimension for the MLP
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param d_stepsize: (float) the reward giver stepsize
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    """

    def __init__(self, policy, env, expert_dataset=None, hidden_size_adversary=100, adversary_entcoeff=1e-3,
                 g_step=3, d_step=1, d_stepsize=3e-4, verbose=0, _init_setup_model=True, **kwargs):
        super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False,
                         _init_setup_model=_init_setup_model)

        self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)
        self.trpo.using_gail = True
        self.trpo.expert_dataset = expert_dataset
        self.trpo.g_step = g_step
        self.trpo.d_step = d_step
        self.trpo.d_stepsize = d_stepsize
        self.trpo.hidden_size_adversary = hidden_size_adversary
        self.trpo.adversary_entcoeff = adversary_entcoeff
        self.env = self.trpo.env

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        pass

    def pretrain(self, dataset, n_epochs=10, learning_rate=1e-4, adam_epsilon=1e-8, val_interval=None):
        self.trpo.pretrain(dataset, n_epochs=n_epochs, learning_rate=learning_rate,
                           adam_epsilon=adam_epsilon, val_interval=val_interval)
        return self

    def set_env(self, env):
        self.trpo.set_env(env)
        self.env = self.trpo.env

    def setup_model(self):
        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the GAIL model must be an " \
                                                           "instance of common.policies.ActorCriticPolicy."
        self.trpo.setup_model()

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL",
              reset_num_timesteps=True):
        assert self.trpo.expert_dataset is not None, "You must pass an expert dataset to GAIL for training"
        self.trpo.learn(total_timesteps, callback, seed, log_interval, tb_log_name, reset_num_timesteps)
        return self

    def predict(self, observation, state=None, mask=None, deterministic=False):
        return self.trpo.predict(observation, state=state, mask=mask, deterministic=deterministic)

    def action_probability(self, observation, state=None, mask=None, actions=None):
        return self.trpo.action_probability(observation, state=state, mask=mask, actions=actions)

    def save(self, save_path):
        self.trpo.save(save_path)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=None, _init_setup_model=False)
        model.trpo.__dict__.update(data)
        model.trpo.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.trpo.params, params):
            restores.append(param.assign(loaded_p))
        model.trpo.sess.run(restores)

        return model
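# A minimal usage sketch for the GAIL wrapper above, mirroring the stable-baselines
# GAIL example. The expert archive is assumed to have been recorded beforehand
# (e.g. with generate_expert_traj); the file name and environment id are placeholders.
from stable_baselines import GAIL
from stable_baselines.gail import ExpertDataset

dataset = ExpertDataset(expert_path='expert_pendulum.npz', traj_limitation=10, verbose=1)
model = GAIL('MlpPolicy', 'Pendulum-v0', dataset, verbose=1)
model.learn(total_timesteps=100000)
model.save("gail_pendulum")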
        total_timesteps=10000, seed=0),
    lambda e: ACKTR(policy=MlpPolicy, env=e, learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000, seed=0),
    lambda e: DeepQ(policy=deepq_models.mlp([32]), batch_size=16, gamma=0.1, exploration_fraction=0.001,
                    env=e).learn(total_timesteps=40000, seed=0),
    lambda e: PPO1(policy=MlpPolicy, env=e, lam=0.7, optim_batchsize=16,
                   optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
    lambda e: PPO2(policy=MlpPolicy, env=e, learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000, seed=0),
    lambda e: TRPO(policy=MlpPolicy, env=e, max_kl=0.05, lam=0.7).learn(total_timesteps=10000, seed=0),
]


@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy) can learn an identity transformation
    (i.e. return observation as an action)

    :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])
    model = learn_func(env)
class GAIL(ActorCriticRLModel):
    """
    Generative Adversarial Imitation Learning (GAIL)

    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount value
    :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
    :param max_kl: (float) the Kullback-Leibler loss threshold
    :param cg_iters: (int) the number of iterations for the conjugate gradient calculation
    :param lam: (float) GAE factor
    :param entcoeff: (float) the weight for the entropy loss
    :param cg_damping: (float) the compute gradient dampening factor
    :param vf_stepsize: (float) the value function stepsize
    :param vf_iters: (int) the value function's number iterations for learning
    :param pretrained_weight: (str) the save location for the pretrained weights
    :param hidden_size: ([int]) the hidden dimension for the MLP
    :param expert_dataset: (Dset) the dataset manager
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param task_name: (str) the name of the task (can be None)
    :param d_stepsize: (float) the reward giver stepsize
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """

    def __init__(self, policy, env, pretrained_weight=False, hidden_size_adversary=100, adversary_entcoeff=1e-3,
                 expert_dataset=None, save_per_iter=1, checkpoint_dir="/tmp/gail/ckpt/", g_step=1, d_step=1,
                 task_name="task_name", d_stepsize=3e-4, verbose=0, _init_setup_model=True, **kwargs):
        super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False,
                         _init_setup_model=_init_setup_model)

        self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)
        self.trpo.using_gail = True
        self.trpo.pretrained_weight = pretrained_weight
        self.trpo.expert_dataset = expert_dataset
        self.trpo.save_per_iter = save_per_iter
        self.trpo.checkpoint_dir = checkpoint_dir
        self.trpo.g_step = g_step
        self.trpo.d_step = d_step
        self.trpo.task_name = task_name
        self.trpo.d_stepsize = d_stepsize
        self.trpo.hidden_size_adversary = hidden_size_adversary
        self.trpo.adversary_entcoeff = adversary_entcoeff

        if _init_setup_model:
            self.setup_model()

    def set_env(self, env):
        super().set_env(env)
        self.trpo.set_env(env)

    def setup_model(self):
        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the GAIL model must be an " \
                                                           "instance of common.policies.ActorCriticPolicy."
        assert isinstance(self.action_space, gym.spaces.Box), "Error: GAIL requires a continuous action space."
        self.trpo.setup_model()

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL"):
        self.trpo.learn(total_timesteps, callback, seed, log_interval, tb_log_name)
        return self

    def predict(self, observation, state=None, mask=None, deterministic=False):
        return self.trpo.predict(observation, state, mask, deterministic=deterministic)

    def action_probability(self, observation, state=None, mask=None):
        return self.trpo.action_probability(observation, state, mask)

    def save(self, save_path):
        self.trpo.save(save_path)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=None, _init_setup_model=False)
        model.trpo.__dict__.update(data)
        model.trpo.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.trpo.params, params):
            restores.append(param.assign(loaded_p))
        model.trpo.sess.run(restores)

        return model
def main(env, load_path, fig_path):
    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = TRPO.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        if (obs == obs_initial).all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))

            current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1

        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)
def main():
    """
    Runs the test
    """
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default="/cvgl2/u/surajn/workspace/saved_models/sawyerlift_ppo2/model")
    parser.add_argument('--images', default=False)
    args = parser.parse_args()
    logger.configure()

    if not args.play:
        model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
                           model_path=args.model_path, images=args.images)

    if args.play:
        def make_env():
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=False,  # not needed since not using pixel obs
                    has_renderer=True,  # make sure we can render to the screen
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                ))
            env_out.reward_range = None
            env_out.metadata = None
            env_out.spec = None
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            return env_out

        # env = make_env()
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        policy = MlpPolicy
        # model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
        #              optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1)
        model = TRPO(MlpPolicy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                     entcoeff=0.0, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
        # TRPO.load is a classmethod: keep the returned model instead of discarding it
        model = TRPO.load(args.model_path, env=env)
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            env.render()
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]
import pytest

from stable_baselines.a2c import A2C
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

MODEL_FUNC_LIST = [
    lambda e: A2C(policy=MlpPolicy, env=e),
    lambda e: PPO1(policy=MlpPolicy, env=e),
    lambda e: PPO2(policy=MlpPolicy, env=e),
    lambda e: TRPO(policy=MlpPolicy, env=e),
]


@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy) can learn an identity transformation
    (i.e. return observation as an action) with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])
    model = model_func(env)