def ppo_setup(env, trainer, args):
    """Build a PPO algorithm from CLI args and register it with the trainer."""
    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[args.hidden_dim] * args.depth,
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    value_function = GaussianMLPValueFunction(
        env_spec=env.spec,
        hidden_sizes=[args.hidden_dim] * args.depth,
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               policy_optimizer=OptimizerWrapper(
                   (torch.optim.Adam, dict(lr=args.policy_lr)),
                   policy,
                   max_optimization_epochs=args.n_minibatches,
                   minibatch_size=args.minibatch_size),
               vf_optimizer=OptimizerWrapper(
                   (torch.optim.Adam, dict(lr=args.vf_lr)),
                   value_function,
                   max_optimization_epochs=args.n_minibatches,
                   minibatch_size=args.minibatch_size),
               **convert_kwargs(args, PPO))
    trainer.setup(algo,
                  env,
                  sampler_cls=LocalSampler,
                  worker_class=VecWorker,
                  worker_args={'n_envs': 8})
    return algo
def maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task,
                                 meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task for
            training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    ml1 = metaworld.ML1('push-v1')
    tasks = MetaWorldTaskSampler(ml1, 'train')
    env = tasks.sample(1)[0]()
    test_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                  env=MetaWorldSetTaskEnv(ml1, 'test'))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=[32, 32],
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   n_test_tasks=1,
                                   n_exploration_eps=rollouts_per_task)
    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=meta_batch_size)
    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    sampler=sampler,
                    task_sampler=tasks,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=rollouts_per_task * env.spec.max_episode_length)
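# --- Illustrative launcher (not part of the original script) ---
# In garage's example scripts, experiment functions like the one above are
# decorated with @wrap_experiment, which supplies `ctxt`, and then called
# directly. The hyperparameter values below are placeholders for this sketch,
# not values taken from the original code.
from garage import wrap_experiment

launch_maml_trpo_ml1_push = wrap_experiment(maml_trpo_metaworld_ml1_push)

if __name__ == '__main__':
    launch_maml_trpo_ml1_push(seed=1,
                              epochs=300,
                              rollouts_per_task=10,
                              meta_batch_size=20)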
def main(ctxt=None, seed=0):
    """Train TRPO on Pendulum-v0 with GAIL-imitated rewards from saved experts."""
    env = GymEnv('Pendulum-v0')
    experts = load_latest_experts(args.experts_dir, n=5)
    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[32, 32],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                discount=0.99,
                center_adv=False)
    trainer = IRLTrainer(ctxt)
    trainer.setup(algo,
                  env,
                  irl_model,
                  baseline,
                  n_itr=args.n_iter,
                  sampler_cls=MultiprocessingSampler,
                  zero_environment_reward=True)
    trainer.train(n_epochs=1, batch_size=args.batch_size)
def load_mamlvpg(env_name="MountainCarContinuous-v0"):
    """Return an instance of the MAML-VPG algorithm."""
    env = GarageEnv(env_name=env_name)
    policy = DeterministicMLPPolicy(name='policy',
                                    env_spec=env.spec,
                                    hidden_sizes=[64, 64])
    vfunc = GaussianMLPValueFunction(env_spec=env.spec)
    task_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(env, expected_action_scale=10.)))
    max_path_length = 100
    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)
    algo = MAMLVPG(env=env,
                   policy=policy,
                   value_function=vfunc,
                   max_path_length=max_path_length,
                   meta_batch_size=20,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)
    return algo
def setup_method(self):
    """Setup method which is called before every test."""
    self.env = normalize(GymEnv(HalfCheetahDirEnv(), max_episode_length=100),
                         expected_action_scale=10.)
    task_sampler = SetTaskSampler(lambda: normalize(
        GymEnv(HalfCheetahDirEnv()), expected_action_scale=10.))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec,
                                                   hidden_sizes=(32, 32))
    self.algo = MAMLPPO(env=self.env,
                        policy=self.policy,
                        sampler=None,
                        task_sampler=task_sampler,
                        value_function=self.value_function,
                        meta_batch_size=5,
                        discount=0.99,
                        gae_lambda=1.,
                        inner_lr=0.1,
                        num_grad_updates=1)
def run_garage_pytorch(env, seed, log_dir):
    """Create garage PyTorch VPG model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    env = TfEnv(normalize(env))

    deterministic.set_seed(seed)

    runner = LocalRunner(snapshot_config)

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=hyper_parameters['hidden_sizes'],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)
    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    algo = PyTorch_VPG(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=hyper_parameters['discount'],
                       center_adv=hyper_parameters['center_adv'])

    # Set up logger since we are not using run_experiment
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])

    dowel_logger.remove_all()

    return tabular_log_file
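# --- Illustrative call site (an assumption, not part of the original code) ---
# Benchmark helpers like run_garage_pytorch are typically invoked once per
# (env, seed) trial by a driver script, which then aggregates the returned
# CSV paths. The environment id and log directory below are placeholders.
import gym

csv_path = run_garage_pytorch(gym.make('HalfCheetah-v2'),
                              seed=1,
                              log_dir='/tmp/garage_pytorch_vpg/trial_1')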
def maml_trpo_metaworld_ml45(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task for
            training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    ml45 = metaworld.ML45()

    # pylint: disable=missing-return-doc,missing-return-type-doc
    def wrap(env, _):
        return normalize(env, expected_action_scale=10.0)

    train_task_sampler = MetaWorldTaskSampler(ml45, 'train', wrap)
    test_env = wrap(MetaWorldSetTaskEnv(ml45, 'test'), None)
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=test_env,
                                       wrapper=wrap)
    env = train_task_sampler.sample(45)[0]()

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    task_sampler=train_task_sampler,
                    policy=policy,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env, n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)
def load_ppo(env_name="CartPole-v0"):
    """Return an instance of the PPO algorithm."""
    env = GarageEnv(env_name=env_name)
    policy = DeterministicMLPPolicy(name='policy',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    vfunc = GaussianMLPValueFunction(env_spec=env.spec)
    algo = PPO(env_spec=env.spec, policy=policy, value_function=vfunc)
    return algo
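# --- Hypothetical loader registry (not in the original code) ---
# The load_* helpers above each return a constructed algorithm; a small
# registry like this is one way a benchmarking or profiling script could
# select between them by name.
ALGO_LOADERS = {
    'ppo': load_ppo,
    'maml_vpg': load_mamlvpg,
}


def load_algo(name, **kwargs):
    """Instantiate one of the benchmark algorithms above by name."""
    return ALGO_LOADERS[name](**kwargs)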
def mtppo_metaworld_mt50(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 50.

    """
    set_seed(seed)
    # Use the MT50 benchmark, matching the function name and the
    # multiple-of-50 constraint on n_tasks below.
    mt50 = metaworld.MT50()
    train_task_sampler = MetaWorldTaskSampler(mt50,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)
    assert n_tasks % 50 == 0
    assert n_tasks <= 2500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers)
    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)
    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def mttrpo_metaworld_mt1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)
    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                sampler=sampler,
                discount=0.99,
                gae_lambda=0.95)
    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def setup_method(self):
    """Setup method which is called before every test."""
    self._env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)
    self._runner = LocalRunner(snapshot_config)
    self._policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=torch.tanh,
                                     output_nonlinearity=None)
    self._params = {
        'env_spec': self._env.spec,
        'policy': self._policy,
        'value_function': GaussianMLPValueFunction(env_spec=self._env.spec),
        'discount': 0.99,
    }
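# --- Illustrative test body (an assumption, not from the original suite) ---
# A test using the fixture above would typically build the algorithm from the
# stored parameters and check that a short training run completes.
def test_trains_one_epoch(self):
    """Smoke-test a single training epoch with the stored parameters."""
    algo = PPO(**self._params)
    self._runner.setup(algo, self._env)
    self._runner.train(n_epochs=1, batch_size=100)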
def setup_method(self):
    """Setup method which is called before every test."""
    self.env = normalize(GymEnv('InvertedDoublePendulum-v2'))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec)
    deterministic.set_seed(0)
    self.sampler = LocalSampler(
        agents=self.policy,
        envs=self.env,
        max_episode_length=self.env.spec.max_episode_length,
        is_tf_worker=True)