def test_benchmark_maml(self, _):  # pylint: disable=no-self-use
    """Run the metarl MAML benchmark on ML1-Push, one run per seed.

    A distinct seed is drawn for every trial and each trial writes into
    its own directory below a timestamped benchmark folder.
    """
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = './data/local/benchmarks/maml-ml1-push/%s/' % timestamp
    result_json = {}
    env_id = 'ML1-Push'
    meta_env = TaskIdWrapper2(ML1WithPinnedGoal.get_train_tasks('push-v1'))
    seeds = random.sample(range(100), hyper_parameters['n_trials'])
    task_dir = osp.join(benchmark_dir, env_id)
    plt_file = osp.join(benchmark_dir, '{}_benchmark.png'.format(env_id))
    promp_csvs, metarl_csvs = [], []

    # NOTE(review): result_json, plt_file, promp_csvs and promp_dir are
    # not read in the visible part of this function -- presumably they
    # are used further down; confirm before removing them.
    for trial_no, seed in enumerate(seeds, start=1):
        trial_dir = task_dir + '/trial_%d_seed_%d' % (trial_no, seed)
        metarl_dir = trial_dir + '/metarl'
        promp_dir = trial_dir + '/promp'

        if not test_metarl:
            continue

        # Run metarl algorithm on a freshly wrapped environment.
        env = MetaRLEnv(normalize(meta_env, expected_action_scale=10.))
        metarl_csvs.append(run_metarl(env, seed, metarl_dir))
        env.close()
class TestPPO:
    """Smoke test for the PPO algorithm."""

    def setup_method(self):
        """Create a fresh environment, policy and baseline per test."""
        env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        self.env = env
        self.policy = GaussianMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=(64, 64),
                                        hidden_nonlinearity=torch.tanh,
                                        output_nonlinearity=None)
        self.baseline = LinearFeatureBaseline(env_spec=env.spec)

    def teardown_method(self):
        """Close the environment created in ``setup_method``."""
        self.env.close()

    def test_ppo_pendulum(self):
        """Train PPO for 10 epochs and expect a positive average return."""
        deterministic.set_seed(0)

        runner = LocalRunner(snapshot_config)
        ppo_kwargs = dict(
            env_spec=self.env.spec,
            policy=self.policy,
            baseline=self.baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=2e-1,
        )
        algo = PPO(**ppo_kwargs)

        runner.setup(algo, self.env)
        final_return = runner.train(n_epochs=10, batch_size=100)
        assert final_return > 0
class TestMAML:
    """Tests for the policy-copy and adaptation steps of MAML."""

    def setup_method(self):
        """Create environment, policy, baseline and a MAMLPPO instance."""
        env = MetaRLEnv(
            normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
        self.env = env
        self.policy = GaussianMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=(64, 64),
                                        hidden_nonlinearity=torch.tanh,
                                        output_nonlinearity=None)
        self.baseline = LinearFeatureBaseline(env_spec=env.spec)
        self.algo = MAMLPPO(env=env,
                            policy=self.policy,
                            baseline=self.baseline,
                            max_path_length=100,
                            meta_batch_size=5,
                            discount=0.99,
                            gae_lambda=1.,
                            inner_lr=0.1,
                            num_grad_updates=1)

    def teardown_method(self):
        """Close the environment created in ``setup_method``."""
        self.env.close()

    def test_get_exploration_policy(self, set_params, test_params):
        """Check that the exploration policy is an independent copy."""
        original = self.policy
        original.apply(partial(set_params, 0.1))

        explorer = self.algo.get_exploration_policy()
        explorer.apply(partial(set_params, 0.2))

        # Writing to the copy must leave the original policy untouched.
        original.apply(partial(test_params, 0.1))
        explorer.apply(partial(test_params, 0.2))

    def test_adapt_policy(self, set_params, test_params):
        """Check that adaptation changes the copy but not the original."""
        factory = WorkerFactory(seed=100, max_path_length=100)
        sampler = LocalSampler.from_worker_factory(factory, self.policy,
                                                   self.env)

        self.policy.apply(partial(set_params, 0.1))
        adapted = self.algo.get_exploration_policy()
        samples = sampler.obtain_samples(0, 100, adapted)
        self.algo.adapt_policy(adapted, samples)

        # The original policy keeps the values written before adaptation.
        self.policy.apply(partial(test_params, 0.1))

        # At least one parameter tensor of the adapted policy must differ.
        param_pairs = zip(adapted.parameters(), self.policy.parameters())
        any_changed = any(
            new.data.ne(old.data).sum() > 0 for new, old in param_pairs)
        if not any_changed:
            pytest.fail("Parameters of adapted policy should not be "
                        "identical to the old policy.")
def setup_method(self):
    """Create a fresh environment, policy and baseline per test."""
    env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    self.env = env
    self.policy = GaussianMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=(64, 64),
                                    hidden_nonlinearity=torch.tanh,
                                    output_nonlinearity=None)
    self.baseline = LinearFeatureBaseline(env_spec=env.spec)
def setup_method(self):
    """Create a fresh environment, policy and baseline per test."""
    wrapped = normalize(HalfCheetahDirEnv(), expected_action_scale=10.)
    env = MetaRLEnv(wrapped)
    self.env = env
    self.policy = GaussianMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=(64, 64),
                                    hidden_nonlinearity=torch.tanh,
                                    output_nonlinearity=None)
    self.baseline = LinearFeatureBaseline(env_spec=env.spec)
def test_benchmark_pearl(self):
    """Benchmark metarl PEARL on the ML1 ``reach-v1`` task.

    One training run is launched per sampled seed; afterwards the test
    success rate is plotted over the trials and a JSON summary is
    written.
    """
    train_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('reach-v1'))))
    env = train_sampler.sample(params['num_train_tasks'])
    test_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('reach-v1'))))
    # NOTE(review): the test tasks are sampled with 'num_train_tasks' --
    # confirm this count is intended.
    test_env = test_sampler.sample(params['num_train_tasks'])

    env_id = 'reach-v1'
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = osp.join(os.getcwd(), 'data', 'local', 'benchmarks',
                             'pearl', timestamp)
    result_json = {}
    seeds = random.sample(range(100), params['n_trials'])
    task_dir = osp.join(benchmark_dir, env_id)
    plt_file = osp.join(benchmark_dir, '{}_benchmark.png'.format(env_id))

    metarl_csvs = []
    for trial_no, seed in enumerate(seeds, start=1):
        trial_dir = task_dir + '/trial_%d_seed_%d' % (trial_no, seed)
        metarl_dir = trial_dir + '/metarl'
        metarl_csvs.append(run_metarl(env, test_env, seed, metarl_dir))

    # NOTE(review): assumes the object returned by the sampler exposes
    # close() -- verify against SetTaskSampler.sample.
    env.close()

    metric = 'Test/Average/SuccessRate'
    benchmark_helper.plot_average_over_trials(
        [metarl_csvs],
        ys=[metric],
        plt_file=plt_file,
        env_id=env_id,
        x_label='TotalEnvSteps',
        y_label=metric,
        names=['metarl_pearl'],
    )

    steps_per_batch = params['meta_batch_size'] * params['max_path_length']
    result_json[env_id] = benchmark_helper.create_json(
        [metarl_csvs],
        seeds=seeds,
        trials=params['n_trials'],
        xs=['TotalEnvSteps'],
        ys=[metric],
        factors=[steps_per_batch],
        names=['metarl_pearl'])

    Rh.write_file(result_json, 'PEARL')
def setup_method(self):
    """Create environment, runner, policy and the shared VPG arguments."""
    env = MetaRLEnv(gym.make('InvertedDoublePendulum-v2'))
    self._env = env
    self._runner = LocalRunner(snapshot_config)
    self._policy = GaussianMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=torch.tanh,
                                     output_nonlinearity=None)
    # Keyword arguments shared by the tests; each test adds its own
    # options on top before constructing the algorithm.
    self._params = dict(
        env_spec=env.spec,
        policy=self._policy,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        max_path_length=100,
        discount=0.99,
    )
class TestMAMLPPO:
    """Smoke test for MAML-PPO."""

    def setup_method(self):
        """Create a fresh environment, policy and baseline per test."""
        env = MetaRLEnv(
            normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
        self.env = env
        self.policy = GaussianMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=(64, 64),
                                        hidden_nonlinearity=torch.tanh,
                                        output_nonlinearity=None)
        self.baseline = LinearFeatureBaseline(env_spec=env.spec)

    def teardown_method(self):
        """Close the environment created in ``setup_method``."""
        self.env.close()

    def test_ppo_pendulum(self):
        """Train MAML-PPO on HalfCheetahDirEnv and check the final return.

        The method name mentions "pendulum", but the environment built
        in ``setup_method`` is ``HalfCheetahDirEnv``.
        """
        deterministic.set_seed(0)

        max_path_length = 100
        rollouts_per_task = 5
        batch_size = rollouts_per_task * max_path_length

        runner = LocalRunner(snapshot_config)
        maml_kwargs = dict(
            env=self.env,
            policy=self.policy,
            baseline=self.baseline,
            max_path_length=max_path_length,
            meta_batch_size=5,
            discount=0.99,
            gae_lambda=1.,
            inner_lr=0.1,
            num_grad_updates=1,
        )
        algo = MAMLPPO(**maml_kwargs)

        runner.setup(algo, self.env)
        final_return = runner.train(n_epochs=10, batch_size=batch_size)

        assert final_return > -5
def setup_method(self):
    """Create environment, policy, baseline and a MAMLPPO instance."""
    wrapped = normalize(HalfCheetahDirEnv(), expected_action_scale=10.)
    env = MetaRLEnv(wrapped)
    self.env = env
    self.policy = GaussianMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=(64, 64),
                                    hidden_nonlinearity=torch.tanh,
                                    output_nonlinearity=None)
    self.baseline = LinearFeatureBaseline(env_spec=env.spec)
    self.algo = MAMLPPO(env=env,
                        policy=self.policy,
                        baseline=self.baseline,
                        max_path_length=100,
                        meta_batch_size=5,
                        discount=0.99,
                        gae_lambda=1.,
                        inner_lr=0.1,
                        num_grad_updates=1)
def test_benchmark_sac(self):
    """Run the metarl SAC benchmark on the listed MuJoCo tasks.

    Each task is trained three times, once for every fixed seed, and
    each run writes into its own trial directory.
    """
    mujoco1m = benchmarks.get_benchmark('Mujoco1M')
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = osp.join(os.getcwd(), 'data', 'local', 'benchmarks',
                             'sac', timestamp)

    # NOTE(review): mujoco1m, plt_file and relplt_file are not read in
    # the visible part of this function -- presumably used further
    # down; confirm before removing them.
    for task in ['HalfCheetah-v2']:
        env = MetaRLEnv(normalize(gym.make(task)))
        seeds = [121, 524, 4]

        task_dir = osp.join(benchmark_dir, task)
        plt_file = osp.join(benchmark_dir, '{}_benchmark.png'.format(task))
        relplt_file = osp.join(benchmark_dir,
                               '{}_benchmark_mean.png'.format(task))

        metarl_csvs = []
        for trial_no, seed in enumerate(seeds, start=1):
            env.reset()
            trial_name = 'trial_{}_seed_{}'.format(trial_no, seed)
            metarl_dir = osp.join(task_dir, trial_name, 'metarl')

            # Run metarl algorithms
            metarl_csvs.append(run_metarl(env, seed, metarl_dir))

        env.close()
def test_ddpg_pendulum(self):
    """Test DDPG with Pendulum environment.

    This environment has a [-3, 3] action_space bound.

    DDPG is trained for 10 epochs on ``InvertedPendulum-v2`` and the
    last average return must exceed 10. The environment is closed in a
    ``finally`` clause so it is released even when training raises or
    the assertion fails.
    """
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    env = MetaRLEnv(normalize(gym.make('InvertedPendulum-v2')))
    try:
        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = DeterministicMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu,
                                        output_nonlinearity=torch.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        algo = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=20,
                    n_train_steps=50,
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    target_update_tau=1e-2,
                    discount=0.9)

        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 10
    finally:
        # Previously close() ran only after a passing assertion, so a
        # failing test left the environment open.
        env.close()
def run_task(snapshot_config, *_):
    """Train SAC on HalfCheetah-v2 with a local runner.

    Args:
        snapshot_config: Passed straight to ``LocalRunner``.
        _ : Unused parameters

    """
    runner = LocalRunner(snapshot_config)
    env = MetaRLEnv(normalize(gym.make('HalfCheetah-v2')))
    spec = env.spec

    policy = TanhGaussianMLPPolicy2(env_spec=spec,
                                    hidden_sizes=[256, 256],
                                    hidden_nonlinearity=nn.ReLU,
                                    output_nonlinearity=None,
                                    min_std=np.exp(-20.),
                                    max_std=np.exp(2.))

    # Two Q-functions built with identical settings.
    qf1 = ContinuousMLPQFunction(env_spec=spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=1)

    sampler_args = dict(agent=policy, max_path_length=1000)

    sac = SAC(env_spec=spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=1000,
              use_automatic_entropy_tuning=True,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.)

    runner.setup(algo=sac,
                 env=env,
                 sampler_cls=SimpleSampler,
                 sampler_args=sampler_args)
    runner.train(n_epochs=1000, batch_size=1000)
def run_task(snapshot_config, *_):
    """Train DDPG on InvertedDoublePendulum-v2 with a local runner.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    runner = LocalRunner(snapshot_config)
    env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    spec = env.spec

    action_noise = OUStrategy(spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    # The policy optimizer is given as an (optimizer class, keyword
    # arguments) pair; the Q-function optimizer as a bare class.
    adagrad_options = {'lr': 1e-4, 'lr_decay': 0.99}
    policy_optimizer = (torch.optim.Adagrad, adagrad_options)

    ddpg = DDPG(env_spec=spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)
    runner.train(n_epochs=500, batch_size=100)
def run_task(snapshot_config, *_):
    """Train MAML-VPG on HalfCheetahDirEnv with a local runner.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    wrapped = normalize(HalfCheetahDirEnv(), expected_action_scale=10.)
    env = MetaRLEnv(wrapped)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(64, 64),
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    max_path_length = 100
    rollouts_per_task = 20
    batch_size = rollouts_per_task * max_path_length

    runner = LocalRunner(snapshot_config)
    maml_kwargs = dict(
        env=env,
        policy=policy,
        baseline=baseline,
        max_path_length=max_path_length,
        meta_batch_size=40,
        discount=0.99,
        gae_lambda=1.,
        inner_lr=0.1,
        num_grad_updates=1,
    )
    algo = MAMLVPG(**maml_kwargs)

    runner.setup(algo, env)
    runner.train(n_epochs=300, batch_size=batch_size)
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Samples ML1 ``push-v1`` train and test tasks, builds the context
    encoder, Q-functions, value function and context-conditioned policy
    from ``params``, and trains PEARLSAC with a local runner.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    # create multi-task environment and sample tasks
    env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])
    test_env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))
    # The algorithm below is told there are params['num_test_tasks']
    # test tasks, so sample that many test environments (previously the
    # train-task count was used here).
    test_env = test_env_sampler.sample(params['num_test_tasks'])

    runner = LocalRunner(snapshot_config)

    # Build a single environment instance and read both space shapes
    # from it, instead of constructing one instance per lookup.
    probe_env = env[0]()
    obs_dim = int(np.prod(probe_env.observation_space.shape))
    action_dim = int(np.prod(probe_env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_size = params['latent_size']
    net_size = params['net_size']
    hidden_sizes = [net_size, net_size, net_size]

    # Encoder input is one (observation, action, reward) transition;
    # its output is twice the latent size.
    encoder_in_dim = obs_dim + action_dim + reward_dim
    encoder_out_dim = latent_size * 2
    context_encoder = MLPEncoder(input_dim=encoder_in_dim,
                                 output_dim=encoder_out_dim,
                                 hidden_sizes=[200, 200, 200])

    # Spec whose observation space is widened by the latent size; used
    # by the Q-functions and the inner policy.
    space_a = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + latent_size, ),
                       dtype=np.float32)
    space_b = akro.Box(low=-1, high=1, shape=(action_dim, ),
                       dtype=np.float32)
    augmented_env = EnvSpec(space_a, space_b)

    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=list(hidden_sizes))
    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=list(hidden_sizes))

    # The value function reuses the Q-function class with a spec whose
    # "action" space has the latent size.
    obs_space = akro.Box(low=-1, high=1, shape=(obs_dim, ),
                         dtype=np.float32)
    action_space = akro.Box(low=-1,
                            high=1,
                            shape=(latent_size, ),
                            dtype=np.float32)
    vf_env = EnvSpec(obs_space, action_space)
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=list(hidden_sizes))

    policy = TanhGaussianMLPPolicy2(env_spec=augmented_env,
                                    hidden_sizes=list(hidden_sizes))

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=latent_size,
        context_encoder=context_encoder,
        policy=policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    pearlsac = PEARLSAC(
        env=env,
        test_env=test_env,
        policy=context_conditioned_policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=latent_size,
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        num_evals=params['num_evals'],
        num_steps_per_eval=params['num_steps_per_eval'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
    )

    tu.set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearlsac.to()

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))

    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
def test_benchmark_pearl(self):
    """Benchmark metarl PEARL on the ML10 task set.

    One training run is launched per sampled seed; afterwards the test
    success rate is plotted over the trials and a JSON summary is
    written.
    """

    def build_envs(split):
        """Instantiate and wrap every ML10 environment of ``split``."""
        split_args = ML10_ARGS[split]
        return [
            TaskIdWrapper(MetaRLEnv(
                IgnoreDoneWrapper(
                    normalize(
                        env_cls(*split_args[name]['args'],
                                **split_args[name]['kwargs'])))),
                          task_id=idx,
                          task_name=name)
            for idx, (name, env_cls) in enumerate(ML10_ENVS[split].items())
        ]

    train_envs = build_envs('train')
    test_envs = build_envs('test')

    env = EnvPoolSampler(train_envs).sample(params['num_train_tasks'])
    test_env = EnvPoolSampler(test_envs).sample(params['num_test_tasks'])

    env_id = 'ML10'
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = osp.join(os.getcwd(), 'data', 'local', 'benchmarks',
                             'pearl', timestamp)
    result_json = {}
    seeds = random.sample(range(100), params['n_trials'])
    task_dir = osp.join(benchmark_dir, env_id)
    plt_file = osp.join(benchmark_dir, '{}_benchmark.png'.format(env_id))

    metarl_csvs = []
    for trial_no, seed in enumerate(seeds, start=1):
        trial_dir = task_dir + '/trial_%d_seed_%d' % (trial_no, seed)
        metarl_dir = trial_dir + '/metarl'
        metarl_csvs.append(run_metarl(env, test_env, seed, metarl_dir))

    # NOTE(review): assumes the object returned by the sampler exposes
    # close() -- verify against EnvPoolSampler.sample.
    env.close()

    metric = 'Test/Average/SuccessRate'
    benchmark_helper.plot_average_over_trials(
        [metarl_csvs],
        ys=[metric],
        plt_file=plt_file,
        env_id=env_id,
        x_label='TotalEnvSteps',
        y_label=metric,
        names=['metarl_pearl'],
    )

    steps_per_batch = params['meta_batch_size'] * params['max_path_length']
    result_json[env_id] = benchmark_helper.create_json(
        [metarl_csvs],
        seeds=seeds,
        trials=params['n_trials'],
        xs=['TotalEnvSteps'],
        ys=[metric],
        factors=[steps_per_batch],
        names=['metarl_pearl'])

    Rh.write_file(result_json, 'PEARL')
class TestVPG:
    """Tests for VPG under its different entropy configurations."""

    @classmethod
    def setup_class(cls):
        """Seed the random generators once for the whole class."""
        deterministic.set_seed(0)

    def setup_method(self):
        """Create environment, runner, policy and shared VPG arguments."""
        env = MetaRLEnv(gym.make('InvertedDoublePendulum-v2'))
        self._env = env
        self._runner = LocalRunner(snapshot_config)
        self._policy = GaussianMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=torch.tanh,
                                         output_nonlinearity=None)
        self._params = dict(
            env_spec=env.spec,
            policy=self._policy,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            max_path_length=100,
            discount=0.99,
        )

    def teardown_method(self):
        """Close the environment created in ``setup_method``."""
        self._env.close()

    def _train_vpg(self):
        """Build VPG from ``self._params``, train it, return the result."""
        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        return self._runner.train(n_epochs=10, batch_size=100)

    def test_vpg_no_entropy(self):
        """Train VPG with positive advantages and softplus entropy set."""
        self._params.update(positive_adv=True, use_softplus_entropy=True)
        assert self._train_vpg() > 0

    def test_vpg_max(self):
        """Train VPG with the 'max' entropy method."""
        self._params.update(center_adv=False,
                            stop_entropy_gradient=True,
                            entropy_method='max')
        assert self._train_vpg() > 0

    def test_vpg_regularized(self):
        """Train VPG with the 'regularized' entropy method."""
        self._params.update(entropy_method='regularized')
        assert self._train_vpg() > 0

    @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIG)
    def test_invalid_entropy_config(self, algo_param, error, msg):
        """Check that an invalid entropy config makes VPG raise."""
        self._params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**self._params)