def test_pickling(task):
    """Check that an ML1 environment survives a pickle round trip.

    Args:
        task (str): ML1 task name. Names found in
            ``HARD_MODE_CLS_DICT['train']`` are built with
            ``ML1.get_train_tasks``; every other name is built with
            ``ML1.get_test_tasks``.

    """
    is_train_task = task in HARD_MODE_CLS_DICT['train']
    make_env = ML1.get_train_tasks if is_train_task else ML1.get_test_tasks

    original = make_env(task)
    restored = pkl.loads(pkl.dumps(original))

    # An ML1 benchmark is expected to wrap exactly one task, and the copy
    # must agree with the original on both the task name and its goals.
    assert len(original._task_names) == 1
    assert len(original._task_names) == len(restored._task_names)
    assert original._task_names[0] == restored._task_names[0]
    np.testing.assert_equal(original._discrete_goals,
                            restored._discrete_goals)
def test_benchmark_pearl(self):
    """Benchmark metarl's PEARL implementation on ML1 ``reach-v1``.

    Runs ``params['n_trials']`` trials with distinct random seeds, plots the
    ``Test/Average/SuccessRate`` curve over the trials, and passes a JSON
    summary to ``Rh.write_file`` under the name ``'PEARL'``.

    """
    env_id = 'reach-v1'
    success_rate_key = 'Test/Average/SuccessRate'

    train_task_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('reach-v1'))))
    env = train_task_sampler.sample(params['num_train_tasks'])
    test_task_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('reach-v1'))))
    # NOTE(review): the test tasks are sampled with ``num_train_tasks`` --
    # confirm this is intended rather than ``num_test_tasks``.
    test_env = test_task_sampler.sample(params['num_train_tasks'])

    # Results go to data/local/benchmarks/pearl/<timestamp>/ under the cwd.
    now = datetime.datetime.now()
    benchmark_dir = osp.join(os.getcwd(), 'data', 'local', 'benchmarks',
                             'pearl', now.strftime('%Y-%m-%d-%H-%M-%S-%f'))

    seeds = random.sample(range(100), params['n_trials'])
    task_dir = osp.join(benchmark_dir, env_id)
    plt_file = osp.join(benchmark_dir, '{}_benchmark.png'.format(env_id))

    metarl_csvs = []
    for trial_number, seed in enumerate(seeds, start=1):
        trial_dir = task_dir + '/trial_%d_seed_%d' % (trial_number, seed)
        metarl_csvs.append(
            run_metarl(env, test_env, seed, trial_dir + '/metarl'))

    # NOTE(review): ``env`` is whatever ``SetTaskSampler.sample`` returned --
    # confirm that object actually exposes ``close()``.
    env.close()

    benchmark_helper.plot_average_over_trials(
        [metarl_csvs],
        ys=[success_rate_key],
        plt_file=plt_file,
        env_id=env_id,
        x_label='TotalEnvSteps',
        y_label=success_rate_key,
        names=['metarl_pearl'],
    )

    factor_val = params['meta_batch_size'] * params['max_path_length']
    result_json = {
        env_id:
        benchmark_helper.create_json([metarl_csvs],
                                     seeds=seeds,
                                     trials=params['n_trials'],
                                     xs=['TotalEnvSteps'],
                                     ys=[success_rate_key],
                                     factors=[factor_val],
                                     names=['metarl_pearl'])
    }

    Rh.write_file(result_json, 'PEARL')
def maml_trpo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size):
    """Train MAML-TRPO on the ML1 ``push-v1`` benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): Experiment configuration
            handed to LocalRunner, which uses it to create the snapshotter.
        seed (int): Seed for the random number generators, for determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of training rollouts collected per
            task in each epoch.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)

    train_tasks = ML1.get_train_tasks('push-v1')
    env = GarageEnv(normalize(train_tasks, expected_action_scale=10.))

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(100, 100),
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    value_function = LinearFeatureBaseline(env_spec=env.spec)

    max_path_length = 100

    # NOTE(review): unlike the training env, the evaluation envs are
    # normalized without ``expected_action_scale`` -- confirm this is
    # intentional.
    test_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))
    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_path_length=max_path_length)

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    # One training batch holds ``rollouts_per_task`` full-length paths.
    steps_per_batch = rollouts_per_task * max_path_length
    runner.train(n_epochs=epochs, batch_size=steps_per_batch)
def test_all_ml1(name):
    """Smoke-test the train and test variants of one ML1 benchmark.

    For each variant, 11 tasks are sampled; the environment is switched to
    each task in turn and driven through ``step_env`` with
    ``max_path_length=3``.

    Args:
        name (str): ML1 task name passed to ``ML1.get_train_tasks`` and
            ``ML1.get_test_tasks``.

    """
    for make_env in (ML1.get_train_tasks, ML1.get_test_tasks):
        env = make_env(name)
        try:
            for task in env.sample_tasks(11):
                env.set_task(task)
                step_env(env, max_path_length=3)
        finally:
            # Close the environment even when a task fails, so a failing
            # case does not leave it open for the rest of the test session.
            env.close()
        # Drop the reference before the next environment is built.
        del env
def __init__(self,
             max_episode_steps=150,
             out_of_distribution=False,
             n_train_tasks=50,
             n_test_tasks=10,
             **kwargs):
    """Build the ML1 ``reach-v1`` train/test environments and their tasks.

    Args:
        max_episode_steps (int): Value stored in ``_max_episode_steps``.
        out_of_distribution (bool): Forwarded to ``ML1.get_train_tasks`` and
            ``ML1.get_test_tasks``.
        n_train_tasks (int): Number of tasks sampled from the train env.
        n_test_tasks (int): Number of tasks sampled from the test env.
        **kwargs: Accepted but not used by this constructor.

    """
    super(ReachML1Env, self).__init__()

    task_name = 'reach-v1'
    ml1_kwargs = dict(out_of_distribution=out_of_distribution)
    self.train_env = ML1.get_train_tasks(task_name, **ml1_kwargs)
    self.test_env = ML1.get_test_tasks(task_name, **ml1_kwargs)

    # Train tasks come first in ``tasks``, followed by the test tasks.
    self.train_tasks = self.train_env.sample_tasks(n_train_tasks)
    self.test_tasks = self.test_env.sample_tasks(n_test_tasks)
    self.tasks = self.train_tasks + self.test_tasks

    # Active environment; starts as the train env and is meant to be
    # switched according to the task index.
    self.env = self.train_env
    self.observation_space = self.env.observation_space
    self.action_space = self.env.action_space

    self.goal_space_origin = np.array([0, 0.85, 0.175])
    self.current_task_idx = 0
    self.episode_steps = 0
    self._max_episode_steps = max_episode_steps
def run_task(snapshot_config, *_):
    """Train PEARL-SAC on the ML1 ``push-v1`` benchmark.

    All hyperparameters are read from the module-level ``params`` mapping.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """

    def unit_box(dim):
        """Return a float32 Box spanning [-1, 1] with shape ``(dim, )``."""
        return akro.Box(low=-1, high=1, shape=(dim, ), dtype=np.float32)

    # Multi-task environments: sample the train and test tasks.
    train_task_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = train_task_sampler.sample(params['num_train_tasks'])
    test_task_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))
    # NOTE(review): the test tasks are sampled with ``num_train_tasks`` --
    # confirm this is intended rather than ``num_test_tasks``.
    test_env = test_task_sampler.sample(params['num_train_tasks'])

    runner = LocalRunner(snapshot_config)

    obs_dim = int(np.prod(env[0]().observation_space.shape))
    action_dim = int(np.prod(env[0]().action_space.shape))
    reward_dim = 1
    latent_size = params['latent_size']
    net_size = params['net_size']

    # The context encoder consumes (obs, action, reward) transitions and
    # emits two values per latent dimension.
    context_encoder = MLPEncoder(input_dim=obs_dim + action_dim + reward_dim,
                                 output_dim=latent_size * 2,
                                 hidden_sizes=[200, 200, 200])

    # Q-functions and the policy see the observation with the latent
    # context appended.
    augmented_env = EnvSpec(unit_box(obs_dim + latent_size),
                            unit_box(action_dim))
    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size] * 3)
    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size] * 3)

    # The value function reuses the Q-function class; its "action" slot has
    # the size of the latent context.
    vf_env = EnvSpec(unit_box(obs_dim), unit_box(latent_size))
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size] * 3)

    policy = TanhGaussianMLPPolicy2(env_spec=augmented_env,
                                    hidden_sizes=[net_size] * 3)

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=latent_size,
        context_encoder=context_encoder,
        policy=policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    # Hyperparameters forwarded to PEARLSAC under their own names.
    forwarded_keys = (
        'num_train_tasks',
        'num_test_tasks',
        'meta_batch_size',
        'num_steps_per_epoch',
        'num_initial_steps',
        'num_tasks_sample',
        'num_steps_prior',
        'num_extra_rl_steps_posterior',
        'num_evals',
        'num_steps_per_eval',
        'batch_size',
        'embedding_batch_size',
        'embedding_mini_batch_size',
        'max_path_length',
        'reward_scale',
    )
    algo_kwargs = {key: params[key] for key in forwarded_keys}

    pearlsac = PEARLSAC(env=env,
                        test_env=test_env,
                        policy=context_conditioned_policy,
                        qf1=qf1,
                        qf2=qf2,
                        vf=vf,
                        latent_dim=latent_size,
                        **algo_kwargs)

    use_gpu = params['use_gpu']
    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearlsac.to()

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))

    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
def test_pearl_ml1_push(self):
    """Run one small PEARL training epoch on the ML1 push environment."""
    params = {
        'seed': 1,
        'num_epochs': 1,
        'num_train_tasks': 5,
        'num_test_tasks': 1,
        'latent_size': 7,
        'encoder_hidden_sizes': [10, 10, 10],
        'net_size': 30,
        'meta_batch_size': 16,
        'num_steps_per_epoch': 40,
        'num_initial_steps': 40,
        'num_tasks_sample': 15,
        'num_steps_prior': 15,
        'num_extra_rl_steps_posterior': 15,
        'batch_size': 256,
        'embedding_batch_size': 8,
        'embedding_mini_batch_size': 8,
        'max_path_length': 50,
        'reward_scale': 10.,
        'use_information_bottleneck': True,
        'use_next_obs_in_context': False,
        'use_gpu': False,
    }

    net_size = params['net_size']
    latent_size = params['latent_size']
    set_seed(params['seed'])

    train_task_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = train_task_sampler.sample(params['num_train_tasks'])
    test_env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))

    # Q-function and inner policy share the latent-augmented env spec; the
    # value function gets its own spec from PEARL.get_env_spec.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size] * 3)

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size] * 3)

    inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_env,
                                         hidden_sizes=[net_size] * 3)

    # Hyperparameters forwarded to PEARL under their own names.
    forwarded_keys = (
        'num_train_tasks',
        'num_test_tasks',
        'encoder_hidden_sizes',
        'meta_batch_size',
        'num_steps_per_epoch',
        'num_initial_steps',
        'num_tasks_sample',
        'num_steps_prior',
        'num_extra_rl_steps_posterior',
        'batch_size',
        'embedding_batch_size',
        'embedding_mini_batch_size',
        'max_path_length',
        'reward_scale',
    )
    algo_kwargs = {key: params[key] for key in forwarded_keys}

    pearl = PEARL(env=env,
                  policy_class=ContextConditionedPolicy,
                  encoder_class=MLPEncoder,
                  inner_policy=inner_policy,
                  qf=qf,
                  vf=vf,
                  latent_dim=latent_size,
                  test_env_sampler=test_env_sampler,
                  **algo_kwargs)

    use_gpu = params['use_gpu']
    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner = LocalRunner(snapshot_config)
    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
def torch_pearl_ml1_push(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=50,
                         num_test_tasks=10,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         max_path_length=150,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL on the ML1 ``push-v1`` benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): Experiment configuration
            handed to LocalRunner, which uses it to create the snapshotter.
        seed (int): Seed for the random number generators, for determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of training tasks.
        num_test_tasks (int): Number of test tasks.
        latent_size (int): Size of the latent context vector.
        encoder_hidden_size (int): Output dimension of each dense layer of
            the context encoder (three such layers are used).
        net_size (int): Output dimension of each dense layer of the
            Q-function, the value function and the inner policy.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions collected per task
            before training starts.
        num_tasks_sample (int): Number of random tasks to collect data from
            in each iteration.
        num_steps_prior (int): Number of transitions collected per task
            with z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            collected per task with z ~ posterior; these train only the
            policy and NOT the encoder.
        batch_size (int): Number of transitions in an RL batch.
        embedding_batch_size (int): Number of transitions in a context
            batch.
        embedding_mini_batch_size (int): Number of transitions in a mini
            context batch; should equal embedding_batch_size for a
            non-recurrent encoder.
        max_path_length (int): Maximum path length.
        reward_scale (float): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, ) * 3

    # Multi-task environments: sample the training tasks now; the test
    # sampler is handed to the meta evaluator further down.
    train_task_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = train_task_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))

    runner = LocalRunner(ctxt)

    # Networks: Q-function and inner policy share the latent-augmented env
    # spec; the value function gets its own spec from PEARL.get_env_spec.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size] * 3)

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size] * 3)

    inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_env,
                                         hidden_sizes=[net_size] * 3)

    algo_kwargs = dict(
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )
    pearl = PEARL(env=env,
                  policy_class=ContextConditionedPolicy,
                  encoder_class=MLPEncoder,
                  inner_policy=inner_policy,
                  qf=qf,
                  vf=vf,
                  **algo_kwargs)

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    # The evaluator is attached after runner.setup, as a plain attribute on
    # the algorithm.
    evaluation_worker_args = dict(deterministic=True, accum_context=True)
    pearl.evaluator = MetaEvaluator(test_task_sampler=test_env_sampler,
                                    max_path_length=max_path_length,
                                    worker_class=PEARLWorker,
                                    worker_args=evaluation_worker_args,
                                    n_test_tasks=num_test_tasks)

    runner.train(n_epochs=num_epochs, batch_size=batch_size)