def _prepare_meta_env(env):
    """Wrap the environment for RL^2 and return a sampled env plus its task sampler."""
    if ML:
        if env_ind == 2:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('push-v1'),
                               random_init=False))
        elif env_ind == 3:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('reach-v1'),
                               random_init=False))
        elif env_ind == 4:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('pick-place-v1'),
                               random_init=False))
    else:
        task_samplers = task_sampler.SetTaskSampler(lambda: RL2Env(env()))
    return task_samplers.sample(1)[0](), task_samplers
def test_observation_dimension_with_max_obs_dim(self):
    env = PointEnv()
    wrapped_env = RL2Env(PointEnv(), max_obs_dim=10)
    assert (wrapped_env.spec.observation_space.shape[0] ==
            10 + env.action_space.shape[0] + 2)
    obs = wrapped_env.reset()
    assert 10 + env.action_space.shape[0] + 2 == obs.shape[0]
    obs, _, _, _ = wrapped_env.step(env.action_space.sample())
    assert 10 + env.action_space.shape[0] + 2 == obs.shape[0]
def test_observation_dimension(self):
    env = PointEnv()
    wrapped_env = RL2Env(PointEnv())
    assert wrapped_env.spec.observation_space.shape[0] == (
        env.observation_space.shape[0] + env.action_space.shape[0] + 2)
    obs = env.reset()
    obs2 = wrapped_env.reset()
    assert obs.shape[0] + env.action_space.shape[0] + 2 == obs2.shape[0]
    obs, _, _, _ = env.step(env.action_space.sample())
    obs2, _, _, _ = wrapped_env.step(env.action_space.sample())
    assert obs.shape[0] + env.action_space.shape[0] + 2 == obs2.shape[0]
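# A minimal, self-contained sketch of the augmentation convention the two
# tests above appear to rely on: RL2Env is assumed to append the previous
# action plus two scalar signals (reward and terminal flag) to each
# observation, which is where the `action_space.shape[0] + 2` extra
# dimensions come from. `augment_observation` is a hypothetical helper for
# illustration only, not part of RL2Env's API.
import numpy as np


def augment_observation(obs, prev_action, reward, done):
    """Concatenate obs with the previous action, reward, and terminal flag."""
    return np.concatenate([obs, prev_action, [reward], [float(done)]])


# A 3-dim observation with a 2-dim action yields a 3 + 2 + 2 = 7-dim input.
assert augment_observation(np.zeros(3), np.zeros(2), 0.0, False).shape[0] == 7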
def test_benchmark_rl2(self):  # pylint: disable=no-self-use
    """Compare benchmarks between metarl and baselines."""
    # The test set has a higher max_obs_dim, so pad all training
    # observations to that size.
    env_obs_dim = [
        env().observation_space.shape[0]
        for (_, env) in ML45_ENVS['test'].items()
    ]
    max_obs_dim = max(env_obs_dim)

    env_id = 'ML45'
    ML_train_envs = [
        TaskIdWrapper(NormalizedRewardEnv(
            RL2Env(env(*ML45_ARGS['train'][task]['args'],
                       **ML45_ARGS['train'][task]['kwargs']),
                   max_obs_dim)),
                      task_id=task_id,
                      task_name=task)
        for (task_id, (task, env)) in enumerate(ML45_ENVS['train'].items())
    ]
    tasks = task_sampler.EnvPoolSampler(ML_train_envs)
    tasks.grow_pool(hyper_parameters['meta_batch_size'])
    envs = tasks.sample(hyper_parameters['meta_batch_size'])
    env = envs[0]()

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = './data/local/benchmarks/rl2/%s/' % timestamp
    result_json = {}

    # Start main loop
    seeds = random.sample(range(100), hyper_parameters['n_trials'])
    task_dir = osp.join(benchmark_dir, env_id)
    metarl_tf_csvs = []

    for trial in range(hyper_parameters['n_trials']):
        seed = seeds[trial]
        trial_dir = task_dir + '/trial_%d_seed_%d' % (trial + 1, seed)
        metarl_tf_dir = trial_dir + '/metarl'

        with tf.Graph().as_default():
            env.reset()
            metarl_tf_csv = run_metarl(env, envs, tasks, seed, metarl_tf_dir)

        metarl_tf_csvs.append(metarl_tf_csv)

        with open(osp.join(metarl_tf_dir, 'parameters.txt'), 'w') as outfile:
            hyper_parameters_copy = copy.deepcopy(hyper_parameters)
            hyper_parameters_copy['sampler_cls'] = str(
                hyper_parameters_copy['sampler_cls'])
            json.dump(hyper_parameters_copy, outfile)

    g_x = 'TotalEnvSteps'
    g_ys = [
        'Evaluation/AverageReturn',
        'Evaluation/SuccessRate',
    ]

    for g_y in g_ys:
        plt_file = osp.join(
            benchmark_dir,
            '{}_benchmark_{}.png'.format(env_id, g_y.replace('/', '-')))
        Rh.relplot(g_csvs=metarl_tf_csvs,
                   b_csvs=None,
                   g_x=g_x,
                   g_y=g_y,
                   g_z='MetaRL',
                   b_x=None,
                   b_y=None,
                   b_z='ProMP',
                   trials=hyper_parameters['n_trials'],
                   seeds=seeds,
                   plt_file=plt_file,
                   env_id=env_id,
                   x_label=g_x,
                   y_label=g_y)
def run_metarl(env, envs, tasks, seed, log_dir):
    """Create a metarl TensorFlow RL2-PPO model and run training.

    Args:
        env (TaskIdWrapper): A sampled training environment used to build
            the policy and baseline specs.
        envs (list): Environments sampled for the meta batch.
        tasks (EnvPoolSampler): Training task sampler.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    with LocalTFRunner(snapshot_config) as runner:
        policy = GaussianGRUPolicy(
            hidden_dims=hyper_parameters['hidden_sizes'],
            env_spec=env.spec,
            state_include_action=False)

        baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec)

        inner_algo = RL2PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=hyper_parameters['max_path_length'] *
            hyper_parameters['rollout_per_task'],
            discount=hyper_parameters['discount'],
            gae_lambda=hyper_parameters['gae_lambda'],
            lr_clip_range=hyper_parameters['lr_clip_range'],
            optimizer_args=dict(
                max_epochs=hyper_parameters['optimizer_max_epochs'],
                tf_optimizer_args=dict(
                    learning_rate=hyper_parameters['optimizer_lr']),
            ))

        # Task names must be passed when meta_batch_size is smaller than
        # the number of tasks.
        task_names = list(ML45_ENVS['train'].keys())
        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=hyper_parameters['max_path_length'],
                   meta_batch_size=hyper_parameters['meta_batch_size'],
                   task_sampler=tasks,
                   task_names=None if hyper_parameters['meta_batch_size'] >=
                   len(task_names) else task_names)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        text_log_file = osp.join(log_dir, 'debug.log')
        dowel_logger.add_output(dowel.TextOutput(text_log_file))
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(
            algo,
            envs,
            sampler_cls=hyper_parameters['sampler_cls'],
            n_workers=hyper_parameters['meta_batch_size'],
            worker_class=RL2Worker,
            sampler_args=dict(
                use_all_workers=hyper_parameters['use_all_workers']),
            worker_args=dict(
                n_paths_per_trial=hyper_parameters['rollout_per_task']))

        # Meta evaluator: pad test observations to the largest test obs dim.
        env_obs_dim = [
            env().observation_space.shape[0]
            for (_, env) in ML45_ENVS['test'].items()
        ]
        max_obs_dim = max(env_obs_dim)
        ML_test_envs = [
            TaskIdWrapper(NormalizedRewardEnv(
                RL2Env(env(*ML45_ARGS['test'][task]['args'],
                           **ML45_ARGS['test'][task]['kwargs']),
                       max_obs_dim)),
                          task_id=task_id,
                          task_name=task)
            for (task_id, (task, env)) in enumerate(ML45_ENVS['test'].items())
        ]
        test_tasks = task_sampler.EnvPoolSampler(ML_test_envs)
        test_tasks.grow_pool(hyper_parameters['n_test_tasks'])

        test_task_names = list(ML45_ENVS['test'].keys())

        runner.setup_meta_evaluator(
            test_task_sampler=test_tasks,
            n_exploration_traj=hyper_parameters['rollout_per_task'],
            n_test_rollouts=hyper_parameters['test_rollout_per_task'],
            n_test_tasks=hyper_parameters['n_test_tasks'],
            n_workers=hyper_parameters['n_test_tasks'],
            test_task_names=None if hyper_parameters['n_test_tasks'] >=
            len(test_task_names) else test_task_names)

        runner.train(n_epochs=hyper_parameters['n_itr'],
                     batch_size=hyper_parameters['meta_batch_size'] *
                     hyper_parameters['rollout_per_task'] *
                     hyper_parameters['max_path_length'])

        dowel_logger.remove_all()

        return tabular_log_file
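# Illustrative arithmetic for the `batch_size` handed to `runner.train()`
# above: every epoch collects `rollout_per_task` episodes of length
# `max_path_length` for each of the `meta_batch_size` sampled tasks. The
# values below are hypothetical placeholders, not the benchmark's actual
# hyperparameters.
_meta_batch_size = 50
_rollout_per_task = 10
_max_path_length = 150
_batch_size = _meta_batch_size * _rollout_per_task * _max_path_length
assert _batch_size == 75000  # environment steps gathered per epoch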
def rl2_ppo_halfcheetah(ctxt=None, seed=1):
    """Train RL2-PPO on ML1 push-v1 (or, optionally, HalfCheetahVel).

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        # ---- For ML1-push
        from metaworld.benchmarks import ML1
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=ML1.get_train_tasks('push-v1')))

        # ---- For HalfCheetahVel
        # tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
        #     env=HalfCheetahVelEnv()))

        env_spec = tasks.sample(1)[0]().spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        inner_algo = RL2PPO(
            env_spec=env_spec,
            policy=policy,
            baseline=baseline,
            max_path_length=max_path_length * episode_per_task,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   task_sampler=tasks)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
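# Why the inner algorithm's max_path_length is max_path_length *
# episode_per_task above: RL^2 feeds the episode_per_task consecutive
# episodes of one task to the recurrent policy as a single "meta-episode",
# so the inner PPO update sees paths of up to 100 * 4 = 400 steps. A toy
# sketch of that concatenation with plain lists (not metarl code), using
# hypothetical sizes of 4 episodes of 3 steps each:
episodes = [['obs%d_%d' % (e, t) for t in range(3)] for e in range(4)]
meta_episode = [step for episode in episodes for step in episode]
assert len(meta_episode) == 4 * 3  # episode_per_task * max_path_length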