def test_env_pool_sampler():
    """Exercise ``EnvPoolSampler`` with the medium-mode training environments.

    The test checks that the sampler reports ten tasks, that a sample of ten
    hands back every constructed environment, that two invalid sampling
    requests raise ``ValueError``, and that a sample of twenty succeeds after
    the pool has been grown to twenty.
    """
    # Import, construct environments here to avoid using up too much
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    from metaworld.envs.mujoco.env_dict import (MEDIUM_MODE_ARGS_KWARGS,
                                                MEDIUM_MODE_CLS_DICT)

    train_specs = MEDIUM_MODE_ARGS_KWARGS['train']
    pool = []
    for task_name, env_cls in MEDIUM_MODE_CLS_DICT['train'].items():
        spec = train_specs[task_name]
        pool.append(env_cls(*spec['args'], **spec['kwargs']))

    sampler = task_sampler.EnvPoolSampler(pool)
    assert sampler.n_tasks == 10

    # Every environment in the pool must be returned by one of the updates.
    env_updates = sampler.sample(10)
    for expected_env in pool:
        assert any(expected_env is make_env() for make_env in env_updates)

    # Both of these requests are expected to be rejected.
    with pytest.raises(ValueError):
        sampler.sample(10, with_replacement=True)
    with pytest.raises(ValueError):
        sampler.sample(11)

    # After growing the pool, a larger sample must not raise.
    sampler.grow_pool(20)
    sampler.sample(20)
def test_benchmark_rl2(self):  # pylint: disable=no-self-use
    """Run the RL2 benchmark on ML45 with metarl and plot the results.

    For each of ``hyper_parameters['n_trials']`` randomly drawn seeds this
    trains through ``run_metarl``, collects the returned progress CSV path and
    dumps the hyper-parameters to ``parameters.txt`` in the trial directory.
    Afterwards one plot per metric is rendered with ``Rh.relplot``.
    """
    # test set has a higher max_obs_dim
    max_obs_dim = max(env_cls().observation_space.shape[0]
                      for env_cls in ML45_ENVS['test'].values())
    env_id = 'ML45'

    # Wrap every training environment: RL2Env -> NormalizedRewardEnv ->
    # TaskIdWrapper, tagging each with its enumeration index and task name.
    train_specs = ML45_ARGS['train']
    wrapped_train_envs = []
    for task_id, (task, env_cls) in enumerate(ML45_ENVS['train'].items()):
        raw_env = env_cls(*train_specs[task]['args'],
                          **train_specs[task]['kwargs'])
        rl2_env = RL2Env(raw_env, max_obs_dim)
        wrapped_train_envs.append(
            TaskIdWrapper(NormalizedRewardEnv(rl2_env),
                          task_id=task_id,
                          task_name=task))

    tasks = task_sampler.EnvPoolSampler(wrapped_train_envs)
    tasks.grow_pool(hyper_parameters['meta_batch_size'])
    envs = tasks.sample(hyper_parameters['meta_batch_size'])
    env = envs[0]()

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = './data/local/benchmarks/rl2/%s/' % timestamp
    result_json = {}  # not read anywhere in this function

    # Start main loop
    seeds = random.sample(range(100), hyper_parameters['n_trials'])
    task_dir = osp.join(benchmark_dir, env_id)
    metarl_tf_csvs = []

    for trial_number, seed in enumerate(seeds, start=1):
        trial_dir = task_dir + '/trial_%d_seed_%d' % (trial_number, seed)
        metarl_tf_dir = trial_dir + '/metarl'

        # Each trial trains inside its own TensorFlow graph.
        with tf.Graph().as_default():
            env.reset()
            progress_csv = run_metarl(env, envs, tasks, seed, metarl_tf_dir)
        metarl_tf_csvs.append(progress_csv)

        # sampler_cls is converted to a string before dumping so that the
        # copy can be written as JSON.
        params_path = osp.join(metarl_tf_dir, 'parameters.txt')
        with open(params_path, 'w') as outfile:
            hyper_parameters_copy = copy.deepcopy(hyper_parameters)
            hyper_parameters_copy['sampler_cls'] = str(
                hyper_parameters_copy['sampler_cls'])
            json.dump(hyper_parameters_copy, outfile)

    g_x = 'TotalEnvSteps'
    metrics = (
        'Evaluation/AverageReturn',
        'Evaluation/SuccessRate',
    )
    for g_y in metrics:
        plot_name = '{}_benchmark_{}.png'.format(env_id,
                                                 g_y.replace('/', '-'))
        plt_file = osp.join(benchmark_dir, plot_name)
        Rh.relplot(g_csvs=metarl_tf_csvs,
                   b_csvs=None,
                   g_x=g_x,
                   g_y=g_y,
                   g_z='MetaRL',
                   b_x=None,
                   b_y=None,
                   b_z='ProMP',
                   trials=hyper_parameters['n_trials'],
                   seeds=seeds,
                   plt_file=plt_file,
                   env_id=env_id,
                   x_label=g_x,
                   y_label=g_y)
def run_metarl(env, envs, tasks, seed, log_dir):
    """Create metarl Tensorflow PPO model and training.

    Builds a GRU policy, a linear feature baseline and an ``RL2PPO`` inner
    algorithm from ``env.spec``, wraps them in ``RL2``, attaches dowel log
    outputs under ``log_dir``, sets up a meta evaluator on the ML45 test
    environments and trains for ``hyper_parameters['n_itr']`` epochs.

    Args:
        env: Environment whose ``spec`` is used to build the policy,
            baseline and inner algorithm.
        envs: Environments (or environment updates) passed to
            ``runner.setup``.
        tasks: Task sampler passed to ``RL2`` as ``task_sampler``.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    with LocalTFRunner(snapshot_config) as runner:
        policy = GaussianGRUPolicy(
            hidden_dims=hyper_parameters['hidden_sizes'],
            env_spec=env.spec,
            state_include_action=False)

        baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec)

        # The inner algorithm sees one "trial" as rollout_per_task
        # concatenated paths, hence the multiplied path length.
        inner_algo = RL2PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=(hyper_parameters['max_path_length'] *
                             hyper_parameters['rollout_per_task']),
            discount=hyper_parameters['discount'],
            gae_lambda=hyper_parameters['gae_lambda'],
            lr_clip_range=hyper_parameters['lr_clip_range'],
            optimizer_args=dict(
                max_epochs=hyper_parameters['optimizer_max_epochs'],
                tf_optimizer_args=dict(
                    learning_rate=hyper_parameters['optimizer_lr'],
                ),
            ))

        # Need to pass this if meta_batch_size < num_of_tasks
        task_names = list(ML45_ENVS['train'].keys())
        algo = RL2(
            policy=policy,
            inner_algo=inner_algo,
            max_path_length=hyper_parameters['max_path_length'],
            meta_batch_size=hyper_parameters['meta_batch_size'],
            task_sampler=tasks,
            task_names=(None if hyper_parameters['meta_batch_size'] >=
                        len(task_names) else task_names))

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        text_log_file = osp.join(log_dir, 'debug.log')

        # The logger is a shared module-level object, so its outputs must be
        # detached even when setup or training raises; otherwise they would
        # stay attached for whatever runs next in this process.
        try:
            dowel_logger.add_output(dowel.TextOutput(text_log_file))
            dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
            dowel_logger.add_output(dowel.StdOutput())
            dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

            runner.setup(
                algo,
                envs,
                sampler_cls=hyper_parameters['sampler_cls'],
                n_workers=hyper_parameters['meta_batch_size'],
                worker_class=RL2Worker,
                sampler_args=dict(
                    use_all_workers=hyper_parameters['use_all_workers']),
                worker_args=dict(
                    n_paths_per_trial=hyper_parameters['rollout_per_task']))

            # meta evaluator
            # Distinct loop names keep the ``env`` parameter unshadowed.
            max_obs_dim = max(
                env_cls().observation_space.shape[0]
                for env_cls in ML45_ENVS['test'].values())

            test_specs = ML45_ARGS['test']
            ML_test_envs = [
                TaskIdWrapper(
                    NormalizedRewardEnv(
                        RL2Env(
                            env_cls(*test_specs[task]['args'],
                                    **test_specs[task]['kwargs']),
                            max_obs_dim)),
                    task_id=task_id,
                    task_name=task)
                for (task_id,
                     (task, env_cls)) in enumerate(ML45_ENVS['test'].items())
            ]
            test_tasks = task_sampler.EnvPoolSampler(ML_test_envs)
            test_tasks.grow_pool(hyper_parameters['n_test_tasks'])

            test_task_names = list(ML45_ENVS['test'].keys())

            runner.setup_meta_evaluator(
                test_task_sampler=test_tasks,
                n_exploration_traj=hyper_parameters['rollout_per_task'],
                n_test_rollouts=hyper_parameters['test_rollout_per_task'],
                n_test_tasks=hyper_parameters['n_test_tasks'],
                n_workers=hyper_parameters['n_test_tasks'],
                test_task_names=(None if hyper_parameters['n_test_tasks'] >=
                                 len(test_task_names) else test_task_names))

            runner.train(n_epochs=hyper_parameters['n_itr'],
                         batch_size=(hyper_parameters['meta_batch_size'] *
                                     hyper_parameters['rollout_per_task'] *
                                     hyper_parameters['max_path_length']))
        finally:
            dowel_logger.remove_all()

        return tabular_log_file
def rl2_ppo_metaworld_ml10(ctxt, seed, max_path_length, meta_batch_size,
                           n_epochs, episode_per_task):
    """Train RL2 with PPO on the ML10 training tasks.

    One ``RL2Env`` is built per ML10 training task name, the environments are
    pooled in an ``EnvPoolSampler`` grown to ``meta_batch_size``, and an
    ``RL2PPO`` algorithm with a GRU policy is trained through
    ``LocalTFRunner``.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        train_task_names = mwb.ML10.get_train_tasks().all_task_names
        train_envs = []
        for name in train_task_names:
            train_envs.append(RL2Env(mwb.ML10.from_task(name)))

        tasks = task_sampler.EnvPoolSampler(train_envs)
        tasks.grow_pool(meta_batch_size)

        # The first environment's spec is used for policy, baseline and
        # algorithm alike.
        spec = train_envs[0].spec

        gru_policy = GaussianGRUPolicy(name='policy',
                                       hidden_dim=64,
                                       env_spec=spec,
                                       state_include_action=False)
        linear_baseline = LinearFeatureBaseline(env_spec=spec)

        # A trial is episode_per_task rollouts back to back, so the
        # algorithm's path length is the product of the two.
        trial_length = max_path_length * episode_per_task
        optimizer_settings = dict(
            batch_size=32,
            max_epochs=10,
        )
        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=spec,
                      policy=gru_policy,
                      baseline=linear_baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=optimizer_settings,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=trial_length)

        sampled_envs = tasks.sample(meta_batch_size)
        runner.setup(algo,
                     sampled_envs,
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        steps_per_epoch = (episode_per_task * max_path_length *
                           meta_batch_size)
        runner.train(n_epochs=n_epochs, batch_size=steps_per_epoch)