Example 1
def run_metarl(env, envs, tasks, seed, log_dir):
    """Create metarl Tensorflow PPO model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    with LocalTFRunner(snapshot_config) as runner:
        policy = GaussianGRUPolicy(
            hidden_dims=hyper_parameters['hidden_sizes'],
            env_spec=env.spec,
            state_include_action=False)

        baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec)

        inner_algo = RL2TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=hyper_parameters['max_path_length'] *
            hyper_parameters['rollout_per_task'],
            discount=hyper_parameters['discount'],
            gae_lambda=hyper_parameters['gae_lambda'],
            lr_clip_range=hyper_parameters['lr_clip_range'],
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                base_eps=1e-5)))

        # Need to pass this if meta_batch_size < num_of_tasks
        task_names = list(ML10_ENVS['train'].keys())
        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=hyper_parameters['max_path_length'],
                   meta_batch_size=hyper_parameters['meta_batch_size'],
                   task_sampler=tasks,
                   task_names=None
                   if hyper_parameters['meta_batch_size'] >= len(task_names)
                   else task_names)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        text_log_file = osp.join(log_dir, 'debug.log')
        dowel_logger.add_output(dowel.TextOutput(text_log_file))
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(
            algo,
            envs,
            sampler_cls=hyper_parameters['sampler_cls'],
            n_workers=hyper_parameters['meta_batch_size'],
            worker_class=RL2Worker,
            sampler_args=dict(
                use_all_workers=hyper_parameters['use_all_workers']),
            worker_args=dict(
                n_paths_per_trial=hyper_parameters['rollout_per_task']))

        # meta evaluator
        ML_test_envs = [
            TaskIdWrapper(RL2Env(
                env(*ML10_ARGS['test'][task]['args'],
                    **ML10_ARGS['test'][task]['kwargs'])),
                          task_id=task_id,
                          task_name=task)
            for (task_id, (task, env)) in enumerate(ML10_ENVS['test'].items())
        ]
        test_tasks = task_sampler.EnvPoolSampler(ML_test_envs)
        test_tasks.grow_pool(hyper_parameters['n_test_tasks'])

        test_task_names = list(ML10_ENVS['test'].keys())

        runner.setup_meta_evaluator(
            test_task_sampler=test_tasks,
            n_exploration_traj=hyper_parameters['rollout_per_task'],
            n_test_rollouts=hyper_parameters['test_rollout_per_task'],
            n_test_tasks=hyper_parameters['n_test_tasks'],
            n_workers=hyper_parameters['n_test_tasks'],
            test_task_names=None
            if hyper_parameters['n_test_tasks'] >= len(test_task_names) else
            test_task_names)

        runner.train(n_epochs=hyper_parameters['n_itr'],
                     batch_size=hyper_parameters['meta_batch_size'] *
                     hyper_parameters['rollout_per_task'] *
                     hyper_parameters['max_path_length'])

        dowel_logger.remove_all()

        return tabular_log_file
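
The function above reads every setting from a module-level hyper_parameters dict that this excerpt does not show. Below is a minimal sketch covering only the keys that this example and Example 2 actually access; all values are illustrative placeholders, and the LocalSampler import path is an assumption rather than something confirmed by the excerpt.

from metarl.sampler import LocalSampler  # assumed import path for a sampler class

hyper_parameters = {
    'hidden_sizes': 64,            # GRU hidden dimension for the policy
    'max_path_length': 150,        # steps per rollout
    'rollout_per_task': 10,        # rollouts collected per task per meta-batch
    'discount': 0.99,
    'gae_lambda': 0.95,
    'lr_clip_range': 0.2,
    'meta_batch_size': 10,         # tasks sampled per meta-iteration
    'n_itr': 500,                  # training epochs
    'n_trials': 3,                 # independent seeds in the benchmark loop
    'n_test_tasks': 5,
    'test_rollout_per_task': 10,
    'use_all_workers': True,
    'sampler_cls': LocalSampler,   # any compatible metarl sampler class
}
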
Example 2
    def test_benchmark_rl2(self):  # pylint: disable=no-self-use
        """Compare benchmarks between metarl and baselines."""
        env_id = 'ML10'
        ML_train_envs = [
            TaskIdWrapper(RL2Env(
                env(*ML10_ARGS['train'][task]['args'],
                    **ML10_ARGS['train'][task]['kwargs'])),
                          task_id=task_id,
                          task_name=task)
            for (task_id, (task, env)) in enumerate(ML10_ENVS['train'].items())
        ]
        tasks = task_sampler.EnvPoolSampler(ML_train_envs)
        tasks.grow_pool(hyper_parameters['meta_batch_size'])
        envs = tasks.sample(hyper_parameters['meta_batch_size'])
        env = envs[0]()

        timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
        benchmark_dir = './data/local/benchmarks/rl2/%s/' % timestamp
        result_json = {}

        # Start main loop
        seeds = random.sample(range(100), hyper_parameters['n_trials'])
        task_dir = osp.join(benchmark_dir, env_id)
        metarl_tf_csvs = []

        for trial in range(hyper_parameters['n_trials']):
            seed = seeds[trial]
            trial_dir = task_dir + '/trial_%d_seed_%d' % (trial + 1, seed)
            metarl_tf_dir = trial_dir + '/metarl'

            with tf.Graph().as_default():
                env.reset()
                metarl_tf_csv = run_metarl(env, envs, tasks, seed,
                                           metarl_tf_dir)

            metarl_tf_csvs.append(metarl_tf_csv)

        # Note: parameters.txt is written into the last trial's metarl dir.
        with open(osp.join(metarl_tf_dir, 'parameters.txt'), 'w') as outfile:
            hyper_parameters_copy = copy.deepcopy(hyper_parameters)
            hyper_parameters_copy['sampler_cls'] = str(
                hyper_parameters_copy['sampler_cls'])
            json.dump(hyper_parameters_copy, outfile)

        g_x = 'TotalEnvSteps'
        g_ys = [
            'Evaluation/AverageReturn',
            'Evaluation/SuccessRate',
        ]

        for g_y in g_ys:
            plt_file = osp.join(
                benchmark_dir,
                '{}_benchmark_{}.png'.format(env_id, g_y.replace('/', '-')))
            Rh.relplot(g_csvs=metarl_tf_csvs,
                       b_csvs=None,
                       g_x=g_x,
                       g_y=g_y,
                       g_z='MetaRL',
                       b_x=None,
                       b_y=None,
                       b_z='ProMP',
                       trials=hyper_parameters['n_trials'],
                       seeds=seeds,
                       plt_file=plt_file,
                       env_id=env_id,
                       x_label=g_x,
                       y_label=g_y)
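
Examples 1 and 2 (and Example 3 below) also index into module-level ML10_ENVS and ML10_ARGS dicts taken from the Meta-World ML10 benchmark. Their real contents are not part of this excerpt; based only on how the snippets index them, they are assumed to map task names to environment classes and to per-task constructor arguments. The sketch below uses a placeholder class and a couple of task names purely to show the shape.

class _PlaceholderEnv:
    """Stands in for a real Meta-World environment class."""

    def __init__(self, *args, **kwargs):
        pass

# task name -> environment class
ML10_ENVS = {
    'train': {'reach-v1': _PlaceholderEnv, 'push-v1': _PlaceholderEnv},
    'test': {'door-close-v1': _PlaceholderEnv},
}

# task name -> positional and keyword arguments for the class constructor
ML10_ARGS = {
    'train': {'reach-v1': {'args': [], 'kwargs': {}},
              'push-v1': {'args': [], 'kwargs': {}}},
    'test': {'door-close-v1': {'args': [], 'kwargs': {}}},
}
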
Example 3
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    # create multi-task environment and sample tasks
    ML_train_envs = [
        TaskIdWrapper(MetaRLEnv(
            normalize(
                env(*ML10_ARGS['train'][task]['args'],
                    **ML10_ARGS['train'][task]['kwargs']))),
                      task_id=task_id,
                      task_name=task)
        for (task_id, (task, env)) in enumerate(ML10_ENVS['train'].items())
    ]

    ML_test_envs = [
        TaskIdWrapper(MetaRLEnv(
            normalize(
                env(*ML10_ARGS['test'][task]['args'],
                    **ML10_ARGS['test'][task]['kwargs']))),
                      task_id=task_id,
                      task_name=task)
        for (task_id, (task, env)) in enumerate(ML10_ENVS['test'].items())
    ]

    train_task_names = ML10.get_train_tasks()._task_names
    test_task_names = ML10.get_test_tasks()._task_names

    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(params['num_train_tasks'])
    test_env_sampler = EnvPoolSampler(ML_test_envs)
    test_env = test_env_sampler.sample(params['num_test_tasks'])

    runner = LocalRunner(snapshot_config)
    obs_dim = int(np.prod(env[0]().observation_space.shape))
    action_dim = int(np.prod(env[0]().action_space.shape))
    reward_dim = 1

    # instantiate networks
    encoder_in_dim = obs_dim + action_dim + reward_dim
    encoder_out_dim = params['latent_size'] * 2
    net_size = params['net_size']

    context_encoder = MLPEncoder(input_dim=encoder_in_dim,
                                 output_dim=encoder_out_dim,
                                 hidden_sizes=[200, 200, 200])

    space_a = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + params['latent_size'], ),
                       dtype=np.float32)
    space_b = akro.Box(low=-1, high=1, shape=(action_dim, ), dtype=np.float32)
    augmented_env = EnvSpec(space_a, space_b)

    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    obs_space = akro.Box(low=-1, high=1, shape=(obs_dim, ), dtype=np.float32)
    action_space = akro.Box(low=-1,
                            high=1,
                            shape=(params['latent_size'], ),
                            dtype=np.float32)
    vf_env = EnvSpec(obs_space, action_space)

    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    policy = TanhGaussianMLPPolicy2(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=params['latent_size'],
        context_encoder=context_encoder,
        policy=policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    pearlsac = PEARLSAC(
        env=env,
        test_env=test_env,
        policy=context_conditioned_policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        num_evals=params['num_evals'],
        num_steps_per_eval=params['num_steps_per_eval'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
        train_task_names=train_task_names,
        test_task_names=test_task_names,
    )

    tu.set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearlsac.to()

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))
    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
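
run_task likewise pulls every hyperparameter from a module-level params dict that the excerpt omits. A sketch listing only the keys the function actually reads, with placeholder values loosely in the range of common PEARL configurations (none of them confirmed by this excerpt), could look like this:

params = {
    'num_train_tasks': 10,
    'num_test_tasks': 5,
    'latent_size': 7,                    # dimension of the latent context
    'net_size': 300,                     # hidden width of the Q, V and policy MLPs
    'use_information_bottleneck': True,
    'use_next_obs_in_context': False,
    'meta_batch_size': 16,
    'num_steps_per_epoch': 4000,
    'num_initial_steps': 4000,
    'num_tasks_sample': 15,
    'num_steps_prior': 750,
    'num_extra_rl_steps_posterior': 750,
    'num_evals': 5,
    'num_steps_per_eval': 450,
    'batch_size': 256,
    'embedding_batch_size': 64,
    'embedding_mini_batch_size': 64,
    'max_path_length': 150,
    'reward_scale': 10.0,
    'use_gpu': False,
    'num_epochs': 1000,
}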