Code example #1
    def test_baseline(self):
        """Test the baseline initialization."""
        box_env = TfEnv(DummyBoxEnv())
        deterministic_mlp_baseline = ContinuousMLPBaseline(
            env_spec=box_env.spec)
        gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env.spec)

        self.sess.run(tf.compat.v1.global_variables_initializer())
        deterministic_mlp_baseline.get_param_values()
        gaussian_mlp_baseline.get_param_values()

        box_env.close()
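
Note: both baselines take an EnvSpec through env_spec (hence box_env.spec above); regressor_args, where used, is forwarded to the underlying regressor. A minimal construction sketch, assuming the same imports as the test above:

# Minimal sketch, assuming TfEnv, DummyBoxEnv and GaussianMLPBaseline
# are imported as in the test above.
env = TfEnv(DummyBoxEnv())
baseline = GaussianMLPBaseline(
    env_spec=env.spec,  # an EnvSpec, not the env itself
    regressor_args=dict(hidden_sizes=(32, 32)),
)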
Code example #2
 def test_get_params_internal(self, obs_dim):
     box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
     with mock.patch(('metarl.tf.baselines.'
                      'gaussian_mlp_baseline.'
                      'GaussianMLPRegressor'),
                     new=SimpleGaussianMLPRegressor):
         gmb = GaussianMLPBaseline(env_spec=box_env.spec,
                                   regressor_args=dict())
     params_internal = gmb.get_params_internal()
     trainable_params = tf.compat.v1.trainable_variables(
         scope='GaussianMLPBaseline')
     assert np.array_equal(params_internal, trainable_params)
Code example #3
File: mt50_trpo.py  Project: seba-1511/metarl
def trpo_mt50(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MultiEnvWrapper(MT50_envs, env_ids, sample_strategy=round_robin_strategy)

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=len(MT50_envs)*10*150)
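
The batch size above is presumably tasks x paths-per-task x path-length. A quick sanity check of that arithmetic, assuming MT50_envs holds the 50 MT50 tasks:

# Hypothetical sanity check of the batch-size arithmetic above.
n_tasks = 50             # assumed len(MT50_envs)
paths_per_task = 10      # assumed paths sampled per task per epoch
max_path_length = 150    # matches the TRPO argument above
assert n_tasks * paths_per_task * max_path_length == 75000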
Code example #4
 def test_process_samples_continuous_recurrent(self):
     env = TfEnv(DummyBoxEnv())
     policy = GaussianLSTMPolicy(env_spec=env.spec)
     baseline = GaussianMLPBaseline(env_spec=env.spec)
     max_path_length = 100
     with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
         algo = BatchPolopt2(env_spec=env.spec,
                             policy=policy,
                             baseline=baseline,
                             max_path_length=max_path_length,
                             flatten_input=True)
         runner.setup(algo, env, sampler_args=dict(n_envs=1))
         runner.train(n_epochs=1, batch_size=max_path_length)
         paths = runner.obtain_samples(0)
         samples = algo.process_samples(0, paths)
         # Since there is only 1 vec_env in the sampler and DummyBoxEnv
         # never terminates before max_path_length, the batch size
         # must be max_path_length, i.e. 100
         assert samples['observations'].shape == (
             max_path_length, env.observation_space.flat_dim)
         assert samples['actions'].shape == (max_path_length,
                                             env.action_space.flat_dim)
         assert samples['rewards'].shape == (max_path_length, )
         assert samples['baselines'].shape == (max_path_length, )
         assert samples['returns'].shape == (max_path_length, )
         # there is only 1 path
         assert samples['lengths'].shape == (1, )
         for key, shape in policy.state_info_specs:
             assert samples['agent_infos'][key].shape == (max_path_length,
                                                          np.prod(shape))
         # DummyBoxEnv has env_info dummy
         assert samples['env_infos']['dummy'].shape == (max_path_length, )
         assert isinstance(samples['average_return'], float)
Code example #5
 def test_ppo_pendulum_gru(self):
     """Test PPO with Pendulum environment and recurrent policy."""
     with LocalTFRunner(snapshot_config) as runner:
         env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
         gru_policy = GaussianGRUPolicy(env_spec=env.spec)
         baseline = GaussianMLPBaseline(
             env_spec=env.spec,
             regressor_args=dict(hidden_sizes=(32, 32)),
         )
         algo = PPO(
             env_spec=env.spec,
             policy=gru_policy,
             baseline=baseline,
             max_path_length=100,
             discount=0.99,
             gae_lambda=0.95,
             lr_clip_range=0.2,
             optimizer_args=dict(
                 batch_size=32,
                 max_epochs=10,
             ),
             stop_entropy_gradient=True,
             entropy_method='max',
             policy_ent_coeff=0.02,
             center_adv=False,
         )
         runner.setup(algo, env)
         last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
         assert last_avg_ret > 80
Code example #6
def trpo_ml1(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(64, 64), use_trust_region=False),
        )

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)

        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size

        print(f'epochs: {epochs}, batch_size: {batch_size}')

        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
Code example #7
    def test_fit(self, obs_dim):
        box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
        with mock.patch(('metarl.tf.baselines.'
                         'gaussian_mlp_baseline.'
                         'GaussianMLPRegressor'),
                        new=SimpleGaussianMLPRegressor):
            gmb = GaussianMLPBaseline(env_spec=box_env.spec)
        paths = [{
            'observations': [np.full(obs_dim, 1)],
            'returns': [1]
        }, {
            'observations': [np.full(obs_dim, 2)],
            'returns': [2]
        }]
        gmb.fit(paths)

        obs = {'observations': [np.full(obs_dim, 1), np.full(obs_dim, 2)]}
        prediction = gmb.predict(obs)
        assert np.array_equal(prediction, [1, 2])
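
The exact-equality assertion only holds because the test patches in SimpleGaussianMLPRegressor, a stub that memorizes the returns it was fit on; the real GaussianMLPRegressor returns learned estimates. A sketch of the same fit/predict cycle without the mock, under that assumption:

# Sketch: with the real regressor, predictions are learned estimates
# of the returns, not exact copies (requires an active
# tf.compat.v1.Session).
gmb = GaussianMLPBaseline(env_spec=box_env.spec)
gmb.fit(paths)  # paths: list of dicts with 'observations'/'returns'
preds = gmb.predict({'observations': [np.full(obs_dim, 1)]})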
Code example #8
def gaussian_gru_policy(ctxt, env_id, seed):
    """Create Gaussian GRU Policy on TF-PPO.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = GaussianGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    learning_rate=1e-3,
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
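
Two separate optimizer configurations appear in this example: the optimizer_args nested inside regressor_args configure the baseline's own FirstOrderOptimizer, while the top-level optimizer_args on PPO configure the policy optimizer. A sketch isolating just the baseline's wiring, under the same assumptions:

# Sketch: the baseline regressor's optimizer is independent of the
# optimizer PPO uses for the policy.
baseline = GaussianMLPBaseline(
    env_spec=env.spec,
    regressor_args=dict(
        optimizer=FirstOrderOptimizer,  # fits the value function
        optimizer_args=dict(batch_size=32, max_epochs=10),
    ),
)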
Code example #9
File: ppo_pendulum.py  Project: seba-1511/metarl
def tf_ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=True,
            ),
        )

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = RL2PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.002,
            center_adv=False,
        )

        runner.setup(algo, env)

        runner.train(n_epochs=120, batch_size=4096, plot=False)
Code example #10
 def setup_method(self):
     super().setup_method()
     self.env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     self.baseline = GaussianMLPBaseline(
         env_spec=self.env.spec,
         regressor_args=dict(hidden_sizes=(32, 32)),
     )
Code example #11
    def test_is_pickleable(self):
        box_env = MetaRLEnv(DummyBoxEnv(obs_dim=(1, )))
        with mock.patch(('metarl.tf.baselines.'
                         'gaussian_mlp_baseline.'
                         'GaussianMLPRegressor'),
                        new=SimpleGaussianMLPRegressor):
            gmb = GaussianMLPBaseline(env_spec=box_env.spec)
        obs = {'observations': [np.full(1, 1), np.full(1, 1)]}

        with tf.compat.v1.variable_scope('GaussianMLPBaseline', reuse=True):
            return_var = tf.compat.v1.get_variable(
                'SimpleGaussianMLPModel/return_var')
        return_var.load(1.0)

        prediction = gmb.predict(obs)

        h = pickle.dumps(gmb)

        with tf.compat.v1.Session(graph=tf.Graph()):
            gmb_pickled = pickle.loads(h)
            prediction2 = gmb_pickled.predict(obs)

            assert np.array_equal(prediction, prediction2)
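
Pickling serializes the baseline's parameters so they can be restored into a fresh graph, which is what the runner's snapshotter relies on when saving training state. A minimal round-trip sketch under the same assumptions as the test:

# Sketch: a pickle round-trip rebuilds the variables in a new graph.
blob = pickle.dumps(gmb)
with tf.compat.v1.Session(graph=tf.Graph()):
    restored = pickle.loads(blob)
    # restored.predict(...) now matches the original's predictions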
Code example #12
File: ml1_all_ppo.py  Project: seba-1511/metarl
def ppo_ml1(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
            hidden_w_init=tf.constant_initializer(np.sqrt(2)),
            hidden_b_init=tf.constant_initializer(np.sqrt(2)),
        )

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                hidden_w_init=tf.constant_initializer(np.sqrt(2)),
                hidden_b_init=tf.constant_initializer(np.sqrt(2)),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=150,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=30,
                max_epochs=4,
                tf_optimizer_args=dict(learning_rate=3e-4, ),
            ),
        )

        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size

        print(f'epochs: {epochs}, batch_size: {batch_size}')

        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
Code example #13
    def run_task(self, snapshot_config, *_):
        config = tf.compat.v1.ConfigProto(device_count={'GPU': 0},
                                          allow_soft_placement=True,
                                          intra_op_parallelism_threads=12,
                                          inter_op_parallelism_threads=12)
        sess = tf.compat.v1.Session(config=config)
        with LocalTFRunner(snapshot_config=snapshot_config,
                           sess=sess) as runner:
            env = gym.make(self._env)
            env = TfEnv(normalize(env))
            env.reset()
            policy = GaussianGRUPolicy(
                env_spec=env.spec,
                hidden_dim=32,
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )

            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(
                    hidden_sizes=(64, 64),
                    use_trust_region=False,
                    optimizer=FirstOrderOptimizer,
                    optimizer_args=dict(
                        batch_size=32,
                        max_epochs=10,
                        tf_optimizer_args=dict(learning_rate=1e-3),
                    ),
                ),
            )

            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                policy_ent_coeff=0.0,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            )
            runner.setup(algo, env, sampler_args=dict(n_envs=12))
            runner.train(n_epochs=5, batch_size=2048)
Code example #14
    def test_param_values(self, obs_dim):
        box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
        with mock.patch(('metarl.tf.baselines.'
                         'gaussian_mlp_baseline.'
                         'GaussianMLPRegressor'),
                        new=SimpleGaussianMLPRegressor):
            gmb = GaussianMLPBaseline(env_spec=box_env.spec)
            new_gmb = GaussianMLPBaseline(env_spec=box_env.spec,
                                          name='GaussianMLPBaseline2')

        # Manually change a parameter of GaussianMLPBaseline
        with tf.compat.v1.variable_scope('GaussianMLPBaseline', reuse=True):
            return_var = tf.compat.v1.get_variable(
                'SimpleGaussianMLPModel/return_var')
        return_var.load(1.0)

        old_param_values = gmb.get_param_values()
        new_param_values = new_gmb.get_param_values()
        assert not np.array_equal(old_param_values, new_param_values)
        new_gmb.set_param_values(old_param_values)
        new_param_values = new_gmb.get_param_values()
        assert np.array_equal(old_param_values, new_param_values)
Code example #15
def ppo_mt10_sampling(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MultiEnvSamplingWrapper(MT10_envs, env_ids, len(env_ids)-skip_size, sample_strategy=round_robin_strategy)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=150,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(
                    learning_rate=3e-4,
                ),
            ),
        )

        batch_size = (len(env_ids)-skip_size)*10*150
        epochs = (total_steps//batch_size)+10

        print ("epochs:", epochs, "batch_size:", batch_size)

        runner.setup(algo, env)
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
Code example #16
def run_metarl_tf(env, seed, log_dir):
    """Create metarl TensorFlow PPO model and training.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=hyper_parameters['hidden_sizes'],
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=hyper_parameters['training_batch_size'],
                    max_epochs=hyper_parameters['training_epochs'],
                    tf_optimizer_args=dict(
                        learning_rate=hyper_parameters['learning_rate'], ),
                ),
            ),
        )

        algo = TF_PPO(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      max_path_length=hyper_parameters['max_path_length'],
                      discount=hyper_parameters['discount'],
                      gae_lambda=hyper_parameters['gae_lambda'],
                      center_adv=hyper_parameters['center_adv'],
                      policy_ent_coeff=hyper_parameters['policy_ent_coeff'],
                      lr_clip_range=hyper_parameters['lr_clip_range'],
                      optimizer_args=dict(
                          batch_size=hyper_parameters['training_batch_size'],
                          max_epochs=hyper_parameters['training_epochs'],
                          tf_optimizer_args=dict(
                              learning_rate=hyper_parameters['learning_rate'])))  # yapf: disable

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file