Code example #1
    def test_session(self):
        with LocalTFRunner(snapshot_config):
            assert tf.compat.v1.get_default_session() is not None, (
                'LocalTFRunner() should provide a default tf session.')

        sess = tf.compat.v1.Session()
        with LocalTFRunner(snapshot_config, sess=sess):
            assert tf.compat.v1.get_default_session() is sess, (
                'LocalTFRunner(sess) should use sess as default session.')
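The snippets in this listing are shown without their import blocks. A representative set is sketched below, assuming the metarl fork keeps garage's module layout (exact paths may differ between versions); each example additionally imports its algorithm, policy, and baseline classes from metarl.tf.algos, metarl.tf.policies, and metarl.np.baselines or metarl.tf.baselines.

import gym
import numpy as np
import tensorflow as tf

from metarl.envs import normalize
from metarl.experiment import LocalTFRunner
from metarl.tf.envs import TfEnv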
Code example #2
    def test_ppo_pendulum_gru(self):
        """Test PPO with InvertedDoublePendulum-v2 and a recurrent policy."""
        with LocalTFRunner(snapshot_config) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            gru_policy = GaussianGRUPolicy(env_spec=env.spec)
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env_spec=env.spec,
                policy=gru_policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                ),
                stop_entropy_gradient=True,
                entropy_method='max',
                policy_ent_coeff=0.02,
                center_adv=False,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 80
Code example #3
    def test_ppo_pendulum_flatten_input(self):
        """Test PPO with CartPole-v1 to exercise observation flattening."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(
                normalize(ReshapeObservation(gym.make('CartPole-v1'), (2, 2))))
            policy = CategoricalMLPPolicy(
                env_spec=env.spec,
                hidden_nonlinearity=tf.nn.tanh,
            )
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = PPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=100,
                       discount=0.99,
                       gae_lambda=0.95,
                       lr_clip_range=0.2,
                       policy_ent_coeff=0.0,
                       optimizer_args=dict(
                           batch_size=32,
                           max_epochs=10,
                           tf_optimizer_args=dict(learning_rate=1e-3),
                       ))
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 80
Code example #4
def run_task(snapshot_config, *_):
    """Run the job.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=200, batch_size=4000)
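Entry points like run_task above are normally handed to a launcher rather than called directly. A minimal launch sketch, assuming metarl keeps garage's run_experiment helper (argument names may vary by version):

from metarl.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',  # snapshot only the most recent epoch
    seed=1,
)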
Code example #5
    def test_cem_cartpole(self):
        """Test CEM with Cartpole-v1 environment."""
        with LocalTFRunner(snapshot_config) as runner:
            env = TfEnv(env_name='CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            n_samples = 10

            algo = CEM(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       best_frac=0.1,
                       max_path_length=100,
                       n_samples=n_samples)

            runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
            rtn = runner.train(n_epochs=10, batch_size=2048)
            assert rtn > 40

            env.close()
Code example #6
    def test_rl2_sampler_less_envs_than_meta_batch(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                       hidden_sizes=[32, 32])

            baseline = LinearFeatureBaseline(env_spec=self.env.spec)

            algo = PPO(env_spec=self.env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=self.max_path_length,
                       discount=0.99)
            runner.setup(algo,
                         env=self.env,
                         sampler_cls=RL2Sampler,
                         sampler_args=dict(
                             meta_batch_size=self.meta_batch_size,
                             n_envs=self.meta_batch_size // 2))
            runner._start_worker()
            assert isinstance(runner._sampler, RL2Sampler)
            assert runner._sampler._envs_per_worker == 1
            all_indices = np.arange(self.meta_batch_size)
            for i in range(self.meta_batch_size // 2):
                assert all(runner._sampler._vec_envs_indices[i] ==
                           all_indices[i * 2:i * 2 + 2])
            paths = runner._sampler.obtain_samples(0)
            assert len(paths) == self.meta_batch_size
            assert len(paths[0]['observations']) == self.max_path_length
            paths = runner._sampler.obtain_samples(
                0, self.meta_batch_size * 10 * self.max_path_length)
            assert len(paths) == self.meta_batch_size * 10
            assert len(paths[0]['observations']) == self.max_path_length
Code example #7
File: cma_es_cartpole.py, Project: seba-1511/metarl
def run_task(snapshot_config, *_):
    """Train CMA_ES with Cartpole-v1 environment.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        n_samples = 20

        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     max_path_length=100,
                     n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=100, batch_size=1000)
Code example #8
def trpo_ml1(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # env_id, get_ML1_envs_test and MTMetaWorldWrapper are defined in the
        # surrounding benchmark script.
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(64, 64), use_trust_region=False),
        )

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)

        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size

        print(f'epochs: {epochs}, batch_size: {batch_size}')

        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
Code example #9
File: mt50_trpo.py, Project: seba-1511/metarl
def trpo_mt50(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # MT50_envs and env_ids are defined in the surrounding script.
        env = MultiEnvWrapper(MT50_envs, env_ids,
                              sample_strategy=round_robin_strategy)

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=len(MT50_envs)*10*150)
Code example #10
File: multi_env_trpo.py, Project: seba-1511/metarl
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
Code example #11
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=RaySampler,
                     # 'seed' is a module-level constant in the source script
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
Code example #12
File: erwr_cartpole.py, Project: seba-1511/metarl
def erwr_cartpole(ctxt=None, seed=1):
    """Train with ERWR on CartPole-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)

        runner.setup(algo=algo, env=env)

        runner.train(n_epochs=100, batch_size=10000, plot=False)
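ctxt-style entry points such as erwr_cartpole receive their ExperimentContext from a decorator rather than from the caller. A sketch of the usual invocation, assuming metarl exposes garage's wrap_experiment decorator at the package top level:

from metarl import wrap_experiment

@wrap_experiment
def erwr_cartpole(ctxt=None, seed=1):
    ...  # body as in the example above

erwr_cartpole(seed=1)  # the decorator supplies ctxt and a log directory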
Code example #13
    def test_process_samples_discrete_non_recurrent(self):
        env = TfEnv(DummyDiscreteEnv())
        policy = CategoricalMLPPolicy(env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        max_path_length = 100
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = BatchPolopt2(env_spec=env.spec,
                                policy=policy,
                                baseline=baseline,
                                max_path_length=max_path_length,
                                flatten_input=True)
            runner.setup(algo, env, sampler_args=dict(n_envs=1))
            runner.train(n_epochs=1, batch_size=max_path_length)
            paths = runner.obtain_samples(0)
            samples = algo.process_samples(0, paths)
            # Since there is only 1 vec_env in the sampler and
            # DummyDiscreteEnv always terminates, the number of paths and the
            # batch size must both be max_path_length, i.e. 100.
            assert samples['observations'].shape == (
                max_path_length, env.observation_space.flat_dim)
            assert samples['actions'].shape == (max_path_length,
                                                env.action_space.n)
            assert samples['rewards'].shape == (max_path_length, )
            assert samples['baselines'].shape == (max_path_length, )
            assert samples['returns'].shape == (max_path_length, )
            # there are 100 paths
            assert samples['lengths'].shape == (max_path_length, )
            # a non-recurrent policy has empty agent_infos
            assert samples['agent_infos'] == {}
            # DummyDiscreteEnv provides no env_infos
            assert samples['env_infos'] == {}
            assert isinstance(samples['average_return'], float)
Code example #14
def run_metarl(env, seed, log_dir):
    """Create the metarl model and run training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the tabular log file (progress.csv).
    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))
        # Set up params for DDPG ('params' is a module-level dict of
        # hyperparameters in the source script)
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=params['steps_per_epoch'],
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
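run_metarl returns the path to the CSV log so that a benchmark harness can compare trials afterwards. A hypothetical driver is sketched below; the environment, seed, log_dir, and every value in the module-level params dict are placeholders, not settings from the original benchmark.

import gym

params = {
    'sigma': 0.2,
    'policy_hidden_sizes': (64, 64),
    'qf_hidden_sizes': (64, 64),
    'replay_buffer_size': int(1e6),
    'n_rollout_steps': 100,
    'steps_per_epoch': 20,
    'policy_lr': 1e-4,
    'qf_lr': 1e-3,
    'tau': 1e-2,
    'n_train_steps': 50,
    'discount': 0.9,
    'n_epochs': 10,
}

csv_file = run_metarl(gym.make('InvertedDoublePendulum-v2'), seed=1,
                      log_dir='/tmp/ddpg_trial_1')
print('progress written to', csv_file)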
Code example #15
File: test_ddpg.py, Project: seba-1511/metarl
    def test_ddpg_double_pendulum(self):
        """Test DDPG with Pendulum environment."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
            action_noise = OUStrategy(env.spec, sigma=0.2)
            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e5),
                                               time_horizon=100)
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(5e3),
                exploration_strategy=action_noise,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=100)
            assert last_avg_ret > 60

            env.close()
Code example #16
    def test_process_samples_continuous_recurrent(self):
        env = TfEnv(DummyBoxEnv())
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(env_spec=env.spec)
        max_path_length = 100
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = BatchPolopt2(env_spec=env.spec,
                                policy=policy,
                                baseline=baseline,
                                max_path_length=max_path_length,
                                flatten_input=True)
            runner.setup(algo, env, sampler_args=dict(n_envs=1))
            runner.train(n_epochs=1, batch_size=max_path_length)
            paths = runner.obtain_samples(0)
            samples = algo.process_samples(0, paths)
            # Since there is only 1 vec_env in the sampler and DummyBoxEnv
            # never terminates before max_path_length, the batch size must be
            # max_path_length, i.e. 100.
            assert samples['observations'].shape == (
                max_path_length, env.observation_space.flat_dim)
            assert samples['actions'].shape == (max_path_length,
                                                env.action_space.flat_dim)
            assert samples['rewards'].shape == (max_path_length, )
            assert samples['baselines'].shape == (max_path_length, )
            assert samples['returns'].shape == (max_path_length, )
            # there is only 1 path
            assert samples['lengths'].shape == (1, )
            for key, shape in policy.state_info_specs:
                assert samples['agent_infos'][key].shape == (max_path_length,
                                                             np.prod(shape))
            # DummyBoxEnv emits a 'dummy' env_info
            assert samples['env_infos']['dummy'].shape == (max_path_length, )
            assert isinstance(samples['average_return'], float)
Code example #17
File: test_vpg.py, Project: seba-1511/metarl
    def test_vpg_cartpole(self):
        """Test VPG with CartPole-v1 environment."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(env_name='CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = VPG(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=100,
                       discount=0.99,
                       optimizer_args=dict(
                           tf_optimizer_args=dict(learning_rate=0.01, )))

            runner.setup(algo, env)

            last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
            assert last_avg_ret > 90

            env.close()
Code example #18
    def test_dm_control_tf_policy(self):
        task = ALL_TASKS[0]

        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(DmControlEnv.from_suite(*task))

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=5,
                discount=0.99,
                max_kl_step=0.01,
            )

            runner.setup(algo, env)
            runner.train(n_epochs=1, batch_size=10)

            env.close()
Code example #19
File: cem_cartpole.py, Project: seba-1511/metarl
def cem_cartpole(ctxt=None, seed=1):
    """Train CEM with Cartpole-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        n_samples = 20

        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.05,
                   max_path_length=100,
                   n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=100, batch_size=1000)
Code example #20
    def test_trpo_cnn_cubecrash(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('CubeCrash-v0')))

            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=(32, 64),
                                          conv_filter_sizes=(8, 4),
                                          conv_strides=(4, 2),
                                          conv_pad='VALID',
                                          hidden_sizes=(32, 32))

            baseline = GaussianCNNBaseline(env_spec=env.spec,
                                           regressor_args=dict(
                                               num_filters=(32, 64),
                                               filter_dims=(8, 4),
                                               strides=(4, 2),
                                               padding='VALID',
                                               hidden_sizes=(32, 32),
                                               use_trust_region=True))

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        gae_lambda=0.98,
                        max_kl_step=0.01,
                        policy_ent_coeff=0.0,
                        flatten_input=False)

            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > -0.9

            env.close()
Code example #21
def run_task(snapshot_config, *_):
    """Run task."""
    # n_envs and max_path_length are module-level constants in the source
    # script.
    with LocalTFRunner(snapshot_config=snapshot_config,
                       max_cpus=n_envs) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': n_envs})

        runner.train(n_epochs=100, batch_size=4000, plot=False)
Code example #22
def run_task(snapshot_config, *_):
    """Defines the main experiment routine.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer=ConjugateGradientOptimizer,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
Code example #23
def ppo_cmb(env, seed, log_dir):
    """Create test continuous mlp baseline on ppo.

    Args:
        env (gym_env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to the CSV file containing training results.

    """
    deterministic.set_seed(seed)
    # num_proc and the *_params dicts used below are module-level settings in
    # the source script.
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_proc,
                            inter_op_parallelism_threads=num_proc)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess,
                       max_cpus=num_proc) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=policy_params['policy_hidden_sizes'],
            hidden_nonlinearity=policy_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=baseline_params['regressor_args'],
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=algo_params['max_path_length'],
                   discount=algo_params['discount'],
                   gae_lambda=algo_params['gae_lambda'],
                   lr_clip_range=algo_params['lr_clip_range'],
                   entropy_method=algo_params['entropy_method'],
                   policy_ent_coeff=algo_params['policy_ent_coeff'],
                   optimizer_args=algo_params['optimizer_args'],
                   center_adv=algo_params['center_adv'],
                   stop_entropy_gradient=True)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=algo_params['n_envs']))
        runner.train(n_epochs=algo_params['n_epochs'],
                     batch_size=algo_params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
Code example #24
    def test_te_ppo(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = TEPPO(env_spec=self.env.spec,
                         policy=self.policy,
                         baseline=self.baseline,
                         inference=self.inference,
                         max_path_length=self.max_path_length,
                         discount=0.99,
                         lr_clip_range=0.2,
                         policy_ent_coeff=self.policy_ent_coeff,
                         encoder_ent_coeff=self.encoder_ent_coeff,
                         inference_ce_coeff=self.inference_ce_coeff,
                         entropy_method='max',
                         stop_entropy_gradient=True,
                         use_softplus_entropy=True,
                         optimizer_args=dict(
                             batch_size=32,
                             max_epochs=10,
                         ),
                         inference_optimizer_args=dict(
                             batch_size=32,
                             max_epochs=10,
                         ),
                         center_adv=True,
                         stop_ce_gradient=True)

            runner.setup(algo,
                         self.env,
                         sampler_cls=LocalSampler,
                         sampler_args=None,
                         worker_class=TaskEmbeddingWorker)
            runner.train(n_epochs=1, batch_size=self.batch_size, plot=False)
Code example #25
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100)
Code example #26
def run_task(snapshot_config, *_):
    """Wrap TD3 training task in the run_task function.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = GaussianStrategy(env.spec, max_sigma=0.1, min_sigma=0.1)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=250)

        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  policy_lr=1e-4,
                  qf_lr=1e-3,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  target_update_tau=1e-2,
                  steps_per_epoch=20,
                  n_train_steps=1,
                  smooth_return=False,
                  discount=0.99,
                  buffer_batch_size=100,
                  min_buffer_size=1e4,
                  exploration_strategy=action_noise,
                  policy_optimizer=tf.train.AdamOptimizer,
                  qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(td3, env)
        runner.train(n_epochs=500, batch_size=250)
Code example #27
File: ppo_pendulum.py, Project: seba-1511/metarl
def tf_ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=True,
            ),
        )

        # NOTE: when setting entropy_method to 'max', set center_adv to
        # False and stop the entropy gradient (stop_entropy_gradient=True).
        # See tf.algos.NPO for detailed documentation.
        algo = RL2PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.002,
            center_adv=False,
        )

        runner.setup(algo, env)

        runner.train(n_epochs=120, batch_size=4096, plot=False)
Code example #28
File: test_td3.py, Project: seba-1511/metarl
    def test_td3_pendulum(self):
        """Test TD3 with Pendulum environment."""
        with LocalTFRunner(snapshot_config) as runner:
            env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

            action_noise = GaussianStrategy(env.spec,
                                            max_sigma=0.1,
                                            min_sigma=0.1)

            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[400, 300],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)

            qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                        env_spec=env.spec,
                                        hidden_sizes=[400, 300],
                                        action_merge_layer=0,
                                        hidden_nonlinearity=tf.nn.relu)

            qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                         env_spec=env.spec,
                                         hidden_sizes=[400, 300],
                                         action_merge_layer=0,
                                         hidden_nonlinearity=tf.nn.relu)

            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=250)

            algo = TD3(env_spec=env.spec,
                       policy=policy,
                       policy_lr=1e-3,
                       qf_lr=1e-3,
                       qf=qf,
                       qf2=qf2,
                       replay_buffer=replay_buffer,
                       steps_per_epoch=20,
                       target_update_tau=0.005,
                       n_train_steps=50,
                       discount=0.99,
                       smooth_return=False,
                       min_buffer_size=int(1e4),
                       buffer_batch_size=100,
                       policy_weight_decay=0.001,
                       qf_weight_decay=0.001,
                       exploration_strategy=action_noise,
                       policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                       qf_optimizer=tf.compat.v1.train.AdamOptimizer)

            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=250)
            assert last_avg_ret > 400
Code example #29
    def test_dqn_cartpole_pickle(self):
        """Test DQN with CartPole environment."""
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            n_epochs = 10
            steps_per_epoch = 10
            sampler_batch_size = 500
            num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
            env = TfEnv(gym.make('CartPole-v0'))
            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e4),
                                               time_horizon=1)
            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
            policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
            epsilon_greedy_strategy = EpsilonGreedyStrategy(
                env_spec=env.spec,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.02,
                decay_ratio=0.1)
            algo = DQN(env_spec=env.spec,
                       policy=policy,
                       qf=qf,
                       exploration_strategy=epsilon_greedy_strategy,
                       replay_buffer=replay_buffer,
                       qf_lr=1e-4,
                       discount=1.0,
                       min_buffer_size=int(1e3),
                       double_q=False,
                       n_train_steps=500,
                       grad_norm_clipping=5.0,
                       steps_per_epoch=steps_per_epoch,
                       target_network_update_freq=1,
                       buffer_batch_size=32)
            runner.setup(algo, env)
            with tf.compat.v1.variable_scope(
                    'DiscreteMLPQFunction/MLPModel/mlp/hidden_0', reuse=True):
                bias = tf.compat.v1.get_variable('bias')
                # assign it to all one
                old_bias = tf.ones_like(bias).eval()
                bias.load(old_bias)
                h = pickle.dumps(algo)

            with tf.compat.v1.Session(graph=tf.Graph()):
                pickle.loads(h)
                with tf.compat.v1.variable_scope(
                        'DiscreteMLPQFunction/MLPModel/mlp/hidden_0',
                        reuse=True):
                    new_bias = tf.compat.v1.get_variable('bias')
                    new_bias = new_bias.eval()
                    assert np.array_equal(old_bias, new_bias)

            env.close()
Code example #30
def run_metarl(env, seed, log_dir):
    """Create the metarl model and run training.

    Replace the ppo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the tabular log file (progress.csv).
    """
    deterministic.set_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=488, batch_size=2048)
        dowel_logger.remove_all()

        return tabular_log_file