Example 1
def run_task(*_):
    env = TfEnv(normalize(CartpoleEnv()))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = InstrumentedTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,
        max_path_length=100,
        n_itr=4,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=True,
    )
    algo.train()
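
This snippet targets the pre-2020 garage TF API. A minimal sketch of the imports it appears to assume (module paths moved between garage releases, so treat these as assumptions to verify against your version):

import tensorflow as tf
from garage.envs import normalize
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy
from garage.tf.baselines import GaussianMLPBaseline
# CartpoleEnv shipped under garage's box2d envs in old releases, and
# InstrumentedTRPO is a test fixture, not part of the public API.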
Example 2
def trpo_swimmer(ctxt=None, seed=1, batch_size=4000):
    """Train TRPO with Swimmer-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = GymEnv('Swimmer-v2')

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_episode_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=batch_size)
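
In the newer garage API, a function with this (ctxt, seed, ...) signature is normally decorated with @wrap_experiment and then called directly. A minimal usage sketch, assuming a post-2020 garage release:

from garage import wrap_experiment

# Usually written as a decorator (@wrap_experiment) above the function.
trpo_swimmer = wrap_experiment(trpo_swimmer)
trpo_swimmer(seed=1, batch_size=4000)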
Example 3
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(64, 64),
                               init_std=20,
                               std_share_network=False,
                               adaptive_std=True)
    baseline = GaussianMLPBaseline(env_spec=env.spec, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=50,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
        use_mpc_es=True,
    )
    algo.train(sess=sess)
Example 4
    def test_npo_pendulum(self):
        """Test NPO with Pendulum environment."""
        with LocalRunner(self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = NPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=100,
                       discount=0.99,
                       gae_lambda=0.98,
                       policy_ent_coeff=0.0)
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 20

            env.close()
Example 5
    def test_dm_control_tf_policy(self):
        task = ALL_TASKS[0]

        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(DmControlEnv.from_suite(*task))

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=5,
                discount=0.99,
                max_kl_step=0.01,
            )

            runner.setup(algo, env)
            runner.train(n_epochs=1, batch_size=10)

            env.close()
Example 6
    def test_ppo_pendulum(self):
        """Test PPO with Pendulum environment."""
        logger._tensorboard = TensorBoardOutput()
        env = TfEnv(normalize(gym.make("Pendulum-v0")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            gae_lambda=0.98,
            policy_ent_coeff=0.0,
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > -1000
Example 7
    def test_gaussian_mlp_policy(self):
        gaussian_mlp_policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                                hidden_sizes=(1, ))
        self.sess.run(tf.global_variables_initializer())

        obs = self.env.observation_space.high
        assert gaussian_mlp_policy.get_action(obs)
Example 8
def run_task(v):
    env = TfEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # plot=True enables live plotting during training.
        plot=True,
    )
    algo.train()
Example 9
def run_task(*_):
    env = TfEnv(
        normalize(
            OneHotMultiTaskEnv(task_env_cls=PR2ArmClockEnv,
                               task_args=TASK_ARGS,
                               task_kwargs=TASK_KWARGS)))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400000000,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
Example 10
    def test_trpo_pendulum(self):
        """Test TRPO with Pendulum environment."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            gae_lambda=0.98,
            policy_ent_coeff=0.0,
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 50

        env.close()
Example 11
def run_task(snapshot_config, *_):
    """Run the job.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=200, batch_size=4000)
Example 12
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(
            normalize(
                OneHotMultiTaskEnv(task_env_cls=PointEnv,
                                   task_args=TASK_ARGS,
                                   task_kwargs=TASK_KWARGS)))

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
Example 13
def run_task(v):
    v = SimpleNamespace(**v)

    with LocalRunner() as runner:
        # Environment
        env = SimpleReacherEnv(goal_position=GOALS[0],
                               control_method="position_control",
                               completion_bonus=5)

        env = TfEnv(env)

        # Policy
        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            init_std=v.policy_init_std,
        )

        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=v.max_path_length,
            discount=0.99,
            lr_clip_range=0.2,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=v.batch_size, plot=False)
Example 14
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    snapshot = joblib.load(latent_policy_pkl)
    latent_policy = snapshot["policy"]
    inner_env = SimpleReacherEnv(goal_position=(0.65, 0.3, 0.3),
                                 control_method="position_control",
                                 completion_bonus=30)

    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        init_std=20,
        # std_share_network=False,
        # adaptive_std=True
    )
    baseline = GaussianMLPBaseline(env_spec=env.spec, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=100,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
    )
    algo.train(sess=sess)
Example 15
    def test_is_pickleable(self, obs_dim, action_dim):
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = GaussianMLPPolicy(env_spec=env.spec)

        obs = env.reset()

        with tf.compat.v1.variable_scope('GaussianMLPPolicy', reuse=True):
            bias = tf.compat.v1.get_variable(
                'dist_params/mean_network/hidden_0/bias')
        # assign it to all one
        bias.load(tf.ones_like(bias).eval())

        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy.build(state_input, name='dist_sym').dist
        output1 = self.sess.run([dist_sym.loc, dist_sym.stddev()],
                                feed_dict={state_input: [[obs.flatten()]]})

        p = pickle.dumps(policy)
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            state_input = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, None,
                                                          policy.input_dim))
            dist_sym = policy_pickled.build(state_input, name='dist_sym').dist
            output2 = sess.run([dist_sym.loc, dist_sym.stddev()],
                               feed_dict={state_input: [[obs.flatten()]]})
            assert np.array_equal(output1, output2)
Example 16
def run_task(*_):

    env = TfEnv(
        normalize(
            MinibotEnv(
                use_maps=[0, 1],  # alternatively 'all'
                discretized=True)))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=5000,
                max_path_length=100,
                n_itr=15,
                discount=0.99,
                step_size=0.01,
                plot=plot,  # 'plot' is assumed to be defined at module level
                pause_for_plot=False)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        algo.train(sess=session)
Example 17
def run_task(*_):
    """
    Wrap PPO training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=488,
        discount=0.99,
        step_size=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False)
    algo.train()
Example 18
def run_task(*_):
    with LocalRunner() as runner:
        env = PointEnv(goal=(3, 3), random_start=True)
        env = TfEnv(env)

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(64, 64),
                                   init_std=20,
                                   std_share_network=False,
                                   adaptive_std=True)

        baseline = GaussianMLPBaseline(env_spec=env.spec,
                                       include_action_to_input=False)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,  # 4096
            max_path_length=50,
            n_itr=1500,
            discount=0.99,
            step_size=0.2,
            policy_ent_coeff=1e-6,
            use_mpc_es=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=1024, plot=True)
Example 19
def trpo_garage_tf(ctxt, env_id, seed):
    """Create garage Tensorflow TROI model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])
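
trpo_garage_tf reads its settings from a module-level hyper_parameters dict that is not shown here. An illustrative sketch of its expected shape, with placeholder values rather than the benchmark's actual ones:

# Placeholder values only; the real benchmark defines its own.
hyper_parameters = {
    'hidden_sizes': (32, 32),
    'discount': 0.99,
    'gae_lambda': 0.97,
    'max_kl': 0.01,
    'n_epochs': 999,
    'batch_size': 1024,
}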
Example 20
def trpois_inverted_pendulum(ctxt=None, seed=1):
    """Train TRPO on InvertedPendulum-v2 with importance sampling.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=200, batch_size=4000)
Example 21
    def test_trpo_unknown_kl_constraint(self):
        """Test TRPO with unkown KL constraints."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        with self.assertRaises(NotImplementedError) as context:
            TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=2048,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                gae_lambda=0.98,
                policy_ent_coeff=0.0,
                plot=False,
                kl_constraint="random kl_constraint",
            )
        assert "Unknown KLConstraint" in str(context.exception)

        env.close()
Example 22
def run_task(vv):
    with LocalRunner() as runner:
        env = TfEnv(normalize(gym.make('HalfCheetah-v1')))

        policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=(32, 32),
                                   name="policy")

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            step_size=vv["step_size"],
        )

        runner.setup(algo=algo, env=env)

        runner.train(
            n_epochs=40,
            batch_size=4000,
            # Uncomment to enable plotting
            # plot=True
        )
Example 23
    def test_tnpg_inverted_pendulum(self):
        """Test TNPG with InvertedPendulum-v2 environment."""
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = normalize(GymEnv('InvertedPendulum-v2'))

            policy = GaussianMLPPolicy(name='policy',
                                       env_spec=env.spec,
                                       hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            sampler = LocalSampler(
                agents=policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True)

            algo = TNPG(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        sampler=sampler,
                        discount=0.99,
                        optimizer_args=dict(reg_coeff=5e-1))

            trainer.setup(algo, env)

            last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
            assert last_avg_ret > 15

            env.close()
Example 24
    def test_dm_control_tf_policy(self):
        task = ALL_TASKS[0]

        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = DMControlEnv.from_suite(*task)

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                discount=0.99,
                max_kl_step=0.01,
            )

            trainer.setup(algo, env, sampler_cls=LocalSampler)
            trainer.train(n_epochs=1, batch_size=10)

            env.close()
Example 25
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=RaySamplerTF,
                     sampler_args={'seed': seed})  # 'seed' is assumed to be defined at module level
        runner.train(n_epochs=40, batch_size=4000)
Example 26
    def test_process_samples_continuous_non_recurrent(self):
        env = TfEnv(DummyBoxEnv())
        policy = GaussianMLPPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(env_spec=env.spec)
        max_path_length = 100
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            algo = BatchPolopt2(env_spec=env.spec,
                                policy=policy,
                                baseline=baseline,
                                max_path_length=max_path_length,
                                flatten_input=True)
            runner.setup(algo, env, sampler_args=dict(n_envs=1))
            runner.train(n_epochs=1, batch_size=max_path_length)
            paths = runner.obtain_samples(0)
            samples = algo.process_samples(0, paths)
            # Since there is only one vec_env in the sampler and DummyBoxEnv
            # never terminates before max_path_length, the batch size must
            # equal max_path_length, i.e. 100.
            assert samples['observations'].shape == (
                max_path_length, env.observation_space.flat_dim)
            assert samples['actions'].shape == (max_path_length,
                                                env.action_space.flat_dim)
            assert samples['rewards'].shape == (max_path_length, )
            assert samples['baselines'].shape == (max_path_length, )
            assert samples['returns'].shape == (max_path_length, )
            # There is only one path.
            assert samples['lengths'].shape == (1, )
            # A non-recurrent policy has empty agent info.
            assert samples['agent_infos'] == {}
            # DummyBoxEnv emits the env_info key 'dummy'.
            assert samples['env_infos']['dummy'].shape == (max_path_length, )
            assert isinstance(samples['average_return'], float)
Example 27
    def test_ppo_pendulum_continuous_baseline(self):
        """Test PPO with InvertedDoublePendulum environment."""
        with LocalRunner(self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )
            baseline = ContinuousMLPBaselineWithModel(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=100,
                       discount=0.99,
                       lr_clip_range=0.01,
                       optimizer_args=dict(batch_size=32, max_epochs=10))
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 30

            env.close()
Example 28
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
Example 29
def test_meta_evaluator_with_tf():
    set_seed(100)
    tasks = SetTaskSampler(PointEnv, wrapper=set_length)
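    # 'set_length' and 'MockAlgo' are test helpers defined elsewhere in this
    # test module (not shown here).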
    max_episode_length = 200
    env = PointEnv()
    n_eps = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        ctxt = SnapshotConfig(snapshot_dir=log_dir_name,
                              snapshot_mode='none',
                              snapshot_gap=1)
        with TFTrainer(ctxt) as trainer:
            meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                      n_test_tasks=10,
                                      n_exploration_eps=n_eps)
            policy = GaussianMLPPolicy(env.spec)
            algo = MockAlgo(env, policy, max_episode_length, n_eps, meta_eval)
            trainer.setup(algo, env)
            log_file = tempfile.NamedTemporaryFile()
            csv_output = CsvOutput(log_file.name)
            logger.add_output(csv_output)
            meta_eval.evaluate(algo)
            algo_pickle = cloudpickle.dumps(algo)
        tf.compat.v1.reset_default_graph()
        with TFTrainer(ctxt) as trainer:
            algo2 = cloudpickle.loads(algo_pickle)
            trainer.setup(algo2, env)
            trainer.train(10, 0)
Example 30
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.

        _ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
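
Entry points with the (snapshot_config, *_) signature, like run_task above, were launched through garage's experiment runner in the 2019-era API. A minimal sketch, assuming that release's garage.experiment.run_experiment helper:

from garage.experiment import run_experiment

# Launch run_task with snapshotting of the last iteration and a fixed seed.
run_experiment(run_task, snapshot_mode='last', seed=1)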