Example #1
from examples.point_env_randgoal import PointEnvRandGoal
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
#from rllab.baselines.zero_baseline import ZeroBaseline
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
#from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.algos.sensitive_vpg import SensitiveVPG  # import path assumed; not shown in the excerpt
from sandbox.rocky.tf.policies.sens_minimal_gauss_mlp_policy import SensitiveGaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv

import tensorflow as tf

stub(globals())

#env = TfEnv(normalize(PointEnv()))
env = TfEnv(normalize(PointEnvRandGoal()))
policy = SensitiveGaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    grad_step_size=1.0,
    hidden_nonlinearity=tf.nn.relu,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
#baseline = ZeroBaseline(env_spec=env.spec)
algo = SensitiveVPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=20,  # number of trajs per fast (inner) gradient update
    max_path_length=5,
    meta_batch_size=100,
    n_itr=100,
    use_sensitive=False,
    optimizer_args={'learning_rate': 1e-3},
    #plot=True,
)
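
Because stub(globals()) is in effect, the algo object above is only a stubbed call graph; a minimal sketch of how such a stubbed experiment is typically launched with rllab's run_experiment_lite follows (the n_parallel, snapshot_mode, seed, and exp_prefix values are illustrative, not taken from the original script):

run_experiment_lite(
    algo.train(),
    n_parallel=1,            # single sampler worker
    snapshot_mode="last",    # keep only the final snapshot
    seed=1,                  # illustrative seed
    exp_prefix="point_sensitive_vpg",  # hypothetical experiment name
)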
Example #2
# Imports are not shown in this excerpt; the paths below are assumed from the rllab / sensitive sandbox layout.
import tensorflow as tf

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.baselines.zero_baseline import ZeroBaseline
from rllab.envs.mujoco.half_cheetah_env_rand_direc import HalfCheetahEnvRandDirec
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub
from sandbox.rocky.tf.algos.sensitive_trpo import SensitiveTRPO
from sandbox.rocky.tf.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.sens_minimal_gauss_mlp_policy import SensitiveGaussianMLPPolicy

fast_batch_size = 20  # 10 works for [0.1, 0.2]; 20 doesn't improve much for [0, 0.2]
meta_batch_size = 20  # 10 also works but is much less stable; 20 is fairly stable; 40 is more stable
max_path_length = 500
num_grad_updates = 1
use_sensitive = True

# The sweep lists are not part of the excerpt; these values are illustrative placeholders.
fast_learning_rates = [0.1]   # inner-loop (adaptation) step size
learning_rates = [1e-2]       # outer-loop (meta) step size
baselines = ['linear']        # 'zero', 'linear', or anything else for a Gaussian MLP baseline

for fast_learning_rate in fast_learning_rates:
    for learning_rate in learning_rates:
        for bas in baselines:
            stub(globals())

            env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
            policy = SensitiveGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif bas == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            #algo = SensitiveVPG(
            algo = SensitiveTRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=fast_batch_size,  # number of trajs for grad update
                # The excerpt is cut off here; the remaining arguments are assumptions
                # that follow the pattern of the other examples.
                max_path_length=max_path_length,
                meta_batch_size=meta_batch_size,
                num_grad_updates=num_grad_updates,
                use_sensitive=use_sensitive,
                step_size=learning_rate,
            )
Example #3
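    # This excerpt starts inside a task-setup function; v is assumed to be the variant
    # dict supplied by rllab's instrument/variant tooling.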
    task_var = v['task_var']
    mask = v['mask']

    if task_var == 0:
        env = TfEnv(normalize(AntEnvRandDirec()))
        task_var = 'direc'
    elif task_var == 1:
        env = TfEnv(normalize(AntEnvRand()))
        task_var = 'vel'
    elif task_var == 2:
        env = TfEnv(normalize(AntEnvRandGoal()))
        task_var = 'pos'
    policy = SensitiveGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
        mask_units=mask,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = SensitiveTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v['fast_batch_size'],  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=v['meta_batch_size'],
        num_grad_updates=num_grad_updates,
        n_itr=800,
        use_sensitive=use_sensitive,