    def test_param_space_noise(self):
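        """With param_noise_std=0 the mean action should be identical across
        calls; turning on parameter-space noise (per call or via the
        param_noise_std attribute) should make repeated mean actions differ."""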
        env = TfEnv(normalize(PointEnvMAML()))
        obs = env.reset()

        policy = MAMLImprovedGaussianMLPPolicy(name="policy33",
                                               env_spec=env.spec,
                                               hidden_sizes=(16, 16),
                                               hidden_nonlinearity=tf.nn.tanh,
                                               param_noise_std=0.0)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        import rllab.misc.logger as logger

        logger.set_snapshot_dir('/tmp/')
        logger.set_snapshot_mode('last')

        algo = MAMLTRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=3,
            max_path_length=10,
            meta_batch_size=4,
            num_grad_updates=1,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )
        algo.train()

        tf.reset_default_graph()
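        # Clear the training graph and reload the snapshot written by the
        # logger above ('last' snapshot mode keeps only the final iteration).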
        pkl_file = os.path.join('/tmp/', 'params.pkl')
        with tf.Session() as sess:
            data = joblib.load(pkl_file)
            policy = data['policy']
            action_1 = policy.get_action(obs)[1]['mean']
            action_2 = policy.get_action(obs)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)

            self.assertAlmostEqual(diff, 0.0)

            action_1 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
            action_2 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)

            self.assertGreaterEqual(diff, 0.1)

            policy.param_noise_std = 1.0
            action_1 = policy.get_action(obs)[1]['mean']
            action_2 = policy.get_action(obs)[1]['mean']
            diff = np.sum((action_1 - action_2)**2)
            self.assertGreaterEqual(diff, 0.1)

    def test_serialization(self):
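        """A policy restored from a training snapshot should survive a pickle
        round-trip into a fresh TF graph and return the same mean action for
        the same observation."""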

        env = TfEnv(normalize(PointEnvMAML()))
        obs = env.reset()

        policy = MAMLImprovedGaussianMLPPolicy(
            name="policy56",
            env_spec=env.spec,
            hidden_sizes=(16, 16),
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        import rllab.misc.logger as logger

        logger.set_snapshot_dir('/tmp/')
        logger.set_snapshot_mode('last')

        algo = MAMLTRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2,
            max_path_length=10,
            meta_batch_size=4,
            num_grad_updates=1,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )
        algo.train()

        tf.reset_default_graph()
        pkl_file = os.path.join('/tmp/', 'params.pkl')
        with tf.Session() as sess:
            data = joblib.load(pkl_file)
            policy = data['policy']
            action_before = policy.get_action(obs)[1]['mean']

            dump_string = pickle.dumps(policy)

        tf.reset_default_graph()
        with tf.Session() as sess:
            policy_loaded = pickle.loads(dump_string)
            action_after = policy_loaded.get_action(obs)[1]['mean']

        diff = np.sum(np.abs(action_before - action_after))
        self.assertAlmostEqual(diff, 0.0, places=3)

    def test_get_mean_stepsize(self):
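        """With trainable_step_size, the per-parameter step sizes start at
        grad_step_size, so the mean step size right after initialization
        should be 0.7."""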

        env = TfEnv(normalize(PointEnvMAML()))
        obs = env.reset()

        policy = MAMLImprovedGaussianMLPPolicy(name="policy2",
                                               env_spec=env.spec,
                                               hidden_sizes=(16, 16),
                                               hidden_nonlinearity=tf.nn.tanh,
                                               trainable_step_size=True,
                                               grad_step_size=0.7)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            mean_stepsize_1 = policy.get_mean_step_size()

        self.assertAlmostEqual(mean_stepsize_1, 0.7, places=5)

initial_params_files = [
    initial_params_file1, initial_params_file3, None, initial_params_file4
]
gen_name = 'icml_point_results_'
names = ['maml', 'maml0', 'random', 'oracle']

exp_names = [gen_name + name for name in names]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)),
                                       initial_params_files):
    avg_returns = []
    for goal in goals:
        goal = list(goal)

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(PointEnvRandGoalOracle(goal=goal))
            n_itr = 1
        else:
            env = normalize(PointEnvRandGoal(goal=goal))
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )

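        # When a snapshot is given, discard the freshly built policy; the
        # saved policy is presumably restored from initial_params_file further
        # down in the script (not shown in this excerpt).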
        if initial_params_file is not None:
            policy = None

        baseline = LinearFeatureBaseline(env_spec=env.spec)
Example #5
from sandbox_maml.rocky.tf.algos.trpo import TRPO
from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab_maml.envs.box2d.cartpole_env import CartpoleEnv
from rllab_maml.envs.normalized_env import normalize
from sandbox_maml.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy
from sandbox_maml.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
from sandbox_maml.rocky.tf.envs.base import TfEnv
import sandbox_maml.rocky.tf.core.layers as L
from sandbox_maml.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab_maml.misc.instrument import stub, run_experiment_lite

stub(globals())
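# stub(globals()) replaces the imported classes with stub proxies, so the
# calls below only record their arguments; run_experiment_lite executes the
# recorded experiment later.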

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianLSTMPolicy(
    name="policy",
    env_spec=env.spec,
    lstm_layer_cls=L.TfBasicLSTMLayer,
    # gru_layer_cls=L.GRULayer,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
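    # The call is cut off above; a hedged completion in the spirit of the
    # stock rllab recurrent-cartpole launcher (the values below are
    # assumptions, not taken from this excerpt) closes the TRPO config and
    # launches it with the already-imported run_experiment_lite:
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(
        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
)

run_experiment_lite(
    algo.train(),
    n_parallel=4,          # assumed worker count
    snapshot_mode="last",
    seed=1,                # assumed seed
)
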
gen_name = 'icml_ant_results_'
names = ['maml', 'pretrain', 'random', 'oracle']
exp_names = [gen_name + name for name in names]

step_sizes = [0.1, 0.2, 1.0, 0.0]
initial_params_files = [file1, file2, None, file3]


all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []

    for goal in goals:

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(AntEnvOracle())
            n_itr = 1
        else:
            env = normalize(AntEnvRand())
            n_itr = 4
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None
gen_name = 'icml_cheetah_results_'
names = ['maml', 'pretrain', 'random', 'oracle']
exp_names = [gen_name + name for name in names]

step_sizes = [0.1, 0.02, 0.1, 0.0]
initial_params_files = [file1, file2, None, file3]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)),
                                       initial_params_files):
    avg_returns = []
    for goal in goals:

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(HalfCheetahEnvOracle())
            n_itr = 1
        else:
            env = normalize(HalfCheetahEnvRand())
            n_itr = 4
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None
Example #8
step_sizes = [0.1, 0.2, 1.0, 0.0]
initial_params_files = [file1, file2, None, file3]

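# The overrides below restrict this run to the randomly initialized baseline
# (no snapshot file, a single step size).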
names = ['random']
exp_names = [gen_name + name for name in names]
initial_params_files = [None]
step_sizes = [0.5]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)),
                                       initial_params_files):
    avg_returns = []
    for goal_i, goal in zip(range(len(goals)), goals):

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(HalfCheetahEnvDirecOracle())
            n_itr = 1
        else:
            env = normalize(HalfCheetahEnvRandDirec())
            n_itr = 4
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None
        return [2]


# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']

    if task_var == 0:
        env = TfEnv(normalize(AntEnvRandDirec()))
        task_var = 'direc'
    elif task_var == 1:
        env = TfEnv(normalize(AntEnvRand()))
        task_var = 'vel'
    elif task_var == 2:
        env = TfEnv(normalize(AntEnvRandGoal()))
        task_var = 'pos'
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )
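    # grad_step_size is MAML's inner-loop (fast adaptation) learning rate;
    # v['fast_lr'] is supplied by the variant generator above.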

# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    direc = v['direc']
    learning_rate = v['meta_step_size']

    if direc:
        env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
    else:
        env = TfEnv(normalize(HalfCheetahEnvRand()))
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
Example #11
baselines = ['linear']
fast_batch_size = 20  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]
meta_batch_size = 40  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
max_path_length = 100
num_grad_updates = 1
meta_step_size = 0.01
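# meta_step_size is the outer-loop (meta-update) step size used by the
# MAMLTRPO call below, while the fast learning rates iterated over further
# down set the inner-loop adaptation rate (grad_step_size) of the MAML policy.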

use_maml = True
interpreter_path = sys.executable

for fast_learning_rate in fast_learning_rates:
    for learning_rate in learning_rates:
        for bas in baselines:
            stub(globals())

            env = TfEnv(normalize(PointEnvRandGoal()))
            policy = MAMLGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif 'linear' in bas:
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = MAMLTRPO(
                env=env,
fast_learning_rates = [0.1]
baselines = ['linear']
fast_batch_size = 20
meta_batch_size = 60
max_path_length = 10
num_grad_updates = 1
meta_step_size = 0.01

use_maml = True

for fast_learning_rate in fast_learning_rates:
    for bas in baselines:
        stub(globals())

        env = TfEnv(normalize(GridWorldEnvRand('four-state')))
        policy = MAMLCategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=fast_learning_rate,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )
        if bas == 'zero':
            baseline = ZeroBaseline(env_spec=env.spec)
        elif 'linear' in bas:
            baseline = LinearFeatureBaseline(env_spec=env.spec)
        else:
            baseline = GaussianMLPBaseline(env_spec=env.spec)
        algo = MAMLTRPO(
            env=env,
from sandbox_maml.rocky.tf.algos.trpo import TRPO
from sandbox_maml.rocky.tf.algos.vpg import VPG
from sandbox_maml.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy
from sandbox_maml.rocky.tf.envs.base import TfEnv
from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
from maml_examples.point_env_randgoal_oracle import PointEnvRandGoalOracle
from rllab_maml.envs.normalized_env import normalize
from rllab_maml.misc.instrument import stub, run_experiment_lite

stub(globals())

import tensorflow as tf

env = normalize(PointEnvRandGoalOracle())

env = TfEnv(env)
policy = GaussianMLPPolicy(name='policy',
                           env_spec=env.spec,
                           hidden_nonlinearity=tf.nn.relu,
                           hidden_sizes=(100, 100))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=1000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
Example #14
variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']
    oracle = v['oracle']

    if task_var == 0:
        task_var = 'direc'
        exp_prefix = 'bugfix_trpo_maml_antdirec' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvDirecOracle()))
        else:
            env = TfEnv(normalize(AntEnvRandDirec()))
    elif task_var == 1:
        task_var = 'vel'
        exp_prefix = 'posticml_trpo_maml_ant' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvOracle()))
        else:
            env = TfEnv(normalize(AntEnvRand()))
    elif task_var == 2:
        task_var = 'pos'
        exp_prefix = 'posticml_trpo_maml_antpos_' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvRandGoalOracle()))
        else:
Example #15
gen_name = 'icml_antdirec_results_'
names = ['maml', 'pretrain', 'random', 'oracle']
step_sizes = [0.1, 0.2, 1.0, 0.0]
initial_params_files = [file1, file2, None, file3]

exp_names = [gen_name + name for name in names]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)),
                                       initial_params_files):
    avg_returns = []
    for goal_i, goal in zip(range(len(goals)), goals):

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(AntEnvDirecOracle())
            n_itr = 1
        else:
            env = normalize(AntEnvRandDirec())
            n_itr = 4
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None