def test_param_space_noise(self):
    """Check that parameter-space noise controls action stochasticity.

    Trains a tiny MAML-TRPO run to produce a snapshot, reloads the
    pickled policy, and verifies:
      * with ``param_noise_std == 0`` two consecutive action means are
        identical;
      * with a nonzero noise std (passed per-call, or set as an
        attribute on the policy) consecutive action means differ by a
        measurable squared distance.
    """
    env = TfEnv(normalize(PointEnvMAML()))
    obs = env.reset()
    # Noise disabled at construction time; enabled later two ways.
    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy33",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
        param_noise_std=0.0,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    # Local import so the logger snapshot dir is configured only when
    # this test actually runs.
    import rllab.misc.logger as logger
    logger.set_snapshot_dir('/tmp/')
    logger.set_snapshot_mode('last')
    # Minimal training config: one iteration just to emit params.pkl.
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=3,
        max_path_length=10,
        meta_batch_size=4,
        num_grad_updates=1,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
    # Start from a clean graph so the snapshot restores into fresh
    # variables rather than colliding with the training graph.
    tf.reset_default_graph()
    pkl_file = os.path.join('/tmp/', 'params.pkl')
    with tf.Session() as sess:
        data = joblib.load(pkl_file)
        policy = data['policy']
        # No noise: repeated queries must return the exact same mean.
        action_1 = policy.get_action(obs)[1]['mean']
        action_2 = policy.get_action(obs)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        # assertAlmostEqual: the assertAlmostEquals alias is deprecated
        # and removed in Python 3.12.
        self.assertAlmostEqual(diff, 0.0)
        # Per-call noise: consecutive means should differ noticeably.
        action_1 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
        action_2 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        self.assertGreaterEqual(diff, 0.1)
        # Attribute-set noise: same expectation via the other code path.
        policy.param_noise_std = 1.0
        action_1 = policy.get_action(obs)[1]['mean']
        action_2 = policy.get_action(obs)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        self.assertGreaterEqual(diff, 0.1)
def test_serialization(self):
    """Check that a trained policy survives a pickle round-trip.

    Trains one tiny MAML-TRPO iteration, reloads the snapshot, then
    pickles and unpickles the policy in a fresh TF graph and asserts
    the action mean for the same observation is unchanged (to 3
    decimal places).
    """
    env = TfEnv(normalize(PointEnvMAML()))
    obs = env.reset()
    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy56",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    # Local import so the logger snapshot dir is configured only when
    # this test actually runs.
    import rllab.misc.logger as logger
    logger.set_snapshot_dir('/tmp/')
    logger.set_snapshot_mode('last')
    # Minimal training config: one iteration just to emit params.pkl.
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2,
        max_path_length=10,
        meta_batch_size=4,
        num_grad_updates=1,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
    tf.reset_default_graph()
    pkl_file = os.path.join('/tmp/', 'params.pkl')
    with tf.Session() as sess:
        data = joblib.load(pkl_file)
        policy = data['policy']
        action_before = policy.get_action(obs)[1]['mean']
        # Serialize while the variables are still live in this session.
        dump_string = pickle.dumps(policy)
    # Deserialize into a brand-new graph/session to prove the pickle
    # carries everything needed to rebuild the policy.
    tf.reset_default_graph()
    with tf.Session() as sess:
        policy_loaded = pickle.loads(dump_string)
        action_after = policy_loaded.get_action(obs)[1]['mean']
        diff = np.sum(np.abs(action_before - action_after))
        # assertAlmostEqual: the assertAlmostEquals alias is deprecated
        # and removed in Python 3.12.
        self.assertAlmostEqual(diff, 0.0, places=3)
def test_get_mean_stepsize(self):
    """Check that a trainable step size initializes to grad_step_size.

    Builds a policy with ``trainable_step_size=True`` and
    ``grad_step_size=0.7`` and asserts the reported mean step size
    equals 0.7 right after variable initialization.
    """
    env = TfEnv(normalize(PointEnvMAML()))
    obs = env.reset()
    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy2",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
        trainable_step_size=True,
        grad_step_size=0.7,
    )
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        mean_stepsize_1 = policy.get_mean_step_size()
        # assertAlmostEqual: the assertAlmostEquals alias is deprecated
        # and removed in Python 3.12.
        self.assertAlmostEqual(mean_stepsize_1, 0.7, places=5)
initial_params_file1, initial_params_file3, None, initial_params_file4 ] gen_name = 'icml_point_results_' names = ['maml', 'maml0', 'random', 'oracle'] exp_names = [gen_name + name for name in names] all_avg_returns = [] for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files): avg_returns = [] for goal in goals: goal = list(goal) if initial_params_file is not None and 'oracle' in initial_params_file: env = normalize(PointEnvRandGoalOracle(goal=goal)) n_itr = 1 else: env = normalize(PointEnvRandGoal(goal=goal)) n_itr = 5 env = TfEnv(env) policy = GaussianMLPPolicy( # random policy name='policy', env_spec=env.spec, hidden_sizes=(100, 100), ) if initial_params_file is not None: policy = None baseline = LinearFeatureBaseline(env_spec=env.spec)
from sandbox_maml.rocky.tf.algos.trpo import TRPO from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline from rllab_maml.envs.box2d.cartpole_env import CartpoleEnv from rllab_maml.envs.normalized_env import normalize from sandbox_maml.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy from sandbox_maml.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy from sandbox_maml.rocky.tf.envs.base import TfEnv import sandbox_maml.rocky.tf.core.layers as L from sandbox_maml.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp from rllab_maml.misc.instrument import stub, run_experiment_lite stub(globals()) env = TfEnv(normalize(CartpoleEnv())) policy = GaussianLSTMPolicy( name="policy", env_spec=env.spec, lstm_layer_cls=L.TfBasicLSTMLayer, # gru_layer_cls=L.GRULayer, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=100, n_itr=10,
gen_name = 'icml_ant_results_' names = ['maml','pretrain','random', 'oracle'] exp_names = [gen_name + name for name in names] step_sizes = [0.1, 0.2, 1.0, 0.0] initial_params_files = [file1, file2, None, file3] all_avg_returns = [] for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files): avg_returns = [] for goal in goals: if initial_params_file is not None and 'oracle' in initial_params_file: env = normalize(AntEnvOracle()) n_itr = 1 else: env = normalize(AntEnvRand()) n_itr = 4 env = TfEnv(env) policy = GaussianMLPPolicy( # random policy name='policy', env_spec=env.spec, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), ) if initial_params_file is not None: policy = None
gen_name = 'icml_cheetah_results_' names = ['maml', 'pretrain', 'random', 'oracle'] exp_names = [gen_name + name for name in names] step_sizes = [0.1, 0.02, 0.1, 0.0] initial_params_files = [file1, file2, None, file3] all_avg_returns = [] for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files): avg_returns = [] for goal in goals: if initial_params_file is not None and 'oracle' in initial_params_file: env = normalize(HalfCheetahEnvOracle()) n_itr = 1 else: env = normalize(HalfCheetahEnvRand()) n_itr = 4 env = TfEnv(env) policy = GaussianMLPPolicy( # random policy name='policy', env_spec=env.spec, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), ) if initial_params_file is not None: policy = None
step_sizes = [0.1, 0.2, 1.0, 0.0] initial_params_files = [file1, file2, None, file3] names = ['random'] exp_names = [gen_name + name for name in names] initial_params_files = [None] step_sizes = [0.5] all_avg_returns = [] for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files): avg_returns = [] for goal_i, goal in zip(range(len(goals)), goals): if initial_params_file is not None and 'oracle' in initial_params_file: env = normalize(HalfCheetahEnvDirecOracle()) n_itr = 1 else: env = normalize(HalfCheetahEnvRandDirec()) n_itr = 4 env = TfEnv(env) policy = GaussianMLPPolicy( # random policy name='policy', env_spec=env.spec, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), ) if initial_params_file is not None: policy = None
return [2] # should also code up alternative KL thing variants = VG().variants() max_path_length = 200 num_grad_updates = 1 use_maml=True for v in variants: task_var = v['task_var'] if task_var == 0: env = TfEnv(normalize(AntEnvRandDirec())) task_var = 'direc' elif task_var == 1: env = TfEnv(normalize(AntEnvRand())) task_var = 'vel' elif task_var == 2: env = TfEnv(normalize(AntEnvRandGoal())) task_var = 'pos' policy = MAMLGaussianMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=v['fast_lr'], hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100,100), )
# should also code up alternative KL thing variants = VG().variants() max_path_length = 200 num_grad_updates = 1 use_maml=True for v in variants: direc = v['direc'] learning_rate = v['meta_step_size'] if direc: env = TfEnv(normalize(HalfCheetahEnvRandDirec())) else: env = TfEnv(normalize(HalfCheetahEnvRand())) policy = MAMLGaussianMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=v['fast_lr'], hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100,100), ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = MAMLTRPO( env=env, policy=policy, baseline=baseline,
baselines = ['linear'] fast_batch_size = 20 # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2] meta_batch_size = 40 # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable max_path_length = 100 num_grad_updates = 1 meta_step_size = 0.01 use_maml = True interpreter_path = sys.executable for fast_learning_rate in fast_learning_rates: for learning_rate in learning_rates: for bas in baselines: stub(globals()) env = TfEnv(normalize(PointEnvRandGoal())) policy = MAMLGaussianMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=fast_learning_rate, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), ) if bas == 'zero': baseline = ZeroBaseline(env_spec=env.spec) elif 'linear' in bas: baseline = LinearFeatureBaseline(env_spec=env.spec) else: baseline = GaussianMLPBaseline(env_spec=env.spec) algo = MAMLTRPO( env=env,
fast_learning_rates = [0.1] baselines = ['linear'] fast_batch_size = 20 meta_batch_size = 60 max_path_length = 10 num_grad_updates = 1 meta_step_size = 0.01 use_maml = True for fast_learning_rate in fast_learning_rates: for bas in baselines: stub(globals()) env = TfEnv(normalize(GridWorldEnvRand('four-state'))) policy = MAMLCategoricalMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=fast_learning_rate, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100,100), ) if bas == 'zero': baseline = ZeroBaseline(env_spec=env.spec) elif 'linear' in bas: baseline = LinearFeatureBaseline(env_spec=env.spec) else: baseline = GaussianMLPBaseline(env_spec=env.spec) algo = MAMLTRPO( env=env,
from sandbox_maml.rocky.tf.algos.trpo import TRPO from sandbox_maml.rocky.tf.algos.vpg import VPG from sandbox_maml.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy from sandbox_maml.rocky.tf.envs.base import TfEnv from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline from maml_examples.point_env_randgoal_oracle import PointEnvRandGoalOracle from rllab_maml.envs.normalized_env import normalize from rllab_maml.misc.instrument import stub, run_experiment_lite stub(globals()) import tensorflow as tf env = normalize(PointEnvRandGoalOracle()) env = TfEnv(env) policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=1000, max_path_length=100, n_itr=100, discount=0.99,
variants = VG().variants() max_path_length = 200 num_grad_updates = 1 use_maml = True for v in variants: task_var = v['task_var'] oracle = v['oracle'] if task_var == 0: task_var = 'direc' exp_prefix = 'bugfix_trpo_maml_antdirec' + str(max_path_length) if oracle: env = TfEnv(normalize(AntEnvDirecOracle())) else: env = TfEnv(normalize(AntEnvRandDirec())) elif task_var == 1: task_var = 'vel' exp_prefix = 'posticml_trpo_maml_ant' + str(max_path_length) if oracle: env = TfEnv(normalize(AntEnvOracle())) else: env = TfEnv(normalize(AntEnvRand())) elif task_var == 2: task_var = 'pos' exp_prefix = 'posticml_trpo_maml_antpos_' + str(max_path_length) if oracle: env = TfEnv(normalize(AntEnvRandGoalOracle())) else:
gen_name = 'icml_antdirec_results_' names = ['maml', 'pretrain', 'random', 'oracle'] step_sizes = [0.1, 0.2, 1.0, 0.0] initial_params_files = [file1, file2, None, file3] exp_names = [gen_name + name for name in names] all_avg_returns = [] for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files): avg_returns = [] for goal_i, goal in zip(range(len(goals)), goals): if initial_params_file is not None and 'oracle' in initial_params_file: env = normalize(AntEnvDirecOracle()) n_itr = 1 else: env = normalize(AntEnvRandDirec()) n_itr = 4 env = TfEnv(env) policy = GaussianMLPPolicy( # random policy name='policy', env_spec=env.spec, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), ) if initial_params_file is not None: policy = None