def experiment(variant):
    # MAML-IL meta-training entry point: builds the task distribution, policy, and MAMLIL algo from `variant`.
    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']
    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse')
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())
    else:
        raise AssertionError('Unknown envType: ' + str(envType))

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)
    elif 'fullAda_Bias' in policyType:
        policy = fullAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'biasAda_Bias' in policyType:
        policy = biasAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )
    elif 'conv' in policyType:
        baseline = ZeroBaseline(env_spec=env.spec)
        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,         # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,    # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=1,  # 100
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc)
    algo.train()
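# Illustrative usage only (not part of the original source): a minimal `variant`
# dict covering every key that the MAML-IL experiment() above reads. All values
# below are hypothetical placeholders, and module-level names such as setup(),
# path_to_multiworld, and the policy/env classes are assumed to be defined
# elsewhere in this file.
_example_il_variant = dict(
    seed=1,
    log_dir='/tmp/maml_il_run/',
    fbs=10,                    # fast (inner-loop) batch size: trajectories per task
    mbs=20,                    # meta batch size: tasks per outer update
    adam_steps=500,
    max_path_length=150,
    dagger=False,
    expert_policy_loc=None,
    ldim=4,                    # latent dimension
    init_flr=0.5,              # initial fast learning rate
    policyType='fullAda_Bias',
    use_maesn=False,
    expertDataLoc='/path/to/expert_trajs/',   # placeholder path
    envType='Push',
    tasksFile='push_tasks_placeholder',       # placeholder goals file name
    load_policy=None,
)
# experiment(_example_il_variant)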
def experiment(variant, comet_logger=None):
    # Single-task RL fine-tuning entry point: adapts a meta-trained policy on the task selected by taskIndex.
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.algos.vpg import VPG as vpg_basic
    from sandbox.rocky.tf.algos.vpg_biasADA import VPG as vpg_biasADA
    from sandbox.rocky.tf.algos.vpg_fullADA import VPG as vpg_fullADA
    from sandbox.rocky.tf.algos.vpg_conv import VPG as vpg_conv
    from sandbox.rocky.tf.algos.ppo import PPO as ppo
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.envs.mujoco.sawyer_xyz.multi_domain.push_door import Sawyer_MultiDomainEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_coffee import SawyerCoffeeEnv
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler
    # import gym
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
        MAMLGaussianMLPPolicy as PPO_policy
    from sandbox.rocky.tf.envs.base import TfEnv
    import pickle
    import argparse
    import csv
    import joblib
    import numpy as np
    import tensorflow as tf

    print("%%%%%%%%%%%%%%%%%", comet_logger)

    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']
    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))
    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]), " $$$$$$$$$$$$$$$")

    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Biped' in envType:
        # import terrainRLSim
        # from simAdapter import terrainRLSim
        import simAdapter
        import gym
        env = gym.make("PD_Biped2D_Gaps_Terrain-v0")
        env = TfEnv(normalize(env))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)
    else:
        raise AssertionError('Unknown envType: ' + str(envType))

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    # baseline = ZeroBaseline(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    batch_size = variant['batch_size']

    if policyType == 'fullAda_Bias':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger)
    elif policyType == 'biasAda_Bias':
        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    elif policyType == 'PPO':
        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=variant['init_flr'],
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(128, 128),
            init_flr_full=variant['init_flr'],
            latent_dim=variant['ldim'],
            learn_std=False)
        algo = ppo(
            env=env,
            policy=policy,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger)
    elif policyType == 'basic':
        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={'init_learning_rate': default_step,
                            'tf_optimizer_args': {'learning_rate': 0.5 * default_step},
                            'tf_optimizer_cls': tf.train.GradientDescentOptimizer},
            log_dir=log_dir
            # extra_input="onehot_exploration",  # added by RK 6/19
            # extra_input_dim=5,  # added by RK 6/19
        )
    elif 'conv' in policyType:
        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt=True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    else:
        raise AssertionError('policyType must be fullAda_Bias, biasAda_Bias, PPO, basic, or a conv variant')

    algo.train()
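# Illustrative usage only (not part of the original source): a hypothetical
# `variant` for the single-task fine-tuning experiment() above. The keys mirror
# the lookups in that function; the values are placeholders.
_example_rl_variant = dict(
    seed=1,
    log_dir='/tmp/rl_finetune/',
    n_parallel=1,
    init_file='/path/to/meta_trained_policy.pkl',  # placeholder checkpoint from meta-training
    taskIndex=0,
    n_itr=100,
    default_step=0.5,
    policyType='fullAda_Bias',
    envType='Push',
    tasksFile='push_tasks_placeholder',            # placeholder goals file name
    max_path_length=150,
    batch_size=7500,
    init_flr=0.5,   # only read when policyType == 'PPO'
    ldim=4,         # only read when policyType == 'PPO'
)
# experiment(_example_rl_variant, comet_logger=None)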
def experiment(variant, comet_exp_key=None):
    # MAML-IL meta-training entry point with optional Comet.ml logging.
    if comet_exp_key is not None:
        from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        from comet_ml import Experiment, ExistingExperiment
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                                  previous_experiment_key=variant['comet_exp_key'])
        comet_log = ExistingExperiment(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
                                       previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                         project_name="ml4l3", workspace="glenb")
        comet_log.set_name("test seq train")
        # comet_log = comet_exp_key
        print(comet_log)
    else:
        comet_log = None

    print("loading libraries")
    from sandbox.rocky.tf.algos.maml_il import MAMLIL
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
    from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import MAMLGaussianMLPPolicy as basic_policy
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import MAMLGaussianMLPPolicy as fullAda_basic_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
        MAMLGaussianMLPPolicy as PPO_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import MAMLGaussianMLPPolicy as conv_policy
    from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
    from sandbox.rocky.tf.envs.base import TfEnv
    import sandbox.rocky.tf.core.layers as L
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    import tensorflow as tf
    import time
    from rllab.envs.gym_env import GymEnv
    from maml_examples.maml_experiment_vars import MOD_FUNC
    import numpy as np
    import random as rd
    import pickle
    print("Done loading libraries")

    seed = variant['seed']; n_parallel = 1; log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']; meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']; max_path_length = variant['max_path_length']
    dagger = variant['dagger']; expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim']; init_flr = variant['init_flr']
    policyType = variant['policyType']; use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks), \
        "meta batch size wrong: " + str(meta_batch_size) + " <= " + str(len(all_tasks))
    tasks = all_tasks[:meta_batch_size]
    print("^^^^^^^^^^^^^^^^ meta_tasks: ", tasks, " ^^^^^^^^^^^^^^^^ ")

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse')
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())
    else:
        raise AssertionError('Unknown envType: ' + str(envType))

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)
    elif 'fullAda_PPO' in policyType:
        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'fullAda_Bias' in policyType:
        policy = fullAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'biasAda_Bias' in policyType:
        policy = biasAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )
    elif 'conv' in policyType:
        baseline = ZeroBaseline(env_spec=env.spec)
        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    print("|||||||||||||||||||||||||||||||||||||||||||||||", variant['n_itr'])

    beta_steps = 1; meta_step_size = 0.01; num_grad_updates = 1
    pre_std_modifier = 1.0; post_std_modifier = 0.00001
    limit_demos_num = None

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,         # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,    # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=variant['n_itr'],
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
        comet_logger=comet_log,
        outerIteration=variant['outer_Iteration'],
        use_ppo=True)
    algo.train()
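# Illustrative usage only: compared with the first MAML-IL entry point, this
# Comet-enabled variant additionally reads 'n_itr', 'outer_Iteration', and
# 'comet_exp_key' from `variant`. A hypothetical call (placeholder values):
# experiment({**_example_il_variant,
#             'n_itr': 500, 'outer_Iteration': 0,
#             'comet_exp_key': '<existing-comet-experiment-key>'},
#            comet_exp_key='<existing-comet-experiment-key>')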
def experiment(variant, comet_exp_key=None):
    # Single-task RL fine-tuning entry point with optional Comet.ml logging.
    comet_logger = None
    if comet_exp_key is not None:
        # from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        from comet_ml import ExistingExperiment  # needed for the call below
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                                  previous_experiment_key=variant['comet_exp_key'])
        comet_logger = ExistingExperiment(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
                                          previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                         project_name="ml4l3", workspace="glenb")
        comet_logger.set_name("test seq train")
        # comet_log = comet_exp_key
        print("RL!: ", comet_logger)

    print("%%%%%%%%%%%%%%%%%", comet_logger)

    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']
    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))
    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]), " $$$$$$$$$$$$$$$")

    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)
    else:
        raise AssertionError('Unknown envType: ' + str(envType))

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    baseline = ZeroBaseline(env_spec=env.spec)
    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    batch_size = variant['batch_size']

    if policyType == 'fullAda_Bias':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger,
            outer_iteration=variant['outer_iteration'])
    elif policyType == 'biasAda_Bias':
        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    elif policyType == 'basic':
        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={
                'init_learning_rate': default_step,
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * default_step
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            },
            log_dir=log_dir
            # extra_input="onehot_exploration",  # added by RK 6/19
            # extra_input_dim=5,  # added by RK 6/19
        )
    elif 'conv' in policyType:
        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt=True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    else:
        raise AssertionError('policyType must be fullAda_Bias, biasAda_Bias, basic, or a conv variant')

    algo.train()
def experiment(variant):
    # MAML-TRPO meta-training on the door-opening task distribution.
    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_learning_rate = variant['flr']
    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works, but much less stable; 20 is fairly stable; 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    tasksFile = '/root/code/multiworld/multiworld/envs/goals/Door_60X20X20.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    baseEnv = SawyerDoorOpenEnv(tasks=tasks)
    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
    )

    # import os
    # saveDir = variant['saveDir']
    # if os.path.isdir(saveDir) == False:
    #     os.mkdir(saveDir)
    # logger.set_snapshot_dir(saveDir)
    # # logger.set_snapshot_gap(20)
    # logger.add_tabular_output(saveDir + 'progress.csv')

    algo.train()
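# Illustrative usage only (not part of the original source): a hypothetical
# `variant` for the MAML-TRPO experiment() above. The remaining hyperparameters
# (meta_batch_size, max_path_length, tasksFile) are hard-coded inside it;
# all values below are placeholders.
_example_trpo_variant = dict(
    seed=1,
    n_parallel=4,
    log_dir='/tmp/maml_trpo_door/',
    flr=0.1,                  # fast (inner-loop) learning rate
    fbs=20,                   # trajectories per task for the inner update
    mlr=0.01,                 # meta (outer-loop) step size
    hidden_sizes=(100, 100),
)
# experiment(_example_trpo_variant)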
def experiment(variant):
    # Single-task RL fine-tuning entry point (no Comet logging).
    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']
    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))
    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType

    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)
    else:
        raise AssertionError('Unknown envType: ' + str(envType))

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    baseline = ZeroBaseline(env_spec=env.spec)
    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    batch_size = variant['batch_size']

    if policyType == 'fullAda_Bias':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    elif policyType == 'biasAda_Bias':
        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    elif policyType == 'basic':
        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={
                'init_learning_rate': default_step,
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * default_step
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            },
            log_dir=log_dir
            # extra_input="onehot_exploration",  # added by RK 6/19
            # extra_input_dim=5,  # added by RK 6/19
        )
    elif 'conv' in policyType:
        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt=True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    else:
        raise AssertionError('policyType must be fullAda_Bias, biasAda_Bias, basic, or a conv variant')

    algo.train()
def experiment(variant, comet_logger=comet_logger):
    # MAML-IL meta-training entry point; expects a module-level comet_logger bound at definition time.
    from sandbox.rocky.tf.algos.maml_il import MAMLIL
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
    from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import MAMLGaussianMLPPolicy as basic_policy
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import MAMLGaussianMLPPolicy as fullAda_basic_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import MAMLGaussianMLPPolicy as conv_policy
    from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
    from sandbox.rocky.tf.envs.base import TfEnv
    import sandbox.rocky.tf.core.layers as L
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    import tensorflow as tf
    import time
    from rllab.envs.gym_env import GymEnv
    from maml_examples.maml_experiment_vars import MOD_FUNC
    import numpy as np
    import random as rd
    import pickle
    import rllab.misc.logger as logger
    from rllab.misc.ext import set_seed
    import os

    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']

    def setup(seed, n_parallel, log_dir):
        if seed is not None:
            set_seed(seed)
        if n_parallel > 0:
            from rllab.sampler import parallel_sampler
            parallel_sampler.initialize(n_parallel=n_parallel)
            if seed is not None:
                parallel_sampler.set_seed(seed)
        if not os.path.isdir(log_dir):
            os.makedirs(log_dir, exist_ok=True)
        logger.set_snapshot_dir(log_dir)
        logger.add_tabular_output(log_dir + '/progress.csv')

    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']
    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse')
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())
    else:
        raise AssertionError('Unknown envType: ' + str(envType))

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)
    elif 'fullAda_Bias' in policyType:
        policy = fullAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'biasAda_Bias' in policyType:
        policy = biasAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )
    elif 'conv' in policyType:
        baseline = ZeroBaseline(env_spec=env.spec)
        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,         # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,    # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=variant['iterations'],
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
        comet_logger=comet_logger)
    algo.train()
    tf.reset_default_graph()
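# Illustrative usage only: this last MAML-IL entry point reads
# variant['iterations'] for the outer-loop count (instead of 'n_itr') and binds
# a module-level comet_logger as its default argument. A hypothetical call
# reusing the placeholder dict defined earlier:
# experiment({**_example_il_variant, 'iterations': 500}, comet_logger=None)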