import pickle
import tensorflow as tf
# Codebase-specific imports (setup, TfEnv, normalize, PointEnvRandGoal,
# MAMLGaussianMLPPolicy, LinearFeatureBaseline, MAMLTRPO, the Sawyer/multiworld
# envs, and the MAML-IL classes used in the scripts below) are assumed to come
# from the surrounding maml_rl / multiworld codebase.


def experiment(variant):
    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_learning_rate = variant['flr']
    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2]; 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works but is much less stable; 20 is fairly stable, 40 is more stable
    max_path_length = 100
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    env = TfEnv(normalize(PointEnvRandGoal()))
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
    )
    algo.train()
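# For reference, launchers like the one above are driven by a plain `variant`
# dictionary. A minimal, hypothetical invocation (the keys match what
# `experiment` reads; the values are illustrative, not settings from any
# reported run):
variant = dict(
    seed=1,
    n_parallel=4,
    log_dir='/tmp/maml_point/',
    flr=0.1,                  # fast (inner-loop) learning rate
    fbs=20,                   # fast batch size: trajectories per inner update
    mlr=0.01,                 # meta (outer-loop) step size
    hidden_sizes=(100, 100),
)
experiment(variant)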
def experiment(variant):
    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    expertDataLoc = variant['expertDataLoc']
    expertDataItr = variant['expertDataItr']
    fast_learning_rate = variant['flr']
    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2]; 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works but is much less stable; 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    regionSize = variant['regionSize']
    if regionSize == '20X20':
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_v1.pkl'
    else:
        assert regionSize == '60X30'
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/PickPlace_60X30.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    envType = variant['envType']
    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'
        baseEnv = SawyerPickPlaceEnv(tasks=tasks)
    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
        numExpertPolicies=20,
        expertDataInfo={'expert_loc': expertDataLoc, 'expert_itr': expertDataItr},
    )
    algo.train()
def experiment(variant):
    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    diff_post_policy = variant['diff_post_policy']
    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']
    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = '/home/code/multiworld/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    # tasksFile = '/home/russell/multiworld/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType
    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'MultiPush' in envType:
        baseEnv = SawyerMultiPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse')
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())
    else:
        raise AssertionError('unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door', 'SawyerMultiDomain', 'Coffee', 'SawyerMultiPush']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='task')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']
    hidden_sizes = variant['hidden_sizes']
    if load_policy is not None:
        policy = None
        if 'conv' in load_policy:
            baseline = ZeroBaseline(env_spec=env.spec)
    elif 'fullAda_Bias' in policyType:
        policy = fullAda_Bias_mamlPolicy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=hidden_sizes,
            init_flr_full=init_flr,
            latent_dim=ldim,
        )
    elif 'biasAda_Bias' in policyType:
        policy = biasAda_Bias_mamlPolicy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=hidden_sizes,
            init_flr_full=init_flr,
            latent_dim=ldim,
        )
    elif 'basic' in policyType:
        policy = basic_mamlPolicy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=hidden_sizes,
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )
    elif 'conv' in policyType:
        baseline = ZeroBaseline(env_spec=env.spec)
        policy = conv_mamlPolicy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=hidden_sizes,
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    if diff_post_policy:
        post_policy = basic_policy(
            name="post_policy",
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=hidden_sizes,
            latent_dim=ldim,
            std_modifier=post_std_modifier,
        )
        assert 'biasAda_Bias' in policyType
    else:
        post_policy = None

    # num_grad_updates, meta_step_size, beta_steps, pre_std_modifier,
    # post_std_modifier, l2loss_std_mult, use_corr_term, test_on_training_goals,
    # limit_demos_num, extra_input, extra_input_dim, and MOD_FUNC are used below
    # but not set in this function; they are assumed to be module-level settings
    # defined elsewhere in this script.
    algo = algoClass(
        env=env,
        policy=policy,
        baseline=baseline,
        post_policy=post_policy,
        load_policy=load_policy,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=50,
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
    )
    algo.train()
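# Both imitation launchers index MOD_FUNC, a table of importance-sampling
# modifiers defined elsewhere in the codebase. A minimal sketch of what such a
# table could look like, assuming each entry maps a name to a function applied
# to the per-sample importance weights, with '' as a no-op (the 'clip0.5' entry
# is purely illustrative, not a key from the real table):
import numpy as np

MOD_FUNC = {
    '': lambda weights: weights,                             # identity: leave weights untouched
    'clip0.5': lambda weights: np.clip(weights, 0.0, 0.5),   # example: clip large weights
}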
def experiment(variant):
    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_learning_rate = variant['flr']
    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2]; 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works but is much less stable; 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    tasksFile = '/root/code/multiworld/multiworld/envs/goals/Door_60X20X20.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))
    baseEnv = SawyerDoorOpenEnv(tasks=tasks)
    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
    )
    # import os
    # saveDir = variant['saveDir']
    # if not os.path.isdir(saveDir):
    #     os.mkdir(saveDir)
    # logger.set_snapshot_dir(saveDir)
    # # logger.set_snapshot_gap(20)
    # logger.add_tabular_output(saveDir + 'progress.csv')
    algo.train()
def experiment(variant):
    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']
    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    # path_to_multiworld is assumed to be defined at module level.
    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))
    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)
    else:
        raise AssertionError('unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    baseline = ZeroBaseline(env_spec=env.spec)
    # baseline = LinearFeatureBaseline(env_spec=env.spec)

    batch_size = variant['batch_size']
    if policyType == 'fullAda_Bias':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
        )
    elif policyType == 'biasAda_Bias':
        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
        )
    elif policyType == 'basic':
        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={
                'init_learning_rate': default_step,
                'tf_optimizer_args': {'learning_rate': 0.5 * default_step},
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer,
            },
            log_dir=log_dir,
            # extra_input="onehot_exploration",  # added by RK 6/19
            # extra_input_dim=5,  # added by RK 6/19
        )
    elif 'conv' in policyType:
        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt=True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
        )
    else:
        raise AssertionError('policyType must be fullAda_Bias, biasAda_Bias, basic, or conv')

    algo.train()
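# A hypothetical invocation of the fine-tuning launcher above, adapting a
# meta-trained snapshot to a single held-out task index. All values are
# illustrative, and init_file is a made-up checkpoint path:
variant = dict(
    seed=1,
    n_parallel=1,
    log_dir='/tmp/maml_finetune/',
    init_file='/tmp/maml_door/itr_500.pkl',  # hypothetical meta-trained checkpoint
    taskIndex=3,              # which held-out task to adapt to
    n_itr=5,                  # a handful of gradient steps at test time
    default_step=0.05,        # adaptation step size
    policyType='fullAda_Bias',
    envType='Door',
    tasksFile='Door_60X20X20',
    max_path_length=150,
    batch_size=20,
)
experiment(variant)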
def experiment(variant):
    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_learning_rate = variant['flr']
    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    envClass = variant['envClass']

    beta_steps = 1
    adam_steps = variant['adam_steps']
    updateMode = 'vec'
    adam_curve = None
    env_option = ''
    extra_input = "onehot_exploration"  # "onehot_exploration" or "gaussian_exploration"
    # extra_input = None
    extra_input_dim = 5
    num_grad_updates = 1
    meta_step_size = 0.01
    pre_std_modifier = 1.0
    post_std_modifier_train = 0.00001
    post_std_modifier_test = 0.00001
    l2loss_std_mult = 1.0
    ism = ''
    limit_demos_num = 40  # 40
    test_goals_mult = 1
    bas_lr = 0.01  # baseline learning rate
    momentum = 0.5
    bas_hnl = tf.nn.relu
    hidden_layers = (100, 100)
    basas = 60  # baseline adam steps
    use_corr_term = True
    # seeds = [1, 2, 3, 4, 5, 6, 7]
    use_maml = True
    test_on_training_goals = False

    env = None
    if envClass == 'Ant':
        env = TfEnv(normalize(AntEnvRandGoalRing()))
        max_path_length = 200
        EXPERT_TRAJ_LOCATION_DICT = '/root/code/rllab/saved_expert_traj/Expert_trajs_dense_ant/'
    elif envClass == 'SawyerPusher':
        baseEnv = FlatGoalEnv(SawyerPushEnv(tasks=None), obs_keys=['state_observation'])
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(baseEnv, reset_mode='task')))
        max_path_length = 150
        EXPERT_TRAJ_LOCATION_DICT = '/root/code/maml_gps/saved_expert_traj/Expert_trajs_sawyer_pusher/'
    else:
        raise AssertionError('Env must be either Ant or SawyerPusher')

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
        std_modifier=pre_std_modifier,
        # metalearn_baseline=(bas == "MAMLGaussianMLP"),
        extra_input_dim=(0 if extra_input is None else extra_input_dim),
        updateMode=updateMode,
        num_tasks=meta_batch_size,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLIL(
        env=env,
        policy=policy,
        # policy=None,
        # load_policy='/home/alvin/maml_rl/data/local/R7-IL-0918/R7_IL_200_40_1_1_dem40_ei5_as50_basl_1809_04_27/itr_24.pkl',
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=200,  # 100
        make_video=False,
        use_maml=use_maml,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=test_goals_mult,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=adam_curve,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[ism],
        post_std_modifier_train=post_std_modifier_train,
        post_std_modifier_test=post_std_modifier_test,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION_DICT,  # [env_option + "." + mode + goals_suffix]
        expert_trajs_suffix="",
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input is None else extra_input_dim),
        updateMode=updateMode,
    )
    algo.train()
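# A hypothetical invocation of the MAML-IL launcher above for the Ant task
# distribution; the keys match what `experiment` reads, and the values are
# illustrative only:
variant = dict(
    seed=1,
    n_parallel=4,
    log_dir='/tmp/maml_il_ant/',
    flr=0.5,          # inner-loop (alpha) learning rate
    fbs=20,           # trajectories per inner update
    mbs=40,           # tasks per meta (beta) update
    adam_steps=500,   # inner Adam steps on the imitation loss
    envClass='Ant',
)
experiment(variant)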