def get_robustness(policy, env_name, fric_fractions=None, fric_bodies=None,
                   mass_fractions=None, mass_bodies=None, num_evals=5,
                   path_length=1000):
    """Evaluate a policy's robustness over a grid of friction/mass scalings.

    For every (friction fraction, mass fraction) pair, a fresh environment is
    built, the friction of ``fric_bodies[0]`` and the mass of ``mass_bodies[0]``
    are scaled, and the policy is rolled out ``num_evals`` times with a
    constant (zero) adversary.

    Parameters
    ----------
    policy : object
        Protagonist policy passed through to ``test_const_adv``.
    env_name : str
        Gym environment id wrapped by ``GymEnv``.
    fric_fractions, mass_fractions : sequence of float, optional
        Multipliers applied to the nominal friction / mass (default ``[1.0]``).
    fric_bodies, mass_bodies : sequence of bytes, optional
        Body names whose friction / mass are scaled; only the first entry is
        used (defaults ``[b'foot']`` / ``[b'torso']``).
    num_evals : int, optional
        Rollouts per grid cell (default 5).
    path_length : int, optional
        Maximum rollout length forwarded to ``test_const_adv`` (default 1000,
        which was hard-coded in the original).

    Returns
    -------
    (M, V, fis, mis) : tuple of 2-D numpy arrays, each of shape
        ``(len(fric_fractions), len(mass_fractions))`` — mean return, return
        std-dev, applied friction value, applied mass value per cell.
    """
    # None-sentinels instead of mutable default arguments (shared-list hazard).
    if fric_fractions is None:
        fric_fractions = [1.0]
    if fric_bodies is None:
        fric_bodies = [b'foot']
    if mass_fractions is None:
        mass_fractions = [1.0]
    if mass_bodies is None:
        mass_bodies = [b'torso']

    grid_shape = (len(fric_fractions), len(mass_fractions))
    M = np.zeros(grid_shape)    # mean return per cell
    V = np.zeros(grid_shape)    # std-dev of return per cell
    fis = np.zeros(grid_shape)  # actual friction value applied per cell
    mis = np.zeros(grid_shape)  # actual mass value applied per cell

    for fi, f in enumerate(fric_fractions):
        for mi, m in enumerate(mass_fractions):
            # Progress indicator over the full grid.
            print('{}/{}'.format((fi * len(mass_fractions)) + mi,
                                 len(mass_fractions) * len(fric_fractions)))
            # Fresh environment each cell; adversary magnitude fixed at 1.0.
            env = normalize(GymEnv(env_name, 1.0))

            # Scale friction of the chosen body.
            # NOTE(review): the body index is used to index geom_friction —
            # this assumes geom rows align with body indices; confirm against
            # the MuJoCo model before trusting multi-geom bodies.
            e = np.array(env.wrapped_env.env.model.geom_friction)
            fric_ind = env.wrapped_env.env.model.body_names.index(
                fric_bodies[0])
            e[fric_ind, 0] = e[fric_ind, 0] * f
            env.wrapped_env.env.model.geom_friction = e

            # Scale mass of the chosen body.
            me = np.array(env.wrapped_env.env.model.body_mass)
            mass_ind = env.wrapped_env.env.model.body_names.index(
                mass_bodies[0])
            me[mass_ind, 0] = me[mass_ind, 0] * m
            env.wrapped_env.env.model.body_mass = me

            # Roll out num_evals episodes under a constant adversary.
            t = np.array([test_const_adv(env, policy, path_length, 1)
                          for _ in range(num_evals)])
            M[fi, mi] = t.mean()
            V[fi, mi] = t.std()
            fis[fi, mi] = e[fric_ind, 0]
            mis[fi, mi] = me[mass_ind, 0]
    return M, V, fis, mis
## NOTE(review): fragment of a larger training script -- the call whose keyword
## arguments open this chunk (presumably a TRPO adversary optimizer) and the
## enclosing experiment loop begin outside this view, so the code is left
## byte-identical. It finishes the optimizer construction, initializes the
## per-instance test-reward lists against env_orig, and enters the alternating
## optimization loop (protagonist training shown; remainder continues past
## this chunk).
batch_size=batch_size, max_path_length=path_length, n_itr=n_adv_itr, discount=0.995, gae_lambda=gae_lambda, step_size=step_size, is_protagonist=False, scope='adversary_optim' ) ## Setting up summaries for testing for a specific training instance ## pro_rews = [] adv_rews = [] all_rews = [] const_testing_rews = [] const_testing_rews.append(test_const_adv(env_orig, pro_policy, path_length=path_length)) rand_testing_rews = [] rand_testing_rews.append(test_rand_adv(env_orig, pro_policy, path_length=path_length)) step_testing_rews = [] step_testing_rews.append(test_step_adv(env_orig, pro_policy, path_length=path_length)) rand_step_testing_rews = [] rand_step_testing_rews.append(test_rand_step_adv(env_orig, pro_policy, path_length=path_length)) adv_testing_rews = [] adv_testing_rews.append(test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length)) ## Beginning alternating optimization ## for ni in range(n_itr): logger.log('\n\nExperiment: {} Iteration: {}\n'.format(ne,ni,)) ## Train Protagonist pro_algo.train()
## NOTE(review): fragment -- the optimizer call these keyword arguments belong
## to starts outside this view; code left byte-identical. It configures a
## protagonist TRPO against a zero adversary and seeds the per-instance test
## lists. `adv_baseline=pro_baseline` looks like a copy-paste slip (elsewhere
## in this file an `adv_baseline` object is used) -- verify against the full
## function before changing.
adv_policy=zero_adv_policy, pro_baseline=pro_baseline, adv_baseline=pro_baseline, batch_size=batch_size, max_path_length=path_length, n_itr=n_pro_itr, discount=0.995, gae_lambda=gae_lambda, step_size=step_size, is_protagonist=True) ## Joint optimization ## if ifRender == True: test_const_adv(env, pro_policy, path_length=path_length, n_traj=1, render=True) pro_rews = [] adv_rews = [] all_rews = [] const_testing_rews = [] const_testing_rews.append( test_const_adv(env, pro_policy, path_length=path_length)) rand_testing_rews = [] rand_testing_rews.append( test_rand_adv(env, pro_policy, path_length=path_length)) step_testing_rews = [] step_testing_rews.append( test_step_adv(env, pro_policy, path_length=path_length)) rand_step_testing_rews = []
def train(num_experiments, thread_id, queue): ############ DEFAULT PARAMETERS ############ env_name = None #Name of adversarial environment path_length = 1000 #Maximum episode length layer_size = tuple([100, 100, 100]) #Layer definition ifRender = False #Should we render? afterRender = 100 #After how many to animate n_exps = 1 #Number of training instances to run n_itr = 25 #Number of iterations of the alternating optimization n_pro_itr = 1 #Number of iterations for the protaginist n_adv_itr = 1 #Number of interations for the adversary batch_size = 4000 #Number of training samples for each iteration ifSave = True #Should we save? save_every = 100 #Save checkpoint every save_every iterations n_process = 1 #Number of parallel threads for sampling environment adv_fraction = 0.25 #Fraction of maximum adversarial force to be applied step_size = 0.01 #kl step size for TRPO gae_lambda = 0.97 #gae_lambda for learner save_dir = './results' #folder to save result in ############ ENV SPECIFIC PARAMETERS ############ env_name = 'HopperAdv-v1' layer_size = tuple([64, 64]) step_size = 0.01 gae_lambda = 1.0 batch_size = 25000 n_exps = num_experiments n_itr = 500 ifSave = False n_process = 4 adv_fraction = 3.0 save_dir = './../results/StaticHopper' args = [ env_name, path_length, layer_size, ifRender, afterRender, n_exps, n_itr, n_pro_itr, n_adv_itr, batch_size, save_every, n_process, adv_fraction, step_size, gae_lambda, save_dir ] ############ ADVERSARIAL POLICY LOAD ############ filepath = './../initial_results/Hopper/env-HopperAdv-v1_Exp1_Itr500_BS25000_Adv0.25_stp0.01_lam1.0_369983.p' res_D = pickle.load(open(filepath, 'rb')) pretrained_adv_policy = res_D['adv_policy'] ############ MAIN LOOP ############ ## Initializing summaries for the tests ## const_test_rew_summary = [] rand_test_rew_summary = [] step_test_rew_summary = [] rand_step_test_rew_summary = [] adv_test_rew_summary = [] ## Preparing file to save results in ## save_prefix = 
'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format( env_name, n_exps, n_itr, batch_size, adv_fraction, step_size, gae_lambda, random.randint(0, 1000000)) save_name = save_dir + '/' + save_prefix ## Looping over experiments to carry out ## for ne in range(n_exps): ## Environment definition ## ## The second argument in GymEnv defines the relative magnitude of adversary. For testing we set this to 1.0. env = normalize(GymEnv(env_name, adv_fraction)) env_orig = normalize(GymEnv(env_name, 1.0)) ## Protagonist policy definition ## pro_policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=layer_size, is_protagonist=True) pro_baseline = LinearFeatureBaseline(env_spec=env.spec) ## Zero Adversary for the protagonist training ## zero_adv_policy = ConstantControlPolicy(env_spec=env.spec, is_protagonist=False, constant_val=0.0) ## Adversary policy definition ## adv_policy = pretrained_adv_policy adv_baseline = LinearFeatureBaseline(env_spec=env.spec) ## Initializing the parallel sampler ## parallel_sampler.initialize(n_process) ## Optimizer for the Protagonist ## pro_algo = TRPO(env=env, pro_policy=pro_policy, adv_policy=adv_policy, pro_baseline=pro_baseline, adv_baseline=adv_baseline, batch_size=batch_size, max_path_length=path_length, n_itr=n_pro_itr, discount=0.995, gae_lambda=gae_lambda, step_size=step_size, is_protagonist=True) ## Setting up summaries for testing for a specific training instance ## pro_rews = [] adv_rews = [] all_rews = [] const_testing_rews = [] const_testing_rews.append( test_const_adv(env_orig, pro_policy, path_length=path_length)) rand_testing_rews = [] rand_testing_rews.append( test_rand_adv(env_orig, pro_policy, path_length=path_length)) step_testing_rews = [] step_testing_rews.append( test_step_adv(env_orig, pro_policy, path_length=path_length)) rand_step_testing_rews = [] rand_step_testing_rews.append( test_rand_step_adv(env_orig, pro_policy, path_length=path_length)) adv_testing_rews = [] adv_testing_rews.append( 
test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length)) ## Beginning alternating optimization ## for ni in range(n_itr): logger.log('\n\nThread: {} Experiment: {} Iteration: {}\n'.format( thread_id, ne, ni, )) ## Train Protagonist pro_algo.train() pro_rews += pro_algo.rews all_rews += pro_algo.rews logger.log('Protag Reward: {}'.format( np.array(pro_algo.rews).mean())) ## Test the learnt policies const_testing_rews.append( test_const_adv(env, pro_policy, path_length=path_length)) rand_testing_rews.append( test_rand_adv(env, pro_policy, path_length=path_length)) step_testing_rews.append( test_step_adv(env, pro_policy, path_length=path_length)) rand_step_testing_rews.append( test_rand_step_adv(env, pro_policy, path_length=path_length)) adv_testing_rews.append( test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length)) if ni % afterRender == 0 and ifRender == True: test_const_adv(env, pro_policy, path_length=path_length, n_traj=1, render=True) if ni != 0 and ni % save_every == 0 and ifSave == True: ## SAVING CHECKPOINT INFO ## pickle.dump( { 'args': args, 'pro_policy': pro_policy, 'adv_policy': adv_policy, 'zero_test': [const_testing_rews], 'rand_test': [rand_testing_rews], 'step_test': [step_testing_rews], 'rand_step_test': [rand_step_testing_rews], 'iter_save': ni, 'exp_save': ne, 'adv_test': [adv_testing_rews] }, open(save_name + '_' + str(ni) + '.p', 'wb')) ## Shutting down the optimizer ## pro_algo.shutdown_worker() ## Updating the test summaries over all training instances const_test_rew_summary.append(const_testing_rews) rand_test_rew_summary.append(rand_testing_rews) step_test_rew_summary.append(step_testing_rews) rand_step_test_rew_summary.append(rand_step_testing_rews) adv_test_rew_summary.append(adv_testing_rews) queue.put([ const_test_rew_summary, rand_test_rew_summary, step_test_rew_summary, rand_step_test_rew_summary, adv_test_rew_summary ]) ############ SAVING MODEL ############ '''
if __name__ == "__main__":
    # CLI entry point: load a saved snapshot and replay the protagonist
    # policy against a constant adversary until the user declines to continue.
    cli = argparse.ArgumentParser()
    cli.add_argument('file', type=str, help='path to the snapshot file')
    cli.add_argument('--max_path_length',
                     type=int,
                     default=1000,
                     help='Max length of rollout')
    cli.add_argument('--speedup', type=float, default=1, help='Speedup')
    cli_args = cli.parse_args()

    # If the snapshot file use tensorflow, do:
    # import tensorflow as tf
    # with tf.Session():
    #     [rest of the code]
    with tf.Session() as sess:
        snapshot = joblib.load(cli_args.file)
        pro_policy = snapshot['pro_policy']
        args_pickle = snapshot['args']
        # Adversary magnitude 0 => evaluation without adversarial forces.
        env = normalize(GymEnv(args_pickle.env, 0))
        keep_going = True
        while keep_going:
            rollout_reward = test_const_adv(env,
                                            pro_policy,
                                            path_length=args_pickle.path_length,
                                            n_traj=5,
                                            render=False,
                                            speedup=10000)
            print(rollout_reward)
            keep_going = query_yes_no('Continue simulation?')
## NOTE(review): fragment -- the adversary-optimizer call these keyword
## arguments belong to, and the names `adv_name` / `adv_algo`, are defined
## outside this view; code left byte-identical. It finishes the optimizer
## setup (scope='adversary_optim'), seeds the test lists, then trains the
## adversary each iteration while logging adversary and protagonist rewards.
pro_policy=pro_policy, adv_policy=adv_policy, pro_baseline=pro_baseline, adv_baseline=adv_baseline, batch_size=batch_size, max_path_length=path_length, n_itr=n_adv_itr, discount=0.995, gae_lambda=gae_lambda, step_size=step_size, is_protagonist=False, scope='adversary_optim' ) ## Joint optimization ## if ifRender==True: test_const_adv(env, pro_policy, path_length=path_length, n_traj = 1, render=True) pro_rews = [] adv_rews = [] all_rews = [] const_testing_rews = [] const_testing_rews.append(test_const_adv(env_orig, pro_policy, path_length=path_length)) adv_testing_rews = [] adv_testing_rews.append(test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length)) #embed() for ni in range(n_itr): logger.log('\n\n\n####expNO{}_{} global itr# {}####\n\n\n'.format(ne,adv_name,ni)) adv_algo.train() adv_rews += adv_algo.rews; all_rews += adv_algo.rews; logger.log('Advers Reward: {}'.format(np.array(adv_algo.rews).mean())) pro_res_temp = test_const_adv(env, pro_policy, path_length=path_length) logger.log('Protag Reward No Adversary: {}'.format(pro_res_temp))