def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    joblib.dump(str(env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array(
                [0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8, -9, -10,
                 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28, 35, -36, -37, 38,
                 39, -40, 29, -30, -31, 32, 33, -34, 42, 41, 43]),
            action_permutation=np.array(
                [-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]))

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    pposgd_mirror.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=int(2500),
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=callback,
                        sym_loss_weight=0.0,
                        positive_rew_enforce=False,
                        #init_policy_params = joblib.load('data/ppo_DartWalker3d-v119_energy03_vel4_3s_mirror4_velrew3_damping5_anklesprint100_5_rotpen0_rew01xinit_stagedcurriculum4s75s34ratio/policy_params.pkl'),
                        reward_drop_bound=True,
                        )
    env.close()
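# The signed permutation arrays passed to MlpMirrorPolicy encode the left/right
# mirroring used by the mirror symmetry loss: the magnitude of each entry is the
# source index and the sign says whether that component is negated, with a small
# magnitude such as 0.0001 (or 0.001) standing in for index 0, which cannot carry
# a sign of its own. Below is a minimal sketch of how such an array can be applied,
# assuming that convention; mirror_vector is a hypothetical helper, not part of the repo.
def mirror_vector(x, permutation):
    idx = np.abs(np.round(permutation)).astype(int)  # 0.0001 rounds to index 0
    sign = np.sign(permutation)                      # negative entries flip the sign
    return sign * x[idx]

# Example with the action permutation used in train_mirror above:
# act_perm = np.array([-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8])
# mirrored_action = mirror_vector(action, act_perm)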
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    joblib.dump(str(env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array(
                [0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8, 9, 10,
                 -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23,
                 28, 29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36, 37,
                 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52, 58, 57, 59]),
            action_permutation=np.array(
                [-6, 7, -8, 9, 10, 11, -0.001, 1, -2, 3, 4, 5, -12, 13, -14,
                 -19, 20, -21, 22, -15, 16, -17, 18]))

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed + MPI.COMM_WORLD.Get_rank())

    ref_policy_params = None
    '''init_q, init_dq = joblib.load('data/skel_data/init_states.pkl')
    env.env.env.init_qs = init_q
    env.env.env.init_dqs = init_dq'''
    if env.env.env.use_ref_policy:
        ref_policy_params = joblib.load(
            'data/ppo_DartHumanWalker-v1210_energy015_vel65_6s_mirror_up01fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_avg_dcon1_asinput_damping2kneethigh_thigh150knee100_curriculum_1xjoint_shoulder90_dqpen00001/policy_params.pkl')

    gym.logger.setLevel(logging.WARN)
    pposgd_mirror.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=int(2500),
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=callback,
                        sym_loss_weight=0.0,
                        positive_rew_enforce=False,
                        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1333_energy03_vel2_2s_mirror4_velrew3_adaptalivebonus_rew1xinit_thigh160_100springankle_stagedcurriculum_05reduce_07rewthres/policy_params.pkl'),
                        reward_drop_bound=True,
                        ref_policy_params=ref_policy_params,
                        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1124_energy25_vel3_kd1000_mirror_up1fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_runningavg3_dcontrolconstraint1_asinput_damping2_fromvel3_kd500/policy_params.pkl')
                        )
    env.close()
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array(
                [0.0001, -1, 2, -3, -4, 9, 10, 11, 12, 5, 6, 7, 8, 17, 18, 19, 20,
                 13, 14, 15, 16, 21, 22, -23, 24, -25, -26, 31, 32, 33, 34, 27, 28,
                 29, 30, 39, 40, 41, 42, 35, 36, 37, 38, 44, 43, 46, 45, 47]),
            action_permutation=np.array(
                [4, 5, 6, 7, 0.0001, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11]))

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    pposgd_mirror.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=int(2500),
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=callback,
                        sym_loss_weight=4.0,
                        positive_rew_enforce=False,
                        init_policy_params=joblib.load(
                            'data/ppo_DartDogRobot-v1101_energy035_vel7_3s_mirror4_velrew3_ab11_rotqpen1105_08xtorque_12termination_hindleg0314limit_zrotterm12_rew05xinit_stagedcurriculum_075reduce_07rewthres/policy_params.pkl'),
                        reward_drop_bound=True,
                        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1124_energy25_vel3_kd1000_mirror_up1fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_runningavg3_dcontrolconstraint1_asinput_damping2_fromvel3_kd500/policy_params.pkl')
                        )
    env.close()
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array(
                [0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8, -9, -10,
                 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28, 35, -36, -37, 38,
                 39, -40, 29, -30, -31, 32, 33, -34, 42, 41, 43]),
            action_permutation=np.array(
                [-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]))

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    pposgd_mirror.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=int(2500),
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=callback,
                        sym_loss_weight=4.0,
                        positive_rew_enforce=False,
                        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1156_energy1_vel55_mirror_up1fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_runningavg4_dcontrolconstraint1_asinput_damping2kneethigh_thigh250knee60/policy_params.pkl'),
                        reward_drop_bound=True,
                        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1124_energy25_vel3_kd1000_mirror_up1fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_runningavg3_dcontrolconstraint1_asinput_damping2_fromvel3_kd500/policy_params.pkl')
                        )
    env.close()
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartDogRobot-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--init_policy', help='Initial Policy',
                        default='data/ppo_DartDogRobot-v156_energy2_vel8_8s_mirror4_velrew3_dcon1_asinput_damping5_velspd1000_feetcover3off_newstrength13x_dqpen0001_shoulder170_-102range_thigh200_-0.21.4range_lighterhead_2kassist/policy_params.pkl')
    parser.add_argument('--init_curriculum', help='Initial Curriculum', nargs='+', default=[2000.0, 2000])
    parser.add_argument('--ref_policy', help='Reference Policy',
                        default='data/ppo_DartDogRobot-v156_energy2_vel8_8s_mirror4_velrew3_dcon1_asinput_damping5_velspd1000_feetcover3off_newstrength13x_dqpen0001_shoulder170_-102range_thigh200_-0.21.4range_lighterhead_2kassist/policy_params.pkl')
    parser.add_argument('--ref_curriculum', help='Reference Curriculum', nargs='+', default=[2000.0, 2000])
    parser.add_argument('--anc_thres', help='Anchor Threshold', type=float, default=0.85)
    parser.add_argument('--prog_thres', help='Progress Threshold', type=float, default=0.7)
    parser.add_argument('--batch_size', help='Batch Size', type=int, default=2500)
    parser.add_argument('--max_iter', help='Maximum Iteration', type=int, default=2000)
    parser.add_argument('--use_reftraj', help='Use reference trajectory', type=int, default=0)
    args = parser.parse_args()

    logger.reset()
    logger.configure('data/ppo_curriculum_150eachit_vel8_mirror4_runningavg3_2s_torque13x_e1'
                     + args.env + '_' + str(args.seed) + '_' + str(args.anc_thres) + '_'
                     + str(args.prog_thres) + '_' + str(args.batch_size))

    sess = U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    ob_space = env.observation_space
    ac_space = env.action_space

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array(
                [0.0001, -1, 2, -3, -4, 9, 10, 11, 12, 5, 6, 7, 8, 17, 18, 19, 20,
                 13, 14, 15, 16, 21, 22, -23, 24, -25, -26, 31, 32, 33, 34, 27, 28,
                 29, 30, 39, 40, 41, 42, 35, 36, 37, 38, 44, 43, 46, 45, 47]),
            action_permutation=np.array(
                [4, 5, 6, 7, 0.0001, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11]))

    policy = policy_fn('policy', ob_space, ac_space)
    init_curriculum = np.array(args.init_curriculum)
    ref_policy = policy_fn('ref_policy', ob_space, ac_space)
    ref_curriculum = np.array(args.ref_curriculum)

    policy_params = joblib.load(args.init_policy)
    ref_policy_params = joblib.load(args.ref_policy)
    U.initialize()

    cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0].name.find('/')]
    orig_scope = list(policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')]
    ref_scope = list(ref_policy_params.keys())[0][0:list(ref_policy_params.keys())[0].find('/')]
    for i in range(len(policy.get_variables())):
        assign_op = policy.get_variables()[i].assign(
            policy_params[policy.get_variables()[i].name.replace(cur_scope, orig_scope, 1)])
        sess.run(assign_op)
        assign_op = ref_policy.get_variables()[i].assign(
            ref_policy_params[ref_policy.get_variables()[i].name.replace('ref_' + cur_scope, ref_scope, 1)])
        sess.run(assign_op)

    anchor_threshold = args.anc_thres
    progress_threshold = args.prog_thres

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    curriculum_evolution = []

    env.env.env.anchor_kp = ref_curriculum
    ref_score = None
    ref_max_score = None
    reference_trajectory = None
    # if MPI.COMM_WORLD.Get_rank() == 0:
    if args.use_reftraj == 1:
        reference_trajectory = gen_reftraj(env, ref_policy, 299)
        env.env.reference_trajectory = reference_trajectory
    ref_score, ref_max_score = evaluate_policy(env, ref_policy, 24)
    ref_score = MPI.COMM_WORLD.bcast(ref_score, root=0)
    ref_max_score = MPI.COMM_WORLD.bcast(ref_max_score, root=0)
    reference_score = ref_score * progress_threshold
    reference_anchor_score = ref_score * anchor_threshold
    reference_max_score = ref_max_score * 0.9
    env.env.env.anchor_kp = init_curriculum

    reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
    env.env.reference_trajectory = reference_trajectory

    current_curriculum = np.copy(init_curriculum)
    print('reference scores: ', reference_score, reference_anchor_score, reference_max_score)

    #env.env.env.energy_weight *= 0.5
    # env.env.env.final_tv -= 0.5
    previous_params = policy_params
    for iter in range(args.max_iter):
        print('curriculum iter ', iter)
        print('ref score: ', reference_anchor_score)
        opt_pi, final_rew = pposgd_mirror.learn(env, policy_fn,
                                                max_timesteps=args.batch_size * MPI.COMM_WORLD.Get_size() * 150,
                                                timesteps_per_batch=int(args.batch_size),
                                                clip_param=0.2, entcoeff=0.0,
                                                optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                                                gamma=0.99, lam=0.95, schedule='linear',
                                                callback=callback,
                                                sym_loss_weight=4.0,
                                                return_threshold=reference_anchor_score,
                                                init_policy_params=previous_params,
                                                policy_scope='pi' + str(iter),
                                                min_iters=0,
                                                reward_drop_bound=True,
                                                # max_threshold = reference_max_score,
                                                )
        print('one learning iteration done')

        if np.linalg.norm(current_curriculum) >= 0.0001:
            # re-compute reference trajectory
            if MPI.COMM_WORLD.Get_rank() == 0 and args.use_reftraj == 1:
                print('recompute reference traj')
                reference_trajectory = gen_reftraj(env, opt_pi, 299)
            reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
            env.env.reference_trajectory = reference_trajectory

            if final_rew < reference_anchor_score * 0.95:
                print('update reference scores')
                reference_score = reference_score / reference_anchor_score * final_rew
                reference_anchor_score = final_rew

            closest_candidate = None
            # if MPI.COMM_WORLD.Get_rank() == 0:
            directions = [np.array([-1, 0]), np.array([0, -1]),
                          -current_curriculum / np.linalg.norm(current_curriculum)]
            int_d1 = directions[0] + directions[2]
            int_d2 = directions[1] + directions[2]
            directions.append(int_d1 / np.linalg.norm(int_d1))
            directions.append(int_d2 / np.linalg.norm(int_d2))
            # directions = [np.array([0.0, -1.0])]  # only search in one direction
            candidate_next_anchors = []
            for direction in directions:
                found_point, perf = binary_search_curriculum(env, opt_pi, current_curriculum, direction,
                                                             reference_score, reference_max_score, 6)
                print(direction, found_point, perf)
                candidate_next_anchors.append(found_point)
                if closest_candidate is None:
                    closest_candidate = np.copy(found_point)
                elif np.linalg.norm(closest_candidate) > np.linalg.norm(found_point):
                    closest_candidate = np.copy(found_point)
            if np.linalg.norm(closest_candidate) < 0.5:
                closest_candidate = np.array([0, 0])
            if np.abs(closest_candidate[0]) < 0.1:
                closest_candidate[0] = 0.0
            if np.abs(closest_candidate[1]) < 0.1:
                closest_candidate[1] = 0.0
            # closest_candidate = MPI.COMM_WORLD.bcast(closest_candidate, root=0)

            current_curriculum = np.copy(closest_candidate)
            env.env.env.anchor_kp = current_curriculum

        '''print('Update Init Pose Distributions')
        update_init_poses(env, opt_pi)
        if MPI.COMM_WORLD.Get_rank() == 0:
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs],
                        logger.get_dir() + '/init_poses_' + np.array2string(current_curriculum, separator=',') + '.pkl',
                        compress=True)
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs],
                        logger.get_dir() + '/init_poses.pkl', compress=True)'''

        curriculum_evolution.append(current_curriculum)
        print('Current curriculum: ', current_curriculum)
        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        if np.linalg.norm(current_curriculum) < 0.0001:
            if reference_anchor_score < ref_score:
                reference_anchor_score = ref_score
            else:
                break

    env.close()
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array(
                [0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8, 9, 10,
                 -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23,
                 28, 29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36, 37,
                 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52, 58, 57, 59]),
            action_permutation=np.array(
                [-6, 7, -8, 9, 10, 11, -0.001, 1, -2, 3, 4, 5, -12, 13, -14,
                 -19, 20, -21, 22, -15, 16, -17, 18]))

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    #previous_params = joblib.load('')
    #env.env.env.assist_schedule = []

    joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)

    reward_threshold = None
    while True:
        if not last_iter:
            rollout_length_threshold = env.env.env.assist_schedule[2][0] / env.env.env.dt
        else:
            rollout_length_threshold = None
        opt_pi, rew = pposgd_mirror.learn(env, policy_fn,
                                          max_timesteps=num_timesteps,
                                          timesteps_per_batch=int(2500),
                                          clip_param=0.2, entcoeff=0.0,
                                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                                          gamma=0.99, lam=0.95, schedule='linear',
                                          callback=callback,
                                          sym_loss_weight=4.0,
                                          positive_rew_enforce=False,
                                          init_policy_params=previous_params,
                                          reward_drop_bound=True,
                                          rollout_length_thershold=rollout_length_threshold,
                                          policy_scope='pi' + str(iter_num),
                                          return_threshold=reward_threshold,
                                          )
        if iter_num == 0:
            reward_threshold = 0.7 * rew
        if last_iter:
            reward_threshold = None
        iter_num += 1

        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val

        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule) - 1):
            env.env.env.assist_schedule[s][1] = np.copy(env.env.env.assist_schedule[s + 1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule) - 1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')

    env.close()
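# The while loop above anneals the virtual assistance: each outer iteration shifts
# the assist schedule forward, decays its final entry by 0.75, and snaps components
# below 5.0 to zero; once every entry is zero the run enters its last iteration.
# Below is a standalone illustration of that decay with hypothetical starting gains
# of 2000 (the values are assumptions, not taken from a specific run).
assist = [2000.0, 2000.0]
decay_iters = 0
while any(v != 0.0 for v in assist):
    assist = [v * 0.75 for v in assist]
    assist = [0.0 if v < 5.0 else v for v in assist]
    decay_iters += 1
# With these values the gains reach zero after 21 outer iterations (2000 * 0.75**21 < 5).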
def train_mirror(args, num_timesteps):
    from baselines.ppo1 import mlp_mirror_policy, mlp_mirror_norms_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    env.env._seed(args.seed + MPI.COMM_WORLD.Get_rank())
    env.env.init_params(args)

    U.ALREADY_INITIALIZED = set()
    U.ALREADY_INITIALIZED.update(set(tf.global_variables()))

    obs_per = np.array([0.0001, -1, 2, -3, -4,
                        11, 12, 13, 14, 15, 16, 5, 6, 7, 8, 9, 10,
                        -17, 18, -19,
                        24, 25, 26, 27, 20, 21, 22, 23,
                        28, 29, -30, 31, -32, -33,
                        40, 41, 42, 43, 44, 45, 34, 35, 36, 37, 38, 39,
                        -46, 47, -48,
                        53, 54, 55, 56, 49, 50, 51, 52])
    if env.env.include_additional_info:
        obs_per = np.concatenate((obs_per, np.array([58, 57])))
        obs_per = np.concatenate((obs_per, np.array([59])))
        obs_per = np.concatenate((obs_per, np.array([63, 64, -65, 60, 61, -62])))
        obs_per = np.concatenate((obs_per, np.array([66, 67, -68])))
        obs_per = np.concatenate((obs_per, np.array([72, 73, -74, 69, 70, -71])))
        obs_per = np.concatenate((obs_per, np.array([75, 76, -77])))
        obs_per = np.concatenate((obs_per, np.array([78, 79, -80])))
    assert env.env.obs_dim == (57 + 3 + 3 * 6 + 3)
    assert env.env.act_dim == 97

    # change action/state permutation if change action/state in env
    def policy_fn(name, ob_space, ac_space):
        old_act_permute = [-86, 87, -88, 93, 94, 95, 96, 89, 90, 91, 92]
        mus_act_l = np.arange(43)
        mus_act_r = mus_act_l + 43
        mus_act_l[0] = 0.001
        act_permute = np.concatenate([mus_act_r, mus_act_l, old_act_permute])
        if env.env.env.state_self_standardize:
            return mlp_mirror_norms_policy.MlpMirrorNormsPolicy(
                name=name, ob_space=ob_space, ac_space=ac_space,
                hid_size=args.hsize, num_hid_layers=args.layers, gmm_comp=1,
                mirror_loss=True,
                observation_permutation=obs_per,
                action_permutation=act_permute)
        else:
            return mlp_mirror_policy.MlpMirrorPolicy(
                name=name, ob_space=ob_space, ac_space=ac_space,
                hid_size=args.hsize, num_hid_layers=args.layers, gmm_comp=1,
                mirror_loss=True,
                observation_permutation=obs_per,
                action_permutation=act_permute)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)
    with open(logger.get_dir() + '/env_specs.txt', 'w') as f:
        pprint.pprint(env.env.env.__dict__, f)
        f.close()
    shutil.copyfile(env.env.env.model_file_name, logger.get_dir() + '/using_model.skel')

    cur_sym_loss = 3.0
    iter_num = 0
    previous_params = None
    # previous_params = joblib.load('')
    reward_threshold = None
    rollout_length_threshold = None
    pposgd_mirror.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=int(2000),
                        clip_param=args.clip, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=callback,
                        sym_loss_weight=cur_sym_loss,
                        init_policy_params=previous_params,
                        reward_drop_bound=None,
                        rollout_length_threshold=rollout_length_threshold,
                        policy_scope='pi' + str(iter_num),
                        return_threshold=reward_threshold,
                        )
    env.close()
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    env.env.assist_timeout = 100.0

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array(
                [0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8, -9, -10,
                 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28, 35, -36, -37, 38,
                 39, -40, 29, -30, -31, 32, 33, -34, 42, 41, 43]),
            action_permutation=np.array(
                [-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]))

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    '''previous_params = joblib.load('data/ppo_DartWalker3d-v111_energy04_vel1_1s_mirror4_velrew3_damping5_anklesprint100_5_rotpen1_rew01xinit_stagedcurriculum/policy_params.pkl')
    env.env.env.assist_schedule = [[0.0, np.array([250., 125.])], [3.0, np.array([125., 62.5])], [6.0, [62.5, 31.25]]]'''

    joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)

    reward_threshold = None
    while True:
        if not last_iter:
            rollout_length_threshold = env.env.env.assist_schedule[2][0] / env.env.env.dt
        else:
            rollout_length_threshold = None
        opt_pi, rew = pposgd_mirror.learn(env, policy_fn,
                                          max_timesteps=num_timesteps,
                                          timesteps_per_batch=int(2500),
                                          clip_param=0.2, entcoeff=0.0,
                                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                                          gamma=0.99, lam=0.95, schedule='linear',
                                          callback=callback,
                                          sym_loss_weight=4.0,
                                          positive_rew_enforce=False,
                                          init_policy_params=previous_params,
                                          reward_drop_bound=500,
                                          rollout_length_thershold=rollout_length_threshold,
                                          policy_scope='pi' + str(iter_num),
                                          return_threshold=reward_threshold,
                                          )
        if iter_num == 0:
            reward_threshold = 0.7 * rew
        if last_iter:
            reward_threshold = None
        iter_num += 1

        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val

        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule) - 1):
            env.env.env.assist_schedule[s][1] = np.copy(env.env.env.assist_schedule[s + 1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule) - 1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')

    env.close()
def train_mirror_sig(env, num_timesteps, seed, obs_perm, act_perm):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=obs_perm,
            action_permutation=act_perm)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    #previous_params = joblib.load('')
    #env.env.env.assist_schedule = []

    joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)

    reward_threshold = None
    while True:
        if not last_iter:
            rollout_length_threshold = env.env.env.assist_schedule[2][0] / env.env.env.dt
        else:
            rollout_length_threshold = None
            reward_threshold *= 1.2
        opt_pi, rew = pposgd_mirror.learn(env, policy_fn,
                                          max_timesteps=num_timesteps,
                                          timesteps_per_batch=int(2500),
                                          clip_param=0.2, entcoeff=0.0,
                                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                                          gamma=0.99, lam=0.95, schedule='linear',
                                          callback=callback,
                                          sym_loss_weight=4.0,
                                          positive_rew_enforce=False,
                                          init_policy_params=previous_params,
                                          reward_drop_bound=500,
                                          rollout_length_thershold=rollout_length_threshold,
                                          policy_scope='pi' + str(iter_num),
                                          return_threshold=reward_threshold,
                                          )
        if iter_num == 0:
            reward_threshold = 0.7 * rew
        if last_iter:
            break
        iter_num += 1

        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val

        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule) - 1):
            env.env.env.assist_schedule[s][1] = np.copy(env.env.env.assist_schedule[s + 1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule) - 1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')

    env.close()
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=2).__enter__()  # originally 1 CPU -- creates a session with the given number of CPUs
    set_global_seeds(seed)
    env = gym.make(env_id)  # creates the gym env -- here it is walker3d.py

    env.env.assist_timeout = 100.0

    def policy_fn(name, ob_space, ac_space):
        # initializes a randomly weighted MLP mirror policy
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3, gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array(
                [0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8, -9, -10,
                 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28, 35, -36, -37, 38,
                 39, -40, 29, -30, -31, 32, 33, -34, 42, 41, 43]),
            action_permutation=np.array(
                [-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]))

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None  # can hold parameters from a previous run to warm-start training
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    '''previous_params = joblib.load('data/ppo_DartWalker3d-v111_energy04_vel1_1s_mirror4_velrew3_damping5_anklesprint100_5_rotpen1_rew01xinit_stagedcurriculum/policy_params.pkl')
    env.env.env.assist_schedule = [[0.0, np.array([250., 125.])], [3.0, np.array([125., 62.5])], [6.0, [62.5, 31.25]]]'''

    # load parameters from a previous run and choose the starting assist schedule
    previous_params = joblib.load(
        'data/Walker_0_to_1/[[0.0,array([0.,0.])],[3.0,array([0.,0.])],[6.0,[0.0,0.0]]]/policy_params_6400.pkl')
    env.env.env.assist_schedule = [[0.0, np.array([2000, 2000])],
                                   [3.0, np.array([1500, 1500])],
                                   [6.0, [1125.0, 1125.0]]]

    joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)

    reward_threshold = None
    while True:  # the actual training loop
        if not last_iter:
            # if not the last iteration, update the rollout length threshold from the schedule
            rollout_length_threshold = env.env.env.assist_schedule[2][0] / env.env.env.dt
        else:
            rollout_length_threshold = None
        # run PPO to get a new policy and its reward -- the callback saves the policy
        opt_pi, rew = pposgd_mirror.learn(env, policy_fn,
                                          max_timesteps=num_timesteps,
                                          timesteps_per_batch=int(2500),
                                          clip_param=0.2, entcoeff=0.0,
                                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                                          gamma=0.99, lam=0.95, schedule='linear',
                                          callback=callback,
                                          sym_loss_weight=4.0,
                                          positive_rew_enforce=False,
                                          init_policy_params=previous_params,
                                          reward_drop_bound=500,
                                          rollout_length_thershold=rollout_length_threshold,
                                          policy_scope='pi' + str(iter_num),
                                          return_threshold=reward_threshold,
                                          )
        if iter_num == 0:
            reward_threshold = 0.7 * rew  # under a new schedule, the reward to attain is 70% of the old reward
        if last_iter:  # no threshold in the last iteration
            reward_threshold = None
        iter_num += 1

        opt_variable = opt_pi.get_variables()  # the policy's global variables
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()  # current value of the variable
            previous_params[opt_variable[i].name] = cur_val  # dictionary: variable name -> value

        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule) - 1):
            env.env.env.assist_schedule[s][1] = np.copy(env.env.env.assist_schedule[s + 1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        # check whether all schedules are zero -- if so, flag the last iteration and notify the user
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule) - 1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        logger.log("Current Schedule: %s" % env.env.env.assist_schedule)  # keep track of the schedule in the log file
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')
            logger.log("Entering Last Iteration")  # record in the log when the last schedule is reached

    env.close()  # when done, close the environment