def main(env_name, seed, run_num, data_saving_path, batch_size_per_process, num_iterations,
         autoencoder_base="./novelty_data/local/autoencoders/"):
    num_processes = MPI.COMM_WORLD.Get_size()
    num_timesteps_per_process = batch_size_per_process
    num_iterations_enforce = num_iterations

    import baselines.common.tf_util as U
    comm = MPI.COMM_WORLD
    mpi_rank = comm.Get_rank()

    tf.reset_default_graph()
    with U.single_threaded_session() as sess:
        autoencoder_list = []
        for i in range(run_num):
            autoencoder_model = load_model(
                autoencoder_base + env_name + '_autoencoder_seed_' + str(seed) + '_run_' + str(i) + '.h5')
            autoencoder_list.append(autoencoder_model)
        U.ALREADY_INITIALIZED.update(set(tf.global_variables()))

        logger.reset()
        # logger.configure(
        #     '../data/ppo_' + enforce_env_name + '_autoencoder_' + str(len(autoencoder_list)) + '_seed=' + str(
        #         seed) + '/' + str(st))
        logger.configure(data_saving_path)

        model = train(sess, env_name,
                      num_timesteps=num_iterations_enforce * num_processes * num_timesteps_per_process,
                      timesteps_per_actor=num_timesteps_per_process,
                      autoencoders=autoencoder_list, seed=seed)

        # if mpi_rank == 0:
        env = gym.make(env_name)
        env.env.novel_autoencoders = autoencoder_list
        if hasattr(env.env, 'disableViewer'):
            env.env.disableViewer = False
        env = wrappers.Monitor(env, logger.get_dir() + '/results', force=True)

        obs = env.reset()
        step = 0
        while True:
            env.render()
            actions = model._act(False, obs)
            obs, _, done, _ = env.step(actions[0][0])
            env.render()
            if done:
                obs = env.reset()
                print("Visualization is Done")
                break
            step += 1
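# The script above loads pre-trained autoencoders and attaches them to the environment
# as novelty detectors (env.env.novel_autoencoders). How the environment consumes them
# is not shown here; the following is only a minimal sketch, assuming the common recipe
# of rewarding states that none of the autoencoders reconstructs well. The function name
# and the bonus scale `w` are illustrative, not part of the original code.
def novelty_bonus_sketch(obs, autoencoder_list, w=1.0):
    import numpy as np
    if len(autoencoder_list) == 0:
        return 0.0
    x = np.asarray(obs, dtype=np.float32).reshape(1, -1)
    # reconstruction error under each autoencoder; a state counts as novel only if all of them fail
    errors = [float(np.mean((ae.predict(x) - x) ** 2)) for ae in autoencoder_list]
    return w * min(errors)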
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHumanWalkerMD-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--hsize', type=int, default=80)
    parser.add_argument('--layers', type=int, default=2)
    parser.add_argument('--clip', type=float, default=0.2)
    parser.add_argument('--HW_final_tar_v', help='final target velocity', type=float, default=1.7)
    parser.add_argument('--HW_tar_acc_time', help='time to acc to final target velocity', type=float, default=1.1)
    parser.add_argument('--HW_energy_weight', help='energy pen weight', type=float, default=0.5)
    parser.add_argument('--HW_alive_bonus_rew', help='alive bonus weight', type=float, default=7.0)
    parser.add_argument('--HW_vel_reward_weight', help='velocity pen weight', type=float, default=9.0)
    parser.add_argument('--HW_side_devia_weight', help='side deviation pen weight', type=float, default=1.5)
    parser.add_argument('--HW_jl_pen_weight', help='joint limit pen weight', type=float, default=0.7)
    parser.add_argument('--HW_alive_pen', help='alive pen weight', type=float, default=0.0)
    args = parser.parse_args()

    logger.reset()
    import datetime
    now = datetime.datetime.now()
    stampstring = now.isoformat()
    logdir = 'data/wtoe_MD_20080_ppo_noAssist_adds' + stampstring[:16] + args.env + '_' + str(
        args.seed) + '_' + str(args.hsize) + '-' + str(args.layers) + '_' + str(args.clip)
    for arg in vars(args):
        if arg[:3] == 'HW_':
            logdir += arg[2:6]
            logdir += '_'
            logdir += arg[-3:]
            logdir += str(getattr(args, arg))
    logger.configure(logdir)

    train_mirror(args, num_timesteps=int(2000 * 8 * 1600))
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHexapod-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) +
                     '_energy03_vel15_15s_mirror4_velrew3_rew01xinit_thigh200_100springankle_stagedcurriculum')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartWalker3d-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) +
                     '_energy03_vel5_3s_mirror0_velrew3_asinput_damping5_ab7_torque1x_anklesprint100_5_rotpen01_rew01xinit')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 2500), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHexapod-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) + '_energy005_vel8_mirror_velrew3_asinput')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHopper-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) + '_using_disc_ref_policy_iter_2')
    train(args.env, num_timesteps=int(5000 * 40), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) + '_vf_vanilla_weak_2k')
    # logger.configure('data/ppo_' + args.env + str(args.seed) + '_energy05_bal_vel4smooth_mirror_up1fwd01ltl1_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_dcontrolconstraint1_strongerarm_asinput_treadmill')
    train(args.env, num_timesteps=int(500 * 4 * 100), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy03_vel15_1s_mirror4_velrew3_ab6_norotpen_dofpen11508_rew05xinit_thigh160_50springankle_1p2term_stagedcurriculum_075reduce_07rewthres_2kassist')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartDogRobot-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy02_vel2_2s_mirror4_velrew3_ab4_norotpen_rew01xinit_stagedcurriculum_075reduce_07rewthres')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartWalker3d-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy04_vel1_1s_mirror4_velrew3_ab4_anklesprint100_5_rotpen0_rew05xinit_stagedcurriculum4s75s34ratio')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartWalker3dSPD-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy003_vel15_mirror4_velrew3_spd1k300_kd001_nocurriculum_frameskip5')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 1500), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy03_vel15_1s_mirror0_up03fwd03ltl15_spinepen1yaw001_ab3_thighyawpen005_velrewavg3_2s_dcon1_damping2kneethigh_thigh160knee100waist150_shoulder100_velrew15xinit_baseline')
    # logger.configure('data/ppo_' + args.env + str(args.seed) + '_energy05_bal_vel4smooth_mirror_up1fwd01ltl1_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_dcontrolconstraint1_strongerarm_asinput_treadmill')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 2500), seed=args.seed)
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
    parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant')
    parser.add_argument('--million_frames', help='How many frames to train (/ 1e6). '
                        'This number gets divided by 4 due to frameskip', type=int, default=200)
    parser.add_argument('--logdir', help='Log directory', type=str, default="logs")
    parser.add_argument('--note', help='A short note to add to the log file', type=str, default="")
    parser.add_argument('--model_path', help='Path to pre-trained model', type=str, default="")
    parser.add_argument('--num_cpus', help='Number of CPUs (i.e. number of parallel environments)', type=int, default=16)
    parser.add_argument('--nsteps', help='Number of steps for each rollout', type=int, default=1)
    parser.add_argument('--lr', help='Learning rate', type=float, default=1.5e-3)
    parser.add_argument('--pg_coef', help='Coefficient for policy gradient loss', type=float, default=0.1)
    parser.add_argument('--ent_coef', help='Coefficient for policy entropy loss', type=float, default=0.001)
    parser.add_argument('--vf_coef', help='Coefficient for value function loss', type=float, default=0.5)
    args = parser.parse_args()

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")
    logdir = os.path.join(args.logdir, args.env, timestamp)
    logger.reset()
    logger.configure(logdir)
    logger.log("")
    for arg in sorted(vars(args)):
        logger.log("{}: {}".format(arg, getattr(args, arg)))
    logger.log("")

    train(args.env, num_frames=1e6 * args.million_frames, seed=args.seed,
          nsteps=args.nsteps, policy=args.policy, lrschedule=args.lrschedule,
          num_cpu=args.num_cpus, model_path=args.model_path, lr=args.lr,
          pg_coef=args.pg_coef, ent_coef=args.ent_coef, vf_coef=args.vf_coef)
def main():
    path = 'data/value_iter_cartpole_discrete_v4'
    logger.reset()
    logger.configure(path)

    env = gym.make('DartCartPoleSwingUp-v1')
    env.seed(0)

    # obs_disc = bin_disc([[50, 0, -0.01], [50, 0.0, -0.01]])
    # act_disc = bin_disc([[10, 1.01, -1.01]])
    obs_disc = bin_disc([[50, 0, -0.01], [50, 0.0, -0.01], [50, 0.0, -0.01], [50, 0.0, -0.01]])
    act_disc = bin_disc([[50, 1.01, -1.01]])
    '''s_disc = []
    for i in range(11):
        s_disc.append([30, 0.0, -0.0])
    obs_disc = bin_disc(s_disc)
    act_disc = bin_disc([[10, 1.01, -1.01], [10, 1.01, -1.01], [10, 1.01, -1.01]])
    #obs_disc = bin_disc([[5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1]])
    #act_disc = bin_disc([[4, 1.0, -1.0], [4, 1.0, -1.0], [4, 1.0, -1.0], [4, 1.0, -1.0], [4, 1.0, -1.0]])'''

    obs_disc_dim = 1
    act_disc_dim = 1
    for s in obs_disc.disc_scheme:
        obs_disc_dim *= s[0]
    for s in act_disc.disc_scheme:
        act_disc_dim *= s[0]

    state_filter_fn = state_filter_cartpole
    state_unfilter_fn = state_unfilter_cartpole

    policy = None
    '''sess = tf.InteractiveSession()
    policy_params = joblib.load('data/ppo_DartCartPoleSwingUp-v11_vanilla/policy_params.pkl')
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = policy_fn("pi", ob_space, ac_space)
    U.initialize()
    cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0].name.find('/')]
    orig_scope = list(policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')]
    vars = policy.get_variables()
    for i in range(len(policy.get_variables())):
        assign_op = policy.get_variables()[i].assign(
            policy_params[policy.get_variables()[i].name.replace(cur_scope, orig_scope, 1)])
        sess.run(assign_op)
    env.env.use_disc_ref_policy = None'''

    dyn_model, col_data, obs_disc = learn_model(env, obs_disc, obs_disc_dim, act_disc, act_disc_dim,
                                                state_filter_fn, state_unfilter_fn,
                                                policy=policy, disc_policy=False)
    Vfunc, policy = optimize_policy(dyn_model, 0.99)

    for iter in range(50):
        print('--------------- Iteration ', str(iter), ' -------------------')
        dyn_model, col_data, obs_disc = learn_model(env, obs_disc, obs_disc_dim, act_disc, act_disc_dim,
                                                    state_filter_fn, state_unfilter_fn,
                                                    policy=policy, collected_data=col_data)
        Vfunc, policy = optimize_policy(dyn_model, 0.99, Vfunc=Vfunc)

        joblib.dump(dyn_model, path + '/dyn_model_' + str(iter) + '.pkl', compress=True)
        joblib.dump(policy, path + '/policy_' + str(iter) + '.pkl', compress=True)
        joblib.dump([Vfunc, obs_disc, act_disc, state_filter_fn, state_unfilter_fn],
                    path + '/ref_policy_funcs_' + str(iter) + '.pkl', compress=True)
        joblib.dump(dyn_model, path + '/dyn_model.pkl', compress=True)
        joblib.dump(policy, path + '/policy.pkl', compress=True)
        joblib.dump([Vfunc, obs_disc, act_disc, state_filter_fn, state_unfilter_fn],
                    path + '/ref_policy_funcs.pkl', compress=True)

    joblib.dump(dyn_model, path + '/dyn_model.pkl', compress=True)
    joblib.dump(policy, path + '/policy.pkl', compress=True)
    joblib.dump([Vfunc, obs_disc, act_disc, state_filter_fn, state_unfilter_fn],
                path + '/ref_policy_funcs.pkl', compress=True)
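# optimize_policy above is defined elsewhere; this is a minimal sketch of the tabular
# value iteration it presumably performs, under the assumption that the learned model is
# a table model[s][a] = (next_state, reward) over discretized state/action indices.
# All names and the model layout here are illustrative, not the actual interface.
def value_iteration_sketch(model, num_states, gamma=0.99, iters=500):
    import numpy as np
    V = np.zeros(num_states)
    for _ in range(iters):
        for s, transitions in model.items():
            # deterministic Bellman backup over the discretized transitions
            V[s] = max(r + gamma * V[sp] for (sp, r) in transitions.values())
    # greedy policy with respect to the converged value table
    policy = {s: max(transitions, key=lambda a: transitions[a][1] + gamma * V[transitions[a][0]])
              for s, transitions in model.items()}
    return V, policy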
def learn(env, policy_func, *,
          timesteps_per_actorbatch,  # timesteps per actor per update
          clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          gamma, lam,  # advantage estimation
          max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
          callback=None,  # you can do anything in the callback, since it takes locals(), globals()
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          load_saved_model_dir=None
          ):
    # logger setup
    logger.reset()
    log_dir = os.path.join(str(Path.home()), "Desktop", "Darksouls" + "ppo",
                           datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    logger.configure(log_dir, ["tensorboard", "stdout"])

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = spaces.Discrete(9)
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.compat.v1.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = - U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.compat.v1.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    if load_saved_model_dir is not None:
        U.load_state(load_saved_model_dir + "/saved_model")
        print("Loaded saved model at: ", load_saved_model_dir)
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, \
        "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        print("Training")
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("LastEpRew", rews[-1])
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
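# For reference, the clipped surrogate that lossandgrad above differentiates, written out
# with NumPy on a batch of probability ratios and advantages; a standalone sketch only,
# not part of the TensorFlow graph built in learn().
def ppo_clip_objective(ratio, adv, clip_param):
    # L^CLIP = E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)]
    import numpy as np
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return np.mean(np.minimum(surr1, surr2))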
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='SunblazeCartPoleRandomNormal-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    parser.add_argument('--processes', default=1, help='int or "max" for all')

    # EPOpt specific
    parser.add_argument('--epsilon', type=float, default=1.0)
    # EPOpt paper kept epsilon=1 until iters>100 (max 200 iters)
    parser.add_argument('--activate', type=int, default=100,
                        help='How long to fix epsilon to 1.0 before e')
    parser.add_argument('--paths', type=int, default=100,
                        help='number of trajectories to sample from each iteration')
    parser.add_argument('--algorithm', type=str, choices=['ppo2', 'a2c'], default='ppo2',
                        help='Inner batch policy optimization algorithm')
    parser.add_argument('--policy', choices=['mlp', 'lstm'], default='mlp', help='Policy architecture')

    # Episode-modification specific:
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--total-episodes', type=int, default=5e4)

    # RL algorithm hyperparameters
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--nsteps', type=int, default=2048)
    parser.add_argument('--ent-coef', type=float, default=1e-2, help='Only relevant for A2C')
    parser.add_argument('--nminibatches', type=int, default=32, help='Only relevant for PPO2')
    args = parser.parse_args()

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin':
            ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train_epopt(
        args.env,
        total_episodes=args.total_episodes,
        seed=seed,
        lr=args.lr,
        epsilon=args.epsilon,
        activate_at=args.activate,
        paths=args.paths,
        algorithm=args.algorithm,
        policy=args.policy,
        ncpu=ncpu,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        ent_coef=args.ent_coef,  # default 0.01 in baselines, 0.0001 in chainer A3C
    )
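# train_epopt is defined elsewhere; the core EPOpt rule that the --epsilon/--activate/--paths
# flags above parameterize (optimize only the worst epsilon-fraction of sampled paths once the
# activation iteration is reached) can be sketched as follows. The path format and function
# name are assumptions for illustration.
def select_epopt_paths(paths, epsilon, iteration, activate_at):
    import numpy as np
    if iteration < activate_at or epsilon >= 1.0:
        return paths  # plain batch optimization until EPOpt is activated
    returns = np.array([sum(p["rewards"]) for p in paths])
    cutoff = np.percentile(returns, 100.0 * epsilon)
    # keep the lowest-return (worst-case) paths for the CVaR-style update
    return [p for p, r in zip(paths, returns) if r <= cutoff]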
from baselines.siggraph_script.training_utils import *
import gym
from baselines import logger
import numpy as np

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()

    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) + '_run')

    env = gym.make(args.env)
    env.env.assist_timeout = 100.0
    env.env.target_vel = 5.0
    env.env.init_tv = 0.0
    env.env.final_tv = 5.0
    env.env.tv_endtime = 3.0
    env.env.energy_weight = 0.15
    env.env.alive_bonus_rew = 9.0

    train_mirror_sig(env, num_timesteps=int(5000000), seed=args.seed,
                     obs_perm=np.array(
                         [0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8, 9, 10, -17, 18, -19,
                          -24, 25, -26, 27, -20, 21, -22, 23,
                          28, 29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36, 37, 38, 39, -46, 47, -48,
                          -53, 54, -55, 56, -49, 50, -51, 52, 58,
def main():
    # enforce_env_name = 'SimplerPathFinding-v0'
    enforce_env_name = 'DartEel-v0'
    num_processes = MPI.COMM_WORLD.Get_size()
    num_timesteps_per_process = 1000
    num_iterations_enforce = 1000
    num_iterations_release = 100
    release_env_name = 'SimplerPathFinding-v1'

    # for i in range(1):
    i = 0
    seed = i * 13 + 7 * (i ** 2)
    # seed = 128

    import baselines.common.tf_util as U
    import datetime
    import time
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')

    tf.reset_default_graph()
    with U.single_threaded_session() as sess:
        logger.reset()
        logger.configure('../data/ppo_' + enforce_env_name + '_baseline_seed=' + str(seed) + '/' + str(st))

        model = train(sess, enforce_env_name,
                      num_timesteps=num_iterations_enforce * num_processes * num_timesteps_per_process,
                      timesteps_per_actor=num_timesteps_per_process, seed=seed)

        # logger.reset()
        # logger.configure('data/ppo_PathFinding-v0_release/baseline_seed=' + str(seed))
        # with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        #     model = train(sess, release_env_name,
        #                   num_timesteps=num_iterations_release * num_processes * num_timesteps_per_process,
        #                   timesteps_per_actor=num_timesteps_per_process, seed=seed)

        comm = MPI.COMM_WORLD
        mpi_rank = comm.Get_rank()
        if mpi_rank == 0:
            env = gym.make(enforce_env_name)
            env = wrappers.Monitor(env, logger.get_dir() + '/results', force=True)
            obs = env.reset()
            step = 0
            while True:
                env.render()
                actions = model._act(True, obs)
                obs, _, done, _ = env.step(actions[0][0])
                env.render()
                if done:
                    obs = env.reset()
                    print("Visualization is Done")
                    break
                step += 1
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='SunblazeBreakout-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    parser.add_argument('--processes', default=1, help='int or "max" for all')
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--total-episodes', type=int, default=int(5e4))
    parser.add_argument('--policy', help='Policy architecture', choices=['mlp', 'lstm'], default='mlp')

    # Hyperparameters
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--nsteps', type=int, default=2048)
    parser.add_argument('--nminibatches', type=int, default=32)
    args = parser.parse_args()

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin':
            ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train(
        args.env,
        total_episodes=args.total_episodes,
        seed=seed,
        ncpu=ncpu,
        policy=args.policy,
        lr=args.lr,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
    )
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHopperPT-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--name', help='name of experiments', type=str, default="")
    parser.add_argument('--max_step', help='maximum step size', type=int, default=1000000)
    parser.add_argument('--batch_size', help='batch size', type=int, default=4000)
    parser.add_argument('--clip', help='clip', type=float, default=0.2)
    parser.add_argument('--schedule', help='schedule', default='constant')
    parser.add_argument('--train_up', help='whether train up', default='True')
    parser.add_argument('--dyn_params', action='append', type=int)
    parser.add_argument('--output_interval', help='interval of outputting policies', type=int, default=10)
    parser.add_argument('--mirror',
                        help='whether to use mirror, (0: not mirror, 1: hard mirror, 2: soft mirror)',
                        type=int, default=0)
    parser.add_argument('--warmstart', help='path to warmstart policies', type=str, default="")
    args = parser.parse_args()

    global output_interval
    output_interval = args.output_interval

    logger.reset()
    config_name = 'data/ppo_' + args.env + str(args.seed) + '_' + args.name
    if args.mirror == 1:
        config_name += '_mirror'
    elif args.mirror == 2:
        config_name += '_softmirror'
    if len(args.warmstart) > 0:
        config_name += '_warmstart'
    if args.train_up == 'True':
        config_name += '_UP'
    logger.configure(config_name, ['json', 'stdout'])

    train(args.env, num_timesteps=int(args.max_step), seed=args.seed,
          batch_size=args.batch_size, clip=args.clip, schedule=args.schedule,
          mirror=args.mirror, warmstart=args.warmstart,
          train_up=args.train_up == 'True', dyn_params=args.dyn_params)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='SunblazeCartPole-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    # parser.add_argument('--episodes-per-trial', type=int, default=5)
    # parser.add_argument('--trials', type=int, default=10 ** 4)
    # The total number of episodes is now trials*episodes_per_trial
    parser.add_argument('--total-episodes', type=int, default=5e4)
    parser.add_argument('--policy', help='Policy architecture', choices=['mlp', 'lstm'], default='mlp')
    parser.add_argument('--processes', default=1, help='int or "max" for all')
    parser.add_argument('--reward-scale', type=float, default=1.0)

    # Hyperparameters
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--nsteps', type=int, default=5)
    parser.add_argument('--ent-coef', type=float, default=1e-2)
    args = parser.parse_args()
    # total_episodes = args.trials * args.episodes_per_trial

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin':
            ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train(
        args.env,
        total_episodes=args.total_episodes,
        policy=args.policy,
        lr=args.lr,
        num_processes=ncpu,
        rew_scale=args.reward_scale,
        seed=seed,
        nsteps=args.nsteps,
        ent_coef=args.ent_coef,  # default 0.01 in baselines, 0.0001 in chainer A3C
    )
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartDogRobot-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--init_policy', help='Initial Policy',
                        default='data/ppo_DartDogRobot-v156_energy2_vel8_8s_mirror4_velrew3_dcon1_asinput_damping5_velspd1000_feetcover3off_newstrength13x_dqpen0001_shoulder170_-102range_thigh200_-0.21.4range_lighterhead_2kassist/policy_params.pkl')
    parser.add_argument('--init_curriculum', help='Initial Curriculum', nargs='+', default=[2000.0, 2000])
    parser.add_argument('--ref_policy', help='Reference Policy',
                        default='data/ppo_DartDogRobot-v156_energy2_vel8_8s_mirror4_velrew3_dcon1_asinput_damping5_velspd1000_feetcover3off_newstrength13x_dqpen0001_shoulder170_-102range_thigh200_-0.21.4range_lighterhead_2kassist/policy_params.pkl')
    parser.add_argument('--ref_curriculum', help='Reference Curriculum', nargs='+', default=[2000.0, 2000])
    parser.add_argument('--anc_thres', help='Anchor Threshold', type=float, default=0.85)
    parser.add_argument('--prog_thres', help='Progress Threshold', type=float, default=0.7)
    parser.add_argument('--batch_size', help='Batch Size', type=int, default=2500)
    parser.add_argument('--max_iter', help='Maximum Iteration', type=int, default=2000)
    parser.add_argument('--use_reftraj', help='Use reference trajectory', type=int, default=0)
    args = parser.parse_args()

    logger.reset()
    logger.configure(
        'data/ppo_curriculum_150eachit_vel8_mirror4_runningavg3_2s_torque13x_e1' + args.env + '_' + str(
            args.seed) + '_' + str(args.anc_thres) + '_' + str(args.prog_thres) + '_' + str(args.batch_size))

    sess = U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    env = gym.make(args.env)
    ob_space = env.observation_space
    ac_space = env.action_space

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                                 hid_size=64, num_hid_layers=3, gmm_comp=1,
                                                 mirror_loss=True,
                                                 observation_permutation=np.array(
                                                     [0.0001, -1, 2, -3, -4, 9, 10, 11, 12, 5, 6, 7, 8,
                                                      17, 18, 19, 20, 13, 14, 15, 16, 21, 22, -23, 24,
                                                      -25, -26, 31, 32, 33, 34, 27, 28, 29, 30, 39, 40,
                                                      41, 42, 35, 36, 37, 38, 44, 43, 46, 45, 47]),
                                                 action_permutation=np.array(
                                                     [4, 5, 6, 7, 0.0001, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11]))

    policy = policy_fn('policy', ob_space, ac_space)
    init_curriculum = np.array(args.init_curriculum)
    ref_policy = policy_fn('ref_policy', ob_space, ac_space)
    ref_curriculum = np.array(args.ref_curriculum)

    policy_params = joblib.load(args.init_policy)
    ref_policy_params = joblib.load(args.ref_policy)
    U.initialize()
    cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0].name.find('/')]
    orig_scope = list(policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')]
    ref_scope = list(ref_policy_params.keys())[0][0:list(ref_policy_params.keys())[0].find('/')]
    for i in range(len(policy.get_variables())):
        assign_op = policy.get_variables()[i].assign(
            policy_params[policy.get_variables()[i].name.replace(cur_scope, orig_scope, 1)])
        sess.run(assign_op)
        assign_op = ref_policy.get_variables()[i].assign(
            ref_policy_params[ref_policy.get_variables()[i].name.replace('ref_' + cur_scope, ref_scope, 1)])
        sess.run(assign_op)

    anchor_threshold = args.anc_thres
    progress_threshold = args.prog_thres

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    curriculum_evolution = []

    env.env.env.anchor_kp = ref_curriculum
    ref_score = None
    ref_max_score = None
    reference_trajectory = None
    # if MPI.COMM_WORLD.Get_rank() == 0:
    if args.use_reftraj == 1:
        reference_trajectory = gen_reftraj(env, ref_policy, 299)
        env.env.reference_trajectory = reference_trajectory
    ref_score, ref_max_score = evaluate_policy(env, ref_policy, 24)
    ref_score = MPI.COMM_WORLD.bcast(ref_score, root=0)
    ref_max_score = MPI.COMM_WORLD.bcast(ref_max_score, root=0)
    reference_score = ref_score * progress_threshold
    reference_anchor_score = ref_score * anchor_threshold
    reference_max_score = ref_max_score * 0.9
    env.env.env.anchor_kp = init_curriculum

    reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
    env.env.reference_trajectory = reference_trajectory

    current_curriculum = np.copy(init_curriculum)
    print('reference scores: ', reference_score, reference_anchor_score, reference_max_score)

    # env.env.env.energy_weight *= 0.5
    # env.env.env.final_tv -= 0.5

    previous_params = policy_params
    for iter in range(args.max_iter):
        print('curriculum iter ', iter)
        print('ref score: ', reference_anchor_score)
        opt_pi, final_rew = pposgd_mirror.learn(env, policy_fn,
                                                max_timesteps=args.batch_size * MPI.COMM_WORLD.Get_size() * 150,
                                                timesteps_per_batch=int(args.batch_size),
                                                clip_param=0.2, entcoeff=0.0,
                                                optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                                                gamma=0.99, lam=0.95, schedule='linear',
                                                callback=callback,
                                                sym_loss_weight=4.0,
                                                return_threshold=reference_anchor_score,
                                                init_policy_params=previous_params,
                                                policy_scope='pi' + str(iter),
                                                min_iters=0,
                                                reward_drop_bound=True,
                                                # max_threshold=reference_max_score,
                                                )
        print('one learning iteration done')

        if np.linalg.norm(current_curriculum) >= 0.0001:
            # re-compute reference trajectory
            if MPI.COMM_WORLD.Get_rank() == 0 and args.use_reftraj == 1:
                print('recompute reference traj')
                reference_trajectory = gen_reftraj(env, opt_pi, 299)
            reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
            env.env.reference_trajectory = reference_trajectory

            if final_rew < reference_anchor_score * 0.95:
                print('update reference scores')
                reference_score = reference_score / reference_anchor_score * final_rew
                reference_anchor_score = final_rew

            closest_candidate = None
            # if MPI.COMM_WORLD.Get_rank() == 0:
            directions = [np.array([-1, 0]), np.array([0, -1]),
                          -current_curriculum / np.linalg.norm(current_curriculum)]
            int_d1 = directions[0] + directions[2]
            int_d2 = directions[1] + directions[2]
            directions.append(int_d1 / np.linalg.norm(int_d1))
            directions.append(int_d2 / np.linalg.norm(int_d2))
            # directions = [np.array([0.0, -1.0])]  # only search in one direction
            candidate_next_anchors = []
            for direction in directions:
                found_point, perf = binary_search_curriculum(env, opt_pi, current_curriculum, direction,
                                                             reference_score, reference_max_score, 6)
                print(direction, found_point, perf)
                candidate_next_anchors.append(found_point)
                if closest_candidate is None:
                    closest_candidate = np.copy(found_point)
                elif np.linalg.norm(closest_candidate) > np.linalg.norm(found_point):
                    closest_candidate = np.copy(found_point)
            if np.linalg.norm(closest_candidate) < 0.5:
                closest_candidate = np.array([0, 0])
            if np.abs(closest_candidate[0]) < 0.1:
                closest_candidate[0] = 0.0
            if np.abs(closest_candidate[1]) < 0.1:
                closest_candidate[1] = 0.0
            # closest_candidate = MPI.COMM_WORLD.bcast(closest_candidate, root=0)

            current_curriculum = np.copy(closest_candidate)
            env.env.env.anchor_kp = current_curriculum

        '''print('Update Init Pose Distributions')
        update_init_poses(env, opt_pi)
        if MPI.COMM_WORLD.Get_rank() == 0:
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs],
                        logger.get_dir() + '/init_poses_' + np.array2string(current_curriculum, separator=',') + '.pkl',
                        compress=True)
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs],
                        logger.get_dir() + '/init_poses.pkl', compress=True)'''

        curriculum_evolution.append(current_curriculum)
        print('Current curriculum: ', current_curriculum)
        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val

        if np.linalg.norm(current_curriculum) < 0.0001:
            if reference_anchor_score < ref_score:
                reference_anchor_score = ref_score
            else:
                break

    env.close()
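# binary_search_curriculum used above is imported elsewhere; this is a minimal sketch of what
# the call appears to do: starting from the current assistance anchor, bisect on a step size
# along a search direction and return the farthest point at which the policy still clears the
# reference score. evaluate_policy, the argument order, and max_step are assumptions for
# illustration, not the original signature.
def binary_search_curriculum_sketch(env, policy, anchor, direction, threshold, depth=6, max_step=1.0):
    import numpy as np
    lo, hi = 0.0, max_step
    best, best_perf = np.copy(anchor), None
    for _ in range(depth):
        mid = 0.5 * (lo + hi)
        candidate = anchor + mid * direction
        env.env.env.anchor_kp = candidate
        perf, _ = evaluate_policy(env, policy, 4)  # average return over a few rollouts
        if perf >= threshold:
            best, best_perf, lo = np.copy(candidate), perf, mid
        else:
            hi = mid
    return best, best_perf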
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHopper-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--name', help='name of experiments', type=str, default="")
    parser.add_argument('--max_iter', help='maximum iteration number', type=int, default=1000)
    parser.add_argument('--inner_iter', help='inner iteration number', type=int, default=30)
    parser.add_argument('--output_interval', help='interval of outputting policies', type=int, default=100)
    parser.add_argument('--warmstart', help='warmstart of experiments', type=str, default="")
    parser.add_argument('--skilldim', help='dimension of latent variable', type=int, default=2)
    parser.add_argument('--task_number', help='number of tasks to sample per iteration', type=int, default=5)
    parser.add_argument('--mirror', help='use mirror policy', default="False")
    parser.add_argument('--dyn_params', action='append', type=int)
    args = parser.parse_args()

    global config_name, output_interval
    output_interval = args.output_interval

    logger.reset()
    config_name = 'data/mso_ars_' + args.env + str(args.seed) + '_' + args.name
    config_name += '_skilldim' + str(args.skilldim)
    config_name += '_maxiter' + str(args.max_iter)
    config_name += '_tasknum' + str(args.task_number)
    config_name += '_inneriter' + str(args.inner_iter)
    if len(args.warmstart) > 0:
        config_name += '_warmstart'
    if args.mirror == 'True':
        config_name += '_mirror'
    logger.configure(config_name, ['json', 'stdout'])

    train(args.env, skilldim=args.skilldim, max_iter=int(args.max_iter), inner_iter=int(args.inner_iter),
          seed=args.seed, tasknum=int(args.task_number), warmstart=args.warmstart,
          mirror=args.mirror == 'True', dyn_params=args.dyn_params)