def main():
    """Entry point: build the CLI, configure MPI-aware logging, and train."""
    parser = arg_parser()
    add_env_params(parser)

    # Declarative (option, kwargs) table keeps the flag list compact;
    # registration order only affects --help output.
    flag_specs = [
        ('--num-timesteps', dict(type=int, default=int(1e12))),
        ('--num_env', dict(type=int, default=32)),
        ('--use_news', dict(type=int, default=0)),
        ('--gamma', dict(type=float, default=0.99)),
        ('--gamma_ext', dict(type=float, default=0.99)),
        ('--lam', dict(type=float, default=0.95)),
        ('--update_ob_stats_every_step', dict(type=int, default=0)),
        ('--update_ob_stats_independently_per_gpu', dict(type=int, default=0)),
        ('--update_ob_stats_from_random_agent', dict(type=int, default=1)),
        ('--proportion_of_exp_used_for_predictor_update', dict(type=float, default=1.)),
        ('--tag', dict(type=str, default='')),
        ('--policy', dict(type=str, default='cnn', choices=['cnn', 'rnn'])),
        ('--int_coeff', dict(type=float, default=1.)),
        ('--ext_coeff', dict(type=float, default=2.)),
        ('--dynamics_bonus', dict(type=int, default=0)),
    ]
    for option, kwargs in flag_specs:
        parser.add_argument(option, **kwargs)
    args = parser.parse_args()

    is_root = MPI.COMM_WORLD.Get_rank() == 0
    # Only rank 0 emits human-readable log streams; other ranks stay silent.
    logger.configure(dir=logger.get_dir(),
                     format_strs=['stdout', 'log', 'csv'] if is_root else [])
    if is_root:
        tag_path = os.path.join(logger.get_dir(), 'experiment_tag.txt')
        with open(tag_path, 'w') as f:
            f.write(args.tag)

    mpi_util.setup_mpi_gpus()
    # Offset the base seed per MPI rank so workers de-correlate.
    rank_seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(rank_seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
    )
    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=rank_seed,
          num_timesteps=args.num_timesteps, hps=hps)
def main():
    """CLI entry point: parse flags, set up logging and seeding, and train.

    The extrinsic discount ``gamma_ext`` is derived from the policy type
    rather than exposed as a flag; logs go to a timestamped subdirectory
    under ./results/.
    """
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--max_episode_steps', type=int, default=4500)
    parser.add_argument('--num-timesteps', type=int, default=int(1e8))
    parser.add_argument('--num_env', type=int, default=128)
    # int-typed flags below are used as 0/1 booleans.
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0.)
    # Bottleneck/exploration-specific knobs.
    parser.add_argument('--beta', type=float, default=1e-3)
    parser.add_argument('--exploration_type', type=str, default='bottleneck')
    parser.add_argument('--noise_type', type=str, default='none', choices=['none', 'box'])
    parser.add_argument('--noise_p', type=float, default=0.1)
    parser.add_argument('--use_sched', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='none')
    args = parser.parse_args()
    # RNN policies get a longer extrinsic horizon than CNN policies.
    if args.policy == 'rnn':
        args.gamma_ext = 0.999
    else:
        args.gamma_ext = 0.99
    # e.g. ./results/Breakout-03-15-12-00-00
    logger_dir = './results/' + args.env.replace("NoFrameskip-v4", "")
    logger_dir += datetime.datetime.now().strftime("-%m-%d-%H-%M-%S")
    # Only rank 0 writes human-readable log streams; other ranks stay silent.
    logger.configure(dir=logger_dir,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
    # Distinct seed per MPI rank so parallel workers de-correlate.
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)
    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        exploration_type=args.exploration_type,
        beta=args.beta,
        noise_type=args.noise_type,
        noise_p=args.noise_p,
        use_sched=args.use_sched,
        exp_name=args.exp_name,
    )
    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
def common_arg_parser():
    """Build the shared argparse.ArgumentParser used by the run scripts.

    Combines generic launcher flags (env/alg/seed), multi-agent grid and
    island environment knobs, and RND-style intrinsic-reward options.

    Returns:
        argparse.ArgumentParser: parser with all common options registered.
    """
    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    parser.add_argument('--env_type',
                        help='type of environment, used when the environment type cannot be automatically determined',
                        type=str)
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2')
    # Fixed: a stray trailing comma here used to wrap this statement in a
    # pointless one-element tuple expression.
    parser.add_argument('--num_timesteps', type=float, default=1e6)
    parser.add_argument('--network',
                        help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)',
                        default=None)
    parser.add_argument('--gamestate',
                        help='game state to load (so far only used in retro games)',
                        default=None)
    parser.add_argument('--num_env',
                        help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco',
                        default=1, type=int)
    parser.add_argument('--reward_scale', help='Reward scale factor. Default: 1.0',
                        default=1.0, type=float)
    parser.add_argument('--save_path', help='Path to save trained model to',
                        default='../../results/PPO/try_1/Random_start/', type=str)
    parser.add_argument('--save_video_interval',
                        help='Save video every x steps (0 = disabled)',
                        default=0, type=int)
    parser.add_argument('--save_video_length',
                        help='Length of recorded video. Default: 200',
                        default=200, type=int)
    parser.add_argument('--play', default=False, action='store_true')
    parser.add_argument('--nsteps', default=2048, type=int)
    # Multi-agent grid-world options.
    parser.add_argument('--size', default=30, type=int)
    parser.add_argument('--n_action', default=4, type=int)
    parser.add_argument('--n_agent', default=2, type=int)
    parser.add_argument('--episode_length', default=300, type=int)
    parser.add_argument('--doi', default=8, type=int, help='door_open_interval')
    parser.add_argument('--penalty', default=0.0, type=float)
    parser.add_argument('--gamma_dec', default=0.0, type=float)
    parser.add_argument('--gamma_cen', default=0.0, type=float)
    parser.add_argument('--fix_start', default=False, action='store_true')
    parser.add_argument('--gamma_coor_r', default=0.0, type=float)
    parser.add_argument('--gamma_coor_t', default=0.0, type=float)
    parser.add_argument('--gamma_coor_tv', default=0.0, type=float)
    parser.add_argument('--symmetry', default=False, action='store_true')
    parser.add_argument('--simple_env', default=False, action='store_true')
    parser.add_argument('--r', default=False, action='store_true')
    parser.add_argument('--t', default=False, action='store_true')
    parser.add_argument('--tv', default=False, action='store_true')
    parser.add_argument('--r_tv', default=False, action='store_true')
    parser.add_argument('--env_n_dim', default=2, type=int)
    parser.add_argument('--t_save_rate', default=1, type=int)
    # Data-gathering / checkpoint bookkeeping.
    parser.add_argument('--s_data_gather', default=False, action='store_true')
    parser.add_argument('--s_data_path', default='/data1/wjh/code/results/data/', type=str)
    parser.add_argument('--s_try_num', default=0, type=int)
    parser.add_argument('--s_alg_name', default='', type=str)
    parser.add_argument('--s_load_num', default='', type=str)
    # Island environment options.
    parser.add_argument('--island_partial_obs', default=False, action='store_true')
    parser.add_argument('--island_agent_max_power', default=11, type=int)
    parser.add_argument('--island_wolf_max_power', default=9, type=int)
    parser.add_argument('--island_wolf_recover_time', default=5, type=int)
    parser.add_argument('--i_num_landmark', default=2, type=int)
    parser.add_argument('--x_island_agent_max_power', default=51, type=int)
    parser.add_argument('--x_island_wolf_max_power', default=21, type=int)
    parser.add_argument('--x_island_wolf_recover_time', default=5, type=int)
    parser.add_argument('--x_island_harm_range', default=11, type=int)
    parser.add_argument('--x_num_landmark', default=2, type=int)
    parser.add_argument('--x_wolf_rew', default=600, type=int)
    parser.add_argument('--x_landmark_rew', default=10, type=int)
    parser.add_argument('--not_view_landmark', default=False, action='store_true')
    parser.add_argument('--appro_T', default=0.5, type=float)
    # RND-style options.
    parser.add_argument('--max_episode_steps', type=int, default=4500)
    # NOTE(review): '--num-timesteps' shares dest 'num_timesteps' with the
    # earlier '--num_timesteps' flag; the default registered here (int 1e12)
    # effectively supersedes the earlier float 1e6 — confirm this is intended.
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='fc', choices=['fc'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    return parser
def main():
    """CLI entry point: parse flags, configure MPI-aware logging, and train.

    Identical in shape to the other launchers in this file, plus a
    ``--meta_rl`` boolean flag forwarded into the hps dict.
    """
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument(
        "--num-timesteps",
        type=int,
        default=int(1e12),
    )
    parser.add_argument(
        "--num_env",
        type=int,
        default=32,
    )
    # int-typed flags below are used as 0/1 booleans.
    parser.add_argument(
        "--use_news",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.99,
    )
    parser.add_argument(
        "--gamma_ext",
        type=float,
        default=0.999,
    )
    parser.add_argument(
        "--lam",
        type=float,
        default=0.95,
    )
    parser.add_argument(
        "--update_ob_stats_every_step",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--update_ob_stats_independently_per_gpu",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--update_ob_stats_from_random_agent",
        type=int,
        default=1,
    )
    parser.add_argument(
        "--proportion_of_exp_used_for_predictor_update",
        type=float,
        default=1.0,
    )
    parser.add_argument(
        "--tag",
        type=str,
        default="",
    )
    parser.add_argument(
        "--policy",
        type=str,
        default="cnn",
        choices=["cnn", "rnn", "ffnn"],
    )
    parser.add_argument(
        "--int_coeff",
        type=float,
        default=1.0,
    )
    parser.add_argument(
        "--ext_coeff",
        type=float,
        default=2.0,
    )
    parser.add_argument(
        "--dynamics_bonus",
        type=int,
        default=0,
    )
    # Accepts 'true'/'t' (case-insensitive) as True; anything else is False.
    parser.add_argument(
        "--meta_rl",
        type=lambda x: True if x.lower() in {'true', 't'} else False,
        default=False,
    )
    args = parser.parse_args()
    # Only rank 0 writes human-readable log streams; other ranks stay silent.
    logger.configure(
        dir=logger.get_dir(),
        format_strs=["stdout", "log", "csv"]
        if MPI.COMM_WORLD.Get_rank() == 0 else [],
    )
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), "experiment_tag.txt"), "w") as f:
            f.write(args.tag)
    mpi_util.setup_mpi_gpus()
    # Distinct seed per MPI rank so parallel workers de-correlate.
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)
    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        meta_rl=args.meta_rl,
    )
    tf_util.make_session(make_default=True)
    train(
        env_id=args.env,
        num_env=args.num_env,
        seed=seed,
        num_timesteps=args.num_timesteps,
        hps=hps,
    )
def main():
    """CLI entry point for the multi-agent / diversity RND launcher.

    Parses flags, prepares log and model directories under --save_dir,
    seeds each MPI rank independently, and forwards everything to train().
    """
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num_timesteps', type=float, default=100e6)
    parser.add_argument('--num_env', type=int, default=128)
    # int-typed flags below are used as 0/1 booleans.
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--gamma_div', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=1)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    # The first (historically misspelled) option string is kept so existing
    # command lines and the args attribute name keep working; the correctly
    # spelled spelling is accepted as an alias.
    parser.add_argument('--proportion_of_exp_used_for_predictor_updated',
                        '--proportion_of_exp_used_for_predictor_update',
                        type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    # Checkpoint / restore paths.
    parser.add_argument('--save_dir', help="dir to save and log", type=str, default="save_dir")
    parser.add_argument('--load_path', help="dir to load model", type=str, default=None)
    parser.add_argument('--base_load_path', help="dir to load model", type=str, default=None)
    parser.add_argument('--r_path', help="dir to load r network", type=str, default=None)
    parser.add_argument('--play', default=False, action='store_true')
    parser.add_argument('--only_train_r', default=False, action='store_true')
    parser.add_argument('--online_train_r', default=False, action='store_true')
    parser.add_argument('--rnd_type', type=str, default='rnd', choices=['rnd', 'oracle'])
    parser.add_argument('--reset', default=False, action='store_true')
    parser.add_argument('--dynamics_sample', default=False, action='store_true')
    parser.add_argument('--num_agents', type=int, default=1)
    parser.add_argument('--div_type', type=str, default='oracle', choices=['oracle', 'cls', 'rnd'])
    parser.add_argument('--load_ram', default=False, action='store_true')
    parser.add_argument('--debug', default=False, action='store_true')
    parser.add_argument('--rnd_mask_prob', type=float, default=1.)
    parser.add_argument('--rnd_mask_type', type=str, default='indep',
                        choices=['prog', 'indep', 'shared'])
    parser.add_argument('--indep_rnd', default=False, action='store_true')
    # NOTE(review): store_true with default=True makes this flag a no-op;
    # confirm whether a store_false flag (or default=False) was intended.
    parser.add_argument('--indep_policy', default=True, action='store_true')
    parser.add_argument('--sd_type', type=str, default='oracle', choices=['oracle', 'sd'])
    parser.add_argument('--from_scratch', default=False, action='store_true')
    parser.add_argument('--kl', default=False, action='store_true')
    args = parser.parse_args()
    # Logs and model checkpoints live side by side under save_dir.
    log_path = os.path.join(args.save_dir, 'logs')
    save_path = os.path.join(args.save_dir, 'models')
    # Only rank 0 writes human-readable log streams; other ranks stay silent.
    logger.configure(dir=log_path,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
    mpi_util.setup_mpi_gpus()
    # Distinct seed per MPI rank so parallel workers de-correlate.
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)
    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        gamma_div=args.gamma_div,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_updated,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        log_interval=10,
        save_path=save_path,
        load_path=args.load_path,
        r_path=args.r_path,
        play=args.play,
        only_train_r=args.only_train_r,
        online_train_r=args.online_train_r,
        rnd_type=args.rnd_type,
        reset=args.reset,
        dynamics_sample=args.dynamics_sample,
        num_agents=args.num_agents,
        div_type=args.div_type,
        load_ram=args.load_ram,
        debug=args.debug,
        rnd_mask_prob=args.rnd_mask_prob,
        rnd_mask_type=args.rnd_mask_type,
        indep_rnd=args.indep_rnd,
        indep_policy=args.indep_policy,
        sd_type=args.sd_type,
        from_scratch=args.from_scratch,
        base_load_path=args.base_load_path,
        use_kl=args.kl)
    # Interactive play uses a single environment.
    if args.play:
        args.num_env = 1
    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
def main():
    """CLI entry point with a guarded --logdir and action-balance options.

    Refuses to run into a non-empty custom logdir, then configures logging
    (including tensorboard on rank 0), seeds per MPI rank, and trains.
    """
    default_log_dir = "/tmp/rnd_log"
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(4.2e7))  # 10k
    parser.add_argument('--num_env', type=int, default=32)
    # int-typed flags below are used as 0/1 booleans.
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    # parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    # parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=0.25)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--logdir', type=str, default=default_log_dir)
    # None disables action balancing; '--abc' is a short alias.
    parser.add_argument('--action_balance_coef', '--abc', type=float, default=None)
    parser.add_argument('--array_action', type=int, default=1)
    parser.add_argument('--num_minibatches', type=int, default=4)
    args = parser.parse_args()
    # Guard against clobbering a previous run: a custom, already-populated
    # logdir is rejected (the default /tmp dir is exempt).
    if args.logdir != default_log_dir and os.path.isdir(
            args.logdir) and os.listdir(args.logdir):
        raise ValueError("logdir not empty!")
    # Only rank 0 writes human-readable/tensorboard logs.
    logger.configure(dir=args.logdir,
                     format_strs=['stdout', 'log', 'csv', 'tensorboard']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
    mpi_util.setup_mpi_gpus()
    # Distinct seed per MPI rank so parallel workers de-correlate.
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)
    hps = dict(frame_stack=4,
               nminibatches=args.num_minibatches,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus,
               action_balance_coef=args.action_balance_coef,
               array_action=args.array_action)
    logger.info('args: {}'.format(args))
    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
def main():
    """CLI entry point with checkpoint save/load and test-mode options.

    Configures a timestamped logdir, seeds per MPI rank, and forwards
    checkpoint/test flags directly to train() alongside the hps dict.
    """
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    # int-typed flags below are used as 0/1 booleans.
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    # Checkpoint and evaluation options.
    parser.add_argument(
        '--save_dir',
        type=str,
        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument(
        '--load_dir',
        type=str,
        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument('--test', type=int, default=0)
    parser.add_argument('--save_image', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='tmp')
    parser.add_argument('--logdir', type=str, default='./logs/')
    parser.add_argument('--clip_rewards', type=int, default=1)
    parser.add_argument('--e_greedy', type=int, default=0)
    parser.add_argument('--action_space', type=str, default='RIGHT_ONLY')
    parser.add_argument('--load_mtype', type=str, default='latest')
    args = parser.parse_args()
    # e.g. ./logs/tmp_2020-01-01-12-00-00
    logdir = os.path.join(
        args.logdir,
        args.exp_name + '_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    # NOTE(review): this call site uses `folder=` where the other launchers
    # in this file use `dir=` — presumably a different logger version; verify.
    logger.configure(folder=logdir,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
    mpi_util.setup_mpi_gpus()
    # Distinct seed per MPI rank so parallel workers de-correlate.
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)
    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)
    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          load_dir=args.load_dir,
          save_dir=args.save_dir,
          test=args.test,
          exp_name=args.exp_name,
          clip_rewards=args.clip_rewards,
          save_image=args.save_image,
          action_space=args.action_space,
          e_greedy=args.e_greedy,
          load_mtype=args.load_mtype)
def main():
    """CLI entry point: parse flags, prepare save/log dirs, and train.

    The save directory path encodes the env name and key hyperparameters;
    --clear-run wipes it (after interactive confirmation) before training.
    """
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))
    parser.add_argument('--num_env', type=int, default=16)
    # int-typed flags below are used as 0/1 booleans.
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--clear-run', action='store_true', default=False,
                        help='if clear the save folder')
    parser.add_argument('--mega-wrapper', type=int, default=0,
                        help='if use the same wrapper as mega')
    args = parser.parse_args()
    # Save dir encodes env name and key hyperparameters in its path.
    args.save_dir = '../rnd_results/'
    args.save_dir = os.path.join(args.save_dir, 'e_n-{}/'.format(args.env))
    args.save_dir = os.path.join(args.save_dir,
                                 'mega_wrapper-{}'.format(str(args.mega_wrapper)))
    args.save_dir = os.path.join(args.save_dir,
                                 'num_env-{}'.format(str(args.num_env)))
    args.save_dir = os.path.join(args.save_dir,
                                 'int_coeff-{}'.format(str(args.int_coeff)))
    if args.clear_run:
        # Destructive: wipe the save folder, but only after an interactive
        # confirmation prompt.
        input('You have set clear_run, is that what you want?')
        subprocess.call(["rm", "-r", args.save_dir])
    # exist_ok=True replaces the previous try/except-print pattern, which
    # swallowed *every* exception (permission errors included), not just
    # the already-exists case.
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs('../rnd_log_results/' + args.env + '/', exist_ok=True)
    args.summary_writer = tf.summary.FileWriter(args.save_dir)
    # Only rank 0 writes human-readable log streams; other ranks stay silent.
    logger.configure(dir='../rnd_log_results/' + args.env + '/',
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
    mpi_util.setup_mpi_gpus()
    # Distinct seed per MPI rank so parallel workers de-correlate.
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)
    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)
    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          args=args)