def main():
    """CLI entry point: parse flags, set up MPI-aware logging, and run RND training."""
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    args = parser.parse_args()

    # Only rank 0 emits log files; every other MPI worker logs nothing.
    rank = MPI.COMM_WORLD.Get_rank()
    log_formats = ['stdout', 'log', 'csv'] if rank == 0 else []
    logger.configure(dir=logger.get_dir(), format_strs=log_formats)
    if rank == 0:
        tag_file = os.path.join(logger.get_dir(), 'experiment_tag.txt')
        with open(tag_file, 'w') as f:
            f.write(args.tag)

    mpi_util.setup_mpi_gpus()
    # Derive a distinct per-worker seed from the base seed and the MPI rank.
    seed = 10000 * args.seed + rank
    set_global_seeds(seed)

    hyperparams = {
        'frame_stack': 4,
        'nminibatches': 4,
        'nepochs': 4,
        'lr': 0.0001,
        'max_grad_norm': 0.0,
        'use_news': args.use_news,
        'gamma': args.gamma,
        'gamma_ext': args.gamma_ext,
        'max_episode_steps': args.max_episode_steps,
        'lam': args.lam,
        'update_ob_stats_every_step': args.update_ob_stats_every_step,
        'update_ob_stats_independently_per_gpu': args.update_ob_stats_independently_per_gpu,
        'update_ob_stats_from_random_agent': args.update_ob_stats_from_random_agent,
        'proportion_of_exp_used_for_predictor_update': args.proportion_of_exp_used_for_predictor_update,
        'policy': args.policy,
        'int_coeff': args.int_coeff,
        'ext_coeff': args.ext_coeff,
        'dynamics_bonus': args.dynamics_bonus,
    }

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hyperparams)
def main():
    """CLI entry point: parse flags, configure MPI-aware logging, and launch training."""

    def _parse_bool(text):
        # Accepts 'true'/'t' (case-insensitive) as True; anything else is False.
        return text.lower() in {'true', 't'}

    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.0)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn', 'ffnn'])
    parser.add_argument('--int_coeff', type=float, default=1.0)
    parser.add_argument('--ext_coeff', type=float, default=2.0)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--meta_rl', type=_parse_bool, default=False)
    args = parser.parse_args()

    # Only the rank-0 worker writes log files; the others stay silent.
    rank = MPI.COMM_WORLD.Get_rank()
    logger.configure(dir=logger.get_dir(),
                     format_strs=['stdout', 'log', 'csv'] if rank == 0 else [])
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)

    mpi_util.setup_mpi_gpus()
    # Per-worker seed: base seed spread out by MPI rank.
    seed = 10000 * args.seed + rank
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        meta_rl=args.meta_rl,
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps)
def main():
    """CLI entry point for multi-agent RND training with diversity rewards.

    Parses flags, configures MPI-aware logging under ``args.save_dir``,
    derives a per-worker seed, then forwards every hyperparameter to
    ``train``.
    """
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num_timesteps', type=float, default=100e6)
    parser.add_argument('--num_env', type=int, default=128)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--gamma_div', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=1)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    # BUGFIX: this flag was only spelled '--proportion_of_exp_used_for_predictor_updated'
    # (stray trailing 'd'), inconsistent with the hps key and the sibling scripts.
    # The correct spelling is now primary; the misspelled form is kept as an
    # alias with an explicit dest so existing launch scripts keep working.
    parser.add_argument('--proportion_of_exp_used_for_predictor_update',
                        '--proportion_of_exp_used_for_predictor_updated',
                        dest='proportion_of_exp_used_for_predictor_update',
                        type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--save_dir', help="dir to save and log", type=str, default="save_dir")
    parser.add_argument('--load_path', help="dir to load model", type=str, default=None)
    parser.add_argument('--base_load_path', help="dir to load model", type=str, default=None)
    parser.add_argument('--r_path', help="dir to load r network", type=str, default=None)
    parser.add_argument('--play', default=False, action='store_true')
    parser.add_argument('--only_train_r', default=False, action='store_true')
    parser.add_argument('--online_train_r', default=False, action='store_true')
    parser.add_argument('--rnd_type', type=str, default='rnd', choices=['rnd', 'oracle'])
    parser.add_argument('--reset', default=False, action='store_true')
    parser.add_argument('--dynamics_sample', default=False, action='store_true')
    parser.add_argument('--num_agents', type=int, default=1)
    parser.add_argument('--div_type', type=str, default='oracle', choices=['oracle', 'cls', 'rnd'])
    parser.add_argument('--load_ram', default=False, action='store_true')
    parser.add_argument('--debug', default=False, action='store_true')
    parser.add_argument('--rnd_mask_prob', type=float, default=1.)
    parser.add_argument('--rnd_mask_type', type=str, default='indep',
                        choices=['prog', 'indep', 'shared'])
    parser.add_argument('--indep_rnd', default=False, action='store_true')
    # NOTE(review): default=True together with action='store_true' means this
    # flag can never be turned off from the command line — confirm intent.
    parser.add_argument('--indep_policy', default=True, action='store_true')
    parser.add_argument('--sd_type', type=str, default='oracle', choices=['oracle', 'sd'])
    parser.add_argument('--from_scratch', default=False, action='store_true')
    parser.add_argument('--kl', default=False, action='store_true')
    args = parser.parse_args()

    # Logs and model checkpoints live in sibling subdirectories of save_dir.
    log_path = os.path.join(args.save_dir, 'logs')
    save_path = os.path.join(args.save_dir, 'models')

    # Only the rank-0 MPI worker writes log files.
    rank = MPI.COMM_WORLD.Get_rank()
    logger.configure(dir=log_path,
                     format_strs=['stdout', 'log', 'csv'] if rank == 0 else [])
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)

    mpi_util.setup_mpi_gpus()
    # Distinct per-worker seed derived from the base seed and the MPI rank.
    seed = 10000 * args.seed + rank
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        gamma_div=args.gamma_div,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        log_interval=10,
        save_path=save_path,
        load_path=args.load_path,
        r_path=args.r_path,
        play=args.play,
        only_train_r=args.only_train_r,
        online_train_r=args.online_train_r,
        rnd_type=args.rnd_type,
        reset=args.reset,
        dynamics_sample=args.dynamics_sample,
        num_agents=args.num_agents,
        div_type=args.div_type,
        load_ram=args.load_ram,
        debug=args.debug,
        rnd_mask_prob=args.rnd_mask_prob,
        rnd_mask_type=args.rnd_mask_type,
        indep_rnd=args.indep_rnd,
        indep_policy=args.indep_policy,
        sd_type=args.sd_type,
        from_scratch=args.from_scratch,
        base_load_path=args.base_load_path,
        use_kl=args.kl)

    # Interactive playback only needs a single environment.
    if args.play:
        args.num_env = 1

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
def main():
    """CLI entry point: parse flags, refuse to clobber a non-empty custom logdir, train."""
    default_log_dir = "/tmp/rnd_log"
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(4.2e7))  # 10k
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=0.25)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--logdir', type=str, default=default_log_dir)
    parser.add_argument('--action_balance_coef', '--abc', type=float, default=None)
    parser.add_argument('--array_action', type=int, default=1)
    parser.add_argument('--num_minibatches', type=int, default=4)
    args = parser.parse_args()

    # A custom logdir must be empty (or not yet exist); only the default
    # directory may be silently reused.
    is_custom_logdir = args.logdir != default_log_dir
    if is_custom_logdir and os.path.isdir(args.logdir) and os.listdir(args.logdir):
        raise ValueError("logdir not empty!")

    # Only rank 0 writes log files (including tensorboard summaries).
    rank = MPI.COMM_WORLD.Get_rank()
    log_formats = ['stdout', 'log', 'csv', 'tensorboard'] if rank == 0 else []
    logger.configure(dir=args.logdir, format_strs=log_formats)
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)

    mpi_util.setup_mpi_gpus()
    # Per-worker seed spread out by MPI rank.
    seed = 10000 * args.seed + rank
    set_global_seeds(seed)

    hps = {
        'frame_stack': 4,
        'nminibatches': args.num_minibatches,
        'nepochs': 4,
        'lr': 0.0001,
        'max_grad_norm': 0.0,
        'use_news': args.use_news,
        'gamma': args.gamma,
        'gamma_ext': args.gamma_ext,
        'max_episode_steps': args.max_episode_steps,
        'lam': args.lam,
        'update_ob_stats_every_step': args.update_ob_stats_every_step,
        'update_ob_stats_independently_per_gpu': args.update_ob_stats_independently_per_gpu,
        'update_ob_stats_from_random_agent': args.update_ob_stats_from_random_agent,
        'proportion_of_exp_used_for_predictor_update': args.proportion_of_exp_used_for_predictor_update,
        'policy': args.policy,
        'int_coeff': args.int_coeff,
        'ext_coeff': args.ext_coeff,
        'dynamics_bonus': args.dynamics_bonus,
        'action_balance_coef': args.action_balance_coef,
        'array_action': args.array_action,
    }
    logger.info('args: {}'.format(args))

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps)
def main():
    """CLI entry point: parse flags, build a timestamped log dir, and run training.

    This variant adds checkpoint save/load paths, a test/playback mode, and
    action-space/e-greedy options that are passed straight through to ``train``.
    """
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    # NOTE(review): machine-specific absolute defaults — presumably meant to be
    # overridden on other hosts; confirm before relying on them.
    parser.add_argument(
        '--save_dir',
        type=str,
        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument(
        '--load_dir',
        type=str,
        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument('--test', type=int, default=0)
    parser.add_argument('--save_image', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='tmp')
    parser.add_argument('--logdir', type=str, default='./logs/')
    parser.add_argument('--clip_rewards', type=int, default=1)
    parser.add_argument('--e_greedy', type=int, default=0)
    parser.add_argument('--action_space', type=str, default='RIGHT_ONLY')
    parser.add_argument('--load_mtype', type=str, default='latest')
    args = parser.parse_args()

    # Each run gets its own directory: <logdir>/<exp_name>_<timestamp>.
    logdir = os.path.join(
        args.logdir,
        args.exp_name + '_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    # Only the rank-0 MPI worker writes log files; note this variant passes
    # the directory via 'folder=' rather than 'dir='.
    logger.configure(folder=logdir,
                     format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()
    # Distinct per-worker seed derived from the base seed and the MPI rank.
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    # Hyperparameters forwarded verbatim to train().
    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)

    tf_util.make_session(make_default=True)
    # Checkpoint/test/playback options go to train() as extra keyword args.
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          load_dir=args.load_dir,
          save_dir=args.save_dir,
          test=args.test,
          exp_name=args.exp_name,
          clip_rewards=args.clip_rewards,
          save_image=args.save_image,
          action_space=args.action_space,
          e_greedy=args.e_greedy,
          load_mtype=args.load_mtype)
def main():
    """CLI entry point: parse flags, prepare result/log directories, and train.

    Checkpoints and summaries go under ``../rnd_results/`` in one
    subdirectory per hyperparameter setting; baselines-style logs go under
    ``../rnd_log_results/<env>/``.
    """
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))
    parser.add_argument('--num_env', type=int, default=16)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--clear-run', action='store_true', default=False,
                        help='if clear the save folder')
    parser.add_argument('--mega-wrapper', type=int, default=0,
                        help='if use the same wrapper as mega')
    args = parser.parse_args()

    # Build the save path from the main hyperparameters so runs don't collide.
    args.save_dir = '../rnd_results/'
    args.save_dir = os.path.join(args.save_dir, 'e_n-{}/'.format(args.env))
    args.save_dir = os.path.join(
        args.save_dir, 'mega_wrapper-{}'.format(str(args.mega_wrapper)))
    args.save_dir = os.path.join(args.save_dir,
                                 'num_env-{}'.format(str(args.num_env)))
    args.save_dir = os.path.join(args.save_dir,
                                 'int_coeff-{}'.format(str(args.int_coeff)))

    if args.clear_run:
        # Deliberate interactive confirmation before wiping previous results.
        input('You have set clear_run, is that what you want?')
        subprocess.call(["rm", "-r", args.save_dir])

    # BUGFIX: the original wrapped os.makedirs in a bare `except Exception`
    # that printed 'file exists' for ANY failure (permission denied, bad
    # path, ...), silently masking real errors.  exist_ok=True tolerates only
    # the already-exists case and lets genuine failures propagate.
    os.makedirs(args.save_dir, exist_ok=True)
    log_dir = '../rnd_log_results/' + args.env + '/'
    os.makedirs(log_dir, exist_ok=True)

    args.summary_writer = tf.summary.FileWriter(args.save_dir)

    # Only the rank-0 MPI worker writes log files.
    rank = MPI.COMM_WORLD.Get_rank()
    logger.configure(dir=log_dir,
                     format_strs=['stdout', 'log', 'csv'] if rank == 0 else [])
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)

    mpi_util.setup_mpi_gpus()
    # Distinct per-worker seed derived from the base seed and the MPI rank.
    seed = 10000 * args.seed + rank
    set_global_seeds(seed)

    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          args=args)