def main():
    """Entry point: train a baseline PPO agent on CoinRun, logging to wandb."""
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves.
    run_config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes=" baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=run_config)

    # Per-rank seed so parallel workers do not collect identical rollouts.
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run parallel on a cpu;
    # the VecEnv class allows parallel rollout.
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=tf_config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        #policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
def main():
    """Entry point: train a network-randomization (NR) agent on CoinRun."""
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves.
    run_config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=run_config)

    # Per-rank seed so parallel workers do not collect identical rollouts.
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=tf_config):
        env = wrappers.add_final_wrappers(env)
        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
def dump_model(self):
    """Serialize the current config args plus this model's evaluated
    parameters to the configured save file via joblib."""
    #utils.save_params_in_scopes(self.sess, [self.scope_dir + "model"], Config.get_save_file())
    save_path = utils.file_to_path(Config.get_save_file())

    data_dict = {'args': Config.get_args_dict()}
    data_dict['args']['use_minimum_model'] = True

    param_dict = {}
    if len(self.params) > 0:
        # Run the variables through the session so concrete arrays are
        # stored rather than TF tensor handles.
        param_dict["model"] = self.sess.run(self.params)
    data_dict['params'] = param_dict

    joblib.dump(data_dict, save_path)
def main():
    """Evaluate a restored agent over a series of checkpoints, logging to wandb."""
    # load from restore file
    args_dict = utils.load_args()  # train args of restore id
    test_args = setup_utils.setup_and_load()

    # Config overrides inferred from the restore-id naming scheme.
    if 'NR' in Config.RESTORE_ID:
        Config.USE_LSTM = 2
    if 'dropout' in Config.RESTORE_ID:
        Config.DROPOUT = 0
        Config.USE_BATCH_NORM = 0

    wandb.init(project="coinrun",
               notes="test",
               tags=["baseline", "test"],
               config=Config.get_args_dict())

    # NOTE(review): this ConfigProto is built but the sessions below are
    # opened without it — confirm whether it should be passed to tf.Session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    seed = np.random.randint(100000)
    Config.SET_SEED = seed

    # Settings that must agree between the saved run and this test run.
    overlap = {
        'set_seed': Config.SET_SEED,
        'rep': Config.REP,
        'highd': Config.HIGH_DIFFICULTY,
        'num_levels': Config.NUM_LEVELS,
        'use_lstm': Config.USE_LSTM,
        'dropout': Config.DROPOUT,
        'use_batch_norm': Config.USE_BATCH_NORM,
    }

    load_file = Config.get_load_filename(restore_id=Config.RESTORE_ID)
    mpi_print('load file name', load_file)
    mpi_print('seed', seed)
    mpi_print("---------------------------------------")

    # One fresh session per checkpoint; each checkpoint is 8M steps apart.
    for checkpoint in range(1, 33):
        with tf.Session() as sess:
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            enjoy_env_sess(sess, checkpoint, overlap)
def save_params_in_scopes(sess, scopes, filename, base_dict=None):
    """Dump the trainable variables of each scope, plus the current config
    args (and any caller-supplied base entries), to *filename* via joblib."""
    data_dict = {}
    if base_dict is not None:
        data_dict.update(base_dict)

    save_path = file_to_path(filename)
    data_dict['args'] = Config.get_args_dict()

    param_dict = {}
    for scope in scopes:
        scope_vars = tf.trainable_variables(scope)
        # Skip scopes with no trainable variables.
        if len(scope_vars) > 0:
            print('saving scope', scope, filename)
            param_dict[scope] = sess.run(scope_vars)
    data_dict['params'] = param_dict

    joblib.dump(data_dict, save_path)
def enjoy_env_sess(sess, checkpoint, overlap):
    """Evaluate one saved checkpoint in *sess* and log mean score / success rate.

    Args:
        sess: active TF session the saved params are restored into.
        checkpoint: checkpoint index; elapsed steps = checkpoint * 8M.
        overlap: dict of config keys that must match the saved run on restore.

    Returns:
        (mean_score, succ_rate); (0.0, 0.0) when no saved params could be loaded.
    """
    should_eval = True
    mpi_print('test levels seed', Config.SET_SEED)
    mpi_print('test levels ', Config.NUM_LEVELS)
    rep_count = 50  # episodes per env in the evaluation set

    env = utils.make_general_env(20)
    env = wrappers.add_final_wrappers(env)
    nenvs = env.num_envs

    sess.run(tf.global_variables_initializer())
    args_now = Config.get_args_dict()
    agent = create_act_model(sess, env, nenvs)

    # BUG FIX: the checkpoint == 0 case used to sit in an `elif` after
    # `checkpoint != 32`, making it unreachable (and it referenced an
    # undefined `steps_elapsed`). Test it first; at checkpoint 0 no steps
    # have elapsed and nothing has been trained, so log zeros and return.
    if checkpoint == 0:
        mean_score = 0.0
        succ_rate = 0.0
        wandb.log({
            'Rew_mean': mean_score,
            'Succ_rate': succ_rate,
            'Step_elapsed': 0
        })
        return mean_score, succ_rate
    elif checkpoint != 32:
        base_name = str(8 * checkpoint) + 'M'
    else:
        # Final checkpoint: restore_file's default (no base-name suffix).
        base_name = None

    # Load name is specified by Config.RESTORE_ID; load returns True/False.
    sess.run(tf.global_variables_initializer())  # env init here
    load_file = setup_utils.restore_file(Config.RESTORE_ID,
                                         overlap_config=overlap,
                                         base_name=base_name)
    is_loaded = utils.load_params_for_scope(sess, 'model')
    if not is_loaded:
        mpi_print('NO SAVED PARAMS LOADED')
        # BUG FIX: previously returned `mean_score, succ_rate`, which were
        # not yet defined here and raised NameError.
        return 0.0, 0.0

    obs = env.reset()
    t_step = 0

    scores = np.zeros((nenvs, rep_count))   # episode returns per env/episode
    eplens = np.zeros((nenvs, rep_count))   # step index at each episode end
    score_counts = np.array([0] * nenvs)    # finished episodes per env

    def should_continue():
        # Keep stepping until every env has completed rep_count episodes.
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    def rollout(obs, state, done):
        """rollout for rep * nenv times and return scores"""
        t = 0
        count = 0
        rews = np.zeros((nenvs, rep_count))
        while should_continue():
            action, values, state, _ = agent.step(obs, state, done)
            obs, rew, done, info = env.step(action)
            rews[:, count] += rew
            t += 1
            for i, d in enumerate(done):
                if d:
                    eplens[i][count] = t
                    if score_counts[i] < rep_count:
                        score_counts[i] += 1
                        count = score_counts[i] - 1
                        # aux score from the episode wrapper, when present
                        if 'episode' in info[i]:
                            scores[i][count] = info[i].get('episode')['r']
        return scores, rews, eplens

    if is_loaded:
        mpi_print(load_file)
        scores, rews, eplens = rollout(obs, state, done)

    # Defaults so the final return is always defined (e.g. size == 1 but
    # this rank is not rank 0 — cannot normally happen, but stay safe).
    mean_score = 0.0
    succ_rate = 0.0

    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    if size == 1:
        if rank == 0:
            testset_size = rep_count * nenvs
            utils.save_pickle(scores, Config.LOGDIR + 'scores')
            mean_score = np.sum(scores) / testset_size
            # An episode counts as a success when it earned the max reward 10.0.
            succ_rate = np.sum(scores == 10.0) / testset_size
            mpi_print('cpus ', size)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means unbounded set so the set size is rep_count * nenvs
            # each one has a new seed (maybe counted)
            # mpi_print('score detail', scores.flatten())
            mpi_print('succ_rate', succ_rate)
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            mpi_print('mean score', mean_score)
            wandb.log({
                'Rew_mean': mean_score,
                'Succ_rate': succ_rate,
                'Step_elapsed': steps_elapsed
            })
            #mpi_print('mean score of each env', [np.mean(s) for s in scores])
    else:
        testset_size = rep_count * nenvs
        # BUG FIX: was `np.sum(scores=10.0)` — an invalid keyword argument
        # to np.sum (TypeError). The intent is to count max-reward episodes.
        succ = np.sum(scores == 10.0) / testset_size
        succ_rate = utils.mpi_average([succ])
        mean_score_tmp = np.sum(scores) / testset_size
        mean_score = utils.mpi_average([mean_score_tmp])
        if rank == 0:
            mpi_print('testset size', rep_count * nenvs * size)
            mpi_print('load file name', load_file)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means unbounded set so the set size is rep_count * nenvs
            # each one has a new seed (maybe counted)
            # mpi_print('score detail', scores.flatten())
            mpi_print('succ_rate', succ_rate)
            mpi_print('mean score', mean_score)
            wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate})

    return mean_score, succ_rate