def generate_level_replay(ppo, mdp_id, wandb_save_dir, nbatch_train, nsteps,
                          max_grad_norm, ob_space, ac_space, nsteps_rollout=782):
    # Build a fresh graph and a single-level evaluation env, restore the pretrained
    # per-level PPO checkpoint, and roll it out to collect replay data for that level.
    ppo_graph = tf.Graph()
    print('Created graph')

    observation_space = Dict(rgb=Box(shape=(64, 64, 3), low=0, high=255))
    action_space = DiscreteG(15)

    gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,
                                   num_levels=1, start_level=int(mdp_id),
                                   paint_vel_info=Config.PAINT_VEL_INFO,
                                   distribution_mode=Config.FIRST_PHASE)
    venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
    venv_eval = VecExtractDictObs(venv_eval, "rgb")
    venv_eval = VecMonitor(venv=venv_eval, filename=None, keep_buf=100)
    venv_eval = VecNormalize(venv=venv_eval, ob=False)
    venv_eval = wrappers.add_final_wrappers(venv_eval)
    print('Created env')

    graph_one_vars = ppo_graph.get_all_collection_keys()
    model_path = wandb_save_dir + '/%d/ppo-1' % mdp_id

    with tf.compat.v1.Session(graph=ppo_graph,
                              config=tf.ConfigProto(inter_op_parallelism_threads=1,
                                                    intra_op_parallelism_threads=1)) as sess_1:
        # Randomized scope name so repeated calls do not collide with existing variables.
        with tf.compat.v1.variable_scope("model_%d" % np.random.randint(0, 100000, 1).item()):
            ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps,
                              max_grad_norm, override_agent='ppo')
        initialize = tf.compat.v1.global_variables_initializer()
        sess_1.run(initialize)
        print('Inited session')

        model_saver = tf.train.import_meta_graph(model_path + '.meta')
        model_saver.restore(sess_1, save_path=model_path)
        print('Restored PPO')

        mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(
            ppo_model_1, venv_eval, nsteps=nsteps_rollout, param_vals='pretrained')
        print('Collected level data')

    venv_eval.close()
    return mb_obs_1, mb_actions_1, mb_rewards_1
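The helper below is an illustrative sketch, not part of the original code: it shows how generate_level_replay can be looped over every checkpointed level to build the per-level replay list that the PSE training loop later samples from. The function name and the default hyperparameter values are assumptions.

def example_collect_pse_replay(ppo, wandb_save_dir, ob_space, ac_space,
                               nbatch_train=256, nsteps=256, max_grad_norm=0.5):
    # Hypothetical helper: one replay tuple (obs, actions, rewards) per saved level.
    levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
    pse_replay = []
    for mdp_id in levels:
        mb_obs, mb_actions, mb_rewards = generate_level_replay(
            ppo, mdp_id, wandb_save_dir, nbatch_train, nsteps,
            max_grad_norm, ob_space, ac_space, nsteps_rollout=782)
        pse_replay.append([mb_obs, mb_actions, mb_rewards])
    return pse_replay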
def make_env(steps_per_env):
    observation_space = Dict(rgb=Box(shape=(64, 64, 3), low=0, high=255))
    action_space = DiscreteG(15)

    if Config.FIRST_PHASE == 'exploration':
        # baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
        gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,
                                        paint_vel_info=Config.PAINT_VEL_INFO,
                                        distribution_mode=Config.FIRST_PHASE,
                                        start_level=Config.START_LEVEL)
    else:
        # baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
        gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,
                                        num_levels=Config.NUM_LEVELS,
                                        paint_vel_info=Config.PAINT_VEL_INFO,
                                        distribution_mode=Config.FIRST_PHASE,
                                        start_level=Config.START_LEVEL)

    if Config.SECOND_PHASE == 'exploration':
        # baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
        gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,
                                        paint_vel_info=Config.PAINT_VEL_INFO,
                                        distribution_mode=Config.SECOND_PHASE,
                                        start_level=Config.START_LEVEL)
    elif Config.SECOND_PHASE != "None":
        # baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
        gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,
                                        num_levels=Config.NUM_LEVELS,
                                        paint_vel_info=Config.PAINT_VEL_INFO,
                                        distribution_mode=Config.SECOND_PHASE,
                                        start_level=Config.START_LEVEL)
    else:
        baseline_vec_adapt = gym3_env_adapt = None

    venv_train = FakeEnv(gym3_env_train, observation_space, action_space)
    venv_train = VecExtractDictObs(venv_train, "rgb")
    if Config.SECOND_PHASE != "None":
        venv_adapt = FakeEnv(gym3_env_adapt, observation_space, action_space)
        venv_adapt = VecExtractDictObs(venv_adapt, "rgb")

    venv_train = VecMonitor(venv=venv_train, filename=None, keep_buf=100)
    if Config.SECOND_PHASE != "None":
        venv_adapt = VecMonitor(venv=venv_adapt, filename=None, keep_buf=100)

    venv_train = VecNormalize(venv=venv_train, ob=False)
    venv_train = wrappers.add_final_wrappers(venv_train)

    if Config.SECOND_PHASE != "None":
        venv_adapt = VecNormalize(venv=venv_adapt, ob=False)
        venv_adapt = wrappers.add_final_wrappers(venv_adapt)
        venv = wrappers.DistributionShiftWrapperVec(env_list=[venv_train, venv_adapt],
                                                    steps_per_env=steps_per_env)
    else:
        venv = venv_train
        venv_adapt = venv_train = None

    venv.current_env_steps_left = steps_per_env
    return venv, venv_train, venv_adapt
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package to visualize result curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun", notes="baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]], config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run in parallel on a CPU;
    # the VecEnv class allows parallel rollouts.
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        # policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load(num_levels=250, starting_level=0, paint_vel_info=1,
                                      run_id='start0numlev250_256mts_dann_low', num_envs=32)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    # config = tf.ConfigProto()
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})
    # config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    print("Num envs: " + str(Config.NUM_ENVS))
    total_timesteps = int(256e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=frac_gpu_config):
    # with tf.Session(config=nogpu_config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package to visualize result curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun", notes="network randomization", tags=["baseline"], config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
def create_saliency(model_idx, sess):
    graph = tf.get_default_graph()

    env = utils.make_general_env(1)
    env = wrappers.add_final_wrappers(env)

    agent = create_act_model(sess, env, 1)

    # Saliency is taken w.r.t. the logit of the action chosen via this placeholder.
    action_selector = tf.placeholder(tf.int32)
    gradient_saliency = saliency.GradientSaliency(graph, sess, agent.pd.logits[0][action_selector], agent.X)

    sess.run(tf.compat.v1.global_variables_initializer())

    # setup_utils.restore_file(models[model_idx])
    try:
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')
    except AssertionError as e:
        models[model_idx] = None
        return [None] * 3

    return agent, gradient_saliency, action_selector
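A minimal usage sketch, assuming the PAIR-code saliency TF1 API (GradientSaliency.GetMask); the function name and the channel-summing step are illustrative and not from the original repo.

def example_saliency_map(gradient_saliency, action_selector, obs, action_id):
    # Vanilla-gradient saliency of the chosen action's logit w.r.t. one observation.
    mask = gradient_saliency.GetMask(obs, feed_dict={action_selector: action_id})
    # Collapse the RGB channels into a single 2D heatmap.
    return np.abs(mask).sum(axis=-1)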
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(120e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def test(sess, load_path, env, should_render=False, rep_count=Config.REP):
    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    if should_eval:
        # env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)
    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    model = load_model(sess, load_path)  # restore the checkpoint passed in as load_path
    agent = create_act_model(sess, env, nenvs)

    sess.run(tf.global_variables_initializer())
    loaded_params = utils.load_params_for_scope(sess, 'model')
    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    while should_continue():
        action, values, state, _ = agent.step(obs, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1
                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        if t_step % 100 == 0:
            mpi_print('t', t_step, values[0], done[0], rew[0], curr_rews[0], np.shape(obs))

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)
            curr_rews[:] = 0

    steps_elapsed = t_step  # assumed: total env steps taken during this evaluation
    result = {
        'steps_elapsed': steps_elapsed,
    }

    if should_eval:
        testset_size = rep_count * nenvs
        mean_score = np.sum(scores) / testset_size
        succ_rate = np.sum(scores == 10.0) / testset_size
        max_idx = np.argmax(scores)

        mpi_print('max idx', max_idx)
        mpi_print('steps_elapsed', steps_elapsed)

        if size > 1:
            mean_score = utils.mpi_average([mean_score])
            mpi_print('mpi_mean', mean_score)

        wandb.log({'Test_Rew_mean': mean_score, 'Test_Succ_rate': succ_rate})

        result['scores'] = scores
        result['testset_size'] = testset_size
        result['test_rew_mean'] = mean_score
        result['test_succ_rate'] = succ_rate

    return result
def enjoy_env_sess(sess):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP

    if should_eval:
        env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    agent = create_act_model(sess, env, nenvs)

    sess.run(tf.global_variables_initializer())
    loaded_params = utils.load_params_for_scope(sess, 'model')
    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    while should_continue():
        action, values, state, _ = agent.step(obs, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1
                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        if t_step % 100 == 0:
            mpi_print('t', t_step, values[0], done[0], rew[0], curr_rews[0], np.shape(obs))

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)
            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)

        mpi_print('scores', scores / rep_count)
        print('mean_score', mean_score)
        mpi_print('max idx', max_idx)

        mpi_mean_score = utils.mpi_average([mean_score])
        mpi_print('mpi_mean', mpi_mean_score)

        result = mean_score

    return result
def enjoy_env_sess(sess, DIR_NAME):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP

    file_name = '%s/%s.txt' % (DIR_NAME, Config.RESTORE_ID)
    f_io = open(file_name, 'a')

    if should_eval:
        if Config.TEST_NUM_EVAL > -1:
            env = utils.make_general_env(Config.TEST_NUM_EVAL)
        else:
            env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    vae = ConvVAE(z_size=Config.VAE_Z_SIZE, batch_size=nenvs, is_training=False,
                  reuse=False, gpu_mode=True, use_coord_conv=True)

    agent = create_act_model(sess, env, nenvs, Config.VAE_Z_SIZE)
    num_actions = env.action_space.n

    init_rand = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name])
    sess.run(tf.compat.v1.global_variables_initializer())

    soft_numpy = tf.placeholder(tf.float32, [nenvs, num_actions], name='soft_numpy')
    dist = tfp.distributions.Categorical(probs=soft_numpy)
    sampled_action = dist.sample()

    loaded_params = utils.load_params_for_scope(sess, 'model')
    vae.load_json_full(Config.VAE_PATH)

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    actions = [env.action_space.sample() for _ in range(nenvs)]
    actions = np.array(actions)
    obs, _, _, _ = env.step(actions)

    sess.run(init_rand)

    while should_continue():
        # scipy.misc.imsave('raw_inputs.png', obs[0])
        encoder_in = obs.astype(np.float32) / 255.0
        batch_z = vae.encode(encoder_in)
        # reconstruct = vae.decode(batch_z)
        # scipy.misc.imsave('recon.png', reconstruct[0])

        action, values, state, _ = agent.step(batch_z, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1
                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)
            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)
        result = mean_score
        f_io.write("{}\n".format(result))

    f_io.close()
    return result
def main(sess):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    if Config.EXTRACT_SEED != -1:
        seed = Config.EXTRACT_SEED
    if Config.EXTRACT_RANK != -1:
        rank = Config.EXTRACT_RANK

    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    use_policy = (Config.RESTORE_ID != '')

    nenvs = Config.NUM_ENVS
    total_timesteps = int(502e6)
    env = utils.make_general_env(nenvs, seed=rank)

    if use_policy:
        agent = create_act_model(sess, env, nenvs)
        sess.run(tf.compat.v1.global_variables_initializer())
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')

    # make directory
    DIR_NAME = './VAE/records/'
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME, exist_ok=True)

    # set file name
    filename = DIR_NAME + "/" + Config.get_save_file() + "_" + str(seed * 100 + rank) + ".npz"

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
        obs[:] = env.reset()
        dones = [False for _ in range(nenv)]

        # remove noisy inputs
        actions = [env.action_space.sample() for _ in range(nenv)]
        actions = np.array(actions)
        obs[:], rewards, dones, _ = env.step(actions)

        state = agent.initial_state
        mb_obs, mb_rewards, mb_actions, mb_next_obs, mb_dones = [], [], [], [], []

        # For n in range number of steps
        for _ in range(400):
            # Given observations, get action values and neglogpacs.
            # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init.
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = [env.action_space.sample() for _ in range(nenv)]
                actions = np.array(actions)

            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)

            # Take actions in the env and look at the results;
            # infos contains a ton of useful information.
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)

        # np.savez_compressed(filename, obs=mb_obs, action=mb_actions, next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)

    return filename
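A small sketch (assumed, not in the original) of reading back the compressed rollout file written above for VAE training; only the 'obs' array is saved, so only it can be recovered.

def example_load_vae_dataset(filename):
    # np.savez_compressed above stores a single 'obs' array of shape
    # (n_steps, n_envs, *obs_shape); flatten the first two dims for training.
    data = np.load(filename)
    obs = data['obs']
    return obs.reshape((-1,) + obs.shape[2:])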
def learn(*, policy, env, eval_env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5,
          max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4,
          noptepochs=4, cliprange=0.2, save_interval=0, load_path=None):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_size = comm.Get_size()

    # tf.compat.v1.disable_v2_behavior()
    sess = tf.compat.v1.get_default_session()

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)

    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                  nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef,
                  vf_coef=vf_coef, max_grad_norm=max_grad_norm)

    utils.load_all_params(sess)

    runner = Runner(env=env, eval_env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    eval_epinfobuf100 = deque(maxlen=100)
    tfirststart = time.time()
    active_ep_buf = epinfobuf100
    eval_active_ep_buf = eval_epinfobuf100

    nupdates = total_timesteps // nbatch
    mean_rewards = []
    datapoints = []

    run_t_total = 0
    train_t_total = 0

    can_save = False
    checkpoints = [32, 64]
    saved_key_checkpoints = [False] * len(checkpoints)

    if Config.SYNC_FROM_ROOT and rank != 0:
        can_save = False

    def save_model(base_name=None):
        base_dict = {'datapoints': datapoints}
        utils.save_params_in_scopes(sess, ['model'], Config.get_save_file(base_name=base_name), base_dict)

    # For logging purposes, allow restoring of update
    start_update = 0
    if Config.RESTORE_STEP is not None:
        start_update = Config.RESTORE_STEP // nbatch

    z_iter = 0
    curr_z = np.random.randint(0, high=Config.POLICY_NHEADS)
    tb_writer = TB_Writer(sess)

    import os
    os.environ["WANDB_API_KEY"] = "02e3820b69de1b1fcc645edcfc3dd5c5079839a1"
    os.environ["WANDB_SILENT"] = "true"
    run_id = np.random.randint(100000000)
    os.environ["WANDB_RUN_ID"] = str(run_id)
    group_name = "%s__%s__%f__%f" % (Config.ENVIRONMENT, Config.RUN_ID, Config.REP_LOSS_WEIGHT, Config.TEMP)
    name = "%s__%s__%f__%f__%d" % (Config.ENVIRONMENT, Config.RUN_ID, Config.REP_LOSS_WEIGHT, Config.TEMP, run_id)
    wandb.init(project='ising_generalization' if Config.ENVIRONMENT == 'ising' else 'procgen_generalization',
               entity='ssl_rl', config=Config.args_dict, group=group_name, name=name,
               mode="disabled" if Config.DISABLE_WANDB else "online")

    api = wandb.Api()
    list_runs = api.runs("ssl_rl/procgen_generalization")
    single_level_runs = [run for run in list_runs if 'ppo_per_level' in run.name]
    non_crashed = [run for run in single_level_runs if run.state in ['running', 'finished']]
    game_runs = [run for run in non_crashed if Config.ENVIRONMENT in run.name]

    wandb_save_dir = '%s/%s' % (Config.RESTORE_PATH, Config.ENVIRONMENT)
    print('Save dir: %s' % wandb_save_dir)
    if not os.path.isdir(wandb_save_dir):
        import requests
        for run in game_runs:
            level_id = run.name.split('__')[-1]
            run_save_dir = wandb_save_dir + '/' + level_id
            if not os.path.isdir(run_save_dir):
                os.makedirs(run_save_dir)

            def save_wandb_file(name):
                url = "https://api.wandb.ai/files/ssl_rl/procgen_generalization/%s/%s" % (run.id, name)
                r = requests.get(url)
                with open(run_save_dir + '/%s' % name, 'wb') as fh:
                    fh.write(r.content)

            save_wandb_file('checkpoint')
            save_wandb_file('ppo-1.data-00000-of-00001')
            save_wandb_file('ppo-1.index')
            save_wandb_file('ppo-1.meta')

            print('Downloaded level id %s to %s (run id: %s)' % (level_id, run_save_dir, run.id))
            print(os.listdir(run_save_dir))
            # wandb.restore(wandb_save_dir+"/checkpoint", run_path='/'.join(run.path))

    # load in just the graph and model parameters outside the for-loop
    from coinrun import policies as policies_ppo
    ppo = policies_ppo.get_policy()
    ppo_graph_1, ppo_graph_2 = tf.Graph(), tf.Graph()

    PSE_policy = Config.PSE_POLICY

    if PSE_policy == 'ppo_2':
        levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
        if Config.ENVIRONMENT == 'bigfish':
            levels = np.setdiff1d(levels, np.array([4]))
        pse_replay = []
        for mdp_id in levels:
            print('Collecting MDP %d' % mdp_id)
            mb_obs_i, mb_actions_i, mb_rewards_i = generate_level_replay(
                ppo, mdp_id, wandb_save_dir, nbatch_train, nsteps, max_grad_norm,
                ob_space, ac_space, nsteps_rollout=782)
            pse_replay.append([mb_obs_i, mb_actions_i, mb_rewards_i])

    for update in range(start_update + 1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        # mpi_print('collecting rollouts...')
        run_tstart = time.time()

        packed = runner.run(update_frac=update / nupdates)
        obs, returns, masks, actions, values, neglogpacs, infos, rewards, epinfos, eval_epinfos = packed
        values_i = returns_i = states_nce = anchors_nce = labels_nce = actions_nce = neglogps_nce = rewards_nce = infos_nce = None

        """
        PSE data re-collection
        1. Make 2 envs for respective policies for 2 random levels
        """
        levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
        if Config.ENVIRONMENT == 'bigfish':
            levels = np.setdiff1d(levels, np.array([4]))
        mdp_1, mdp_2 = np.random.choice(levels, size=2, replace=False)
        # import ipdb;ipdb.set_trace()

        observation_space = Dict(rgb=Box(shape=(64, 64, 3), low=0, high=255))
        action_space = DiscreteG(15)

        gym3_env_eval_1 = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,
                                         num_levels=1, start_level=int(mdp_1),
                                         paint_vel_info=Config.PAINT_VEL_INFO,
                                         distribution_mode=Config.FIRST_PHASE)
        venv_eval_1 = FakeEnv(gym3_env_eval_1, observation_space, action_space)
        venv_eval_1 = VecExtractDictObs(venv_eval_1, "rgb")
        venv_eval_1 = VecMonitor(venv=venv_eval_1, filename=None, keep_buf=100)
        venv_eval_1 = VecNormalize(venv=venv_eval_1, ob=False)
        venv_eval_1 = wrappers.add_final_wrappers(venv_eval_1)

        gym3_env_eval_2 = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,
                                         num_levels=1, start_level=int(mdp_2),
                                         paint_vel_info=Config.PAINT_VEL_INFO,
                                         distribution_mode=Config.FIRST_PHASE)
        venv_eval_2 = FakeEnv(gym3_env_eval_2, observation_space, action_space)
        venv_eval_2 = VecExtractDictObs(venv_eval_2, "rgb")
        venv_eval_2 = VecMonitor(venv=venv_eval_2, filename=None, keep_buf=100)
        venv_eval_2 = VecNormalize(venv=venv_eval_2, ob=False)
        venv_eval_2 = wrappers.add_final_wrappers(venv_eval_2)

        def random_policy(states):
            actions = np.random.randint(0, 15, Config.NUM_ENVS)
            return actions

        # print('Loading weights from %s' % (wandb_save_dir+'/%d/ppo-1' % mdp_1))
        # with ppo_graph.as_default():
        #     ppo_model = ppo(sess, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
        # import ipdb;ipdb.set_trace()
        # NOTE: this is recreating a graph within the updates, I'm moving them outside the training loop
        if PSE_policy == 'ppo':
            print('Using pretrained PPO policy')
            model1_path = wandb_save_dir + '/%d/ppo-1' % mdp_1
            model2_path = wandb_save_dir + '/%d/ppo-1' % mdp_2

            graph_one_vars = ppo_graph_1.get_all_collection_keys()

            with tf.compat.v1.Session(graph=ppo_graph_1,
                                      config=tf.ConfigProto(inter_op_parallelism_threads=1,
                                                            intra_op_parallelism_threads=1)) as sess_1:
                with tf.compat.v1.variable_scope("model_1"):
                    ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps,
                                      max_grad_norm, override_agent='ppo')
                initialize = tf.compat.v1.global_variables_initializer()
                sess_1.run(initialize)
                model_saver = tf.train.import_meta_graph(model1_path + '.meta')
                model_saver.restore(sess_1, save_path=model1_path)

                mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(ppo_model_1, venv_eval_1,
                                                                    nsteps=32, param_vals='pretrained')

            with tf.compat.v1.Session(graph=ppo_graph_2,
                                      config=tf.ConfigProto(inter_op_parallelism_threads=1,
                                                            intra_op_parallelism_threads=1)) as sess_2:
                with tf.compat.v1.variable_scope("model_2"):
                    ppo_model_2 = ppo(sess_2, ob_space, ac_space, nbatch_train, nsteps,
                                      max_grad_norm, override_agent='ppo')
                initialize = tf.compat.v1.global_variables_initializer()
                sess_2.run(initialize)
                model_saver = tf.train.import_meta_graph(model2_path + '.meta')
                model_saver.restore(sess_2, save_path=model2_path)

                mb_obs_2, mb_actions_2, mb_rewards_2 = collect_data(ppo_model_2, venv_eval_2,
                                                                    nsteps=32, param_vals='pretrained')
        elif PSE_policy == 'random':
            print('Using random uniform policy')
            mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(random_policy, venv_eval_1,
                                                                nsteps=32, param_vals='random')
            mb_obs_2, mb_actions_2, mb_rewards_2 = collect_data(random_policy, venv_eval_2,
                                                                nsteps=32, param_vals='random')
        elif PSE_policy == 'ppo_2':
            mdp_1, mdp_2 = np.random.choice(np.arange(len(pse_replay)), size=2, replace=False)
            mb_obs_1, mb_actions_1, mb_rewards_1 = pse_replay[mdp_1]
            mb_obs_2, mb_actions_2, mb_rewards_2 = pse_replay[mdp_2]

        # reshape our augmented state vectors to match first dim of observation array
        # (mb_size*num_envs, 64*64*RGB)
        # (mb_size*num_envs, num_actions)
        avg_value = np.mean(values)
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)
        eval_epinfobuf100.extend(eval_epinfos)

        run_elapsed = time.time() - run_tstart
        run_t_total += run_elapsed
        # mpi_print('rollouts complete')

        mblossvals = []

        # mpi_print('updating parameters...')
        train_tstart = time.time()

        mean_cust_loss = 0
        inds = np.arange(nbatch)
        inds_pse = np.arange(1024)
        inds_nce = np.arange(nbatch // runner.nce_update_freq)
        for _ in range(noptepochs):
            np.random.shuffle(inds)
            np.random.shuffle(inds_nce)
            for start in range(0, nbatch, nbatch_train):
                sess.run([model.train_model.train_dropout_assign_ops])
                end = start + nbatch_train
                mbinds = inds[start:end]

                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, infos, values, neglogpacs, rewards))
                slices_pse_1 = (arr[inds_pse] for arr in (mb_obs_1, mb_actions_1, mb_rewards_1))
                slices_pse_2 = (arr[inds_pse] for arr in (mb_obs_2, mb_actions_2, mb_rewards_2))

                mblossvals.append(model.train(lrnow, cliprangenow, *slices, *slices_pse_1, *slices_pse_2,
                                              train_target='policy'))

            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, infos, values, neglogpacs, rewards))
            np.random.shuffle(inds_pse)
            slices_pse_1 = (arr[inds_pse] for arr in (mb_obs_1, mb_actions_1, mb_rewards_1))
            slices_pse_2 = (arr[inds_pse] for arr in (mb_obs_2, mb_actions_2, mb_rewards_2))
            model.train(lrnow, cliprangenow, *slices, *slices_pse_1, *slices_pse_2, train_target='pse')

        # update the dropout mask
        sess.run([model.train_model.train_dropout_assign_ops])
        sess.run([model.train_model.run_dropout_assign_ops])

        train_elapsed = time.time() - train_tstart
        train_t_total += train_elapsed
        # mpi_print('update complete')

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            step = update * nbatch
            eval_rew_mean = utils.process_ep_buf(eval_active_ep_buf, tb_writer=tb_writer, suffix='_eval', step=step)
            rew_mean_10 = utils.process_ep_buf(active_ep_buf, tb_writer=tb_writer, suffix='', step=step)
            ep_len_mean = np.nanmean([epinfo['l'] for epinfo in active_ep_buf])

            mpi_print('\n----', update)

            mean_rewards.append(rew_mean_10)
            datapoints.append([step, rew_mean_10])
            tb_writer.log_scalar(ep_len_mean, 'ep_len_mean', step=step)
            tb_writer.log_scalar(fps, 'fps', step=step)
            tb_writer.log_scalar(avg_value, 'avg_value', step=step)
            tb_writer.log_scalar(mean_cust_loss, 'custom_loss', step=step)

            mpi_print('time_elapsed', tnow - tfirststart, run_t_total, train_t_total)
            mpi_print('timesteps', update * nsteps, total_timesteps)

            # eval_rew_mean = episode_rollouts(eval_env, model, step, tb_writer)

            mpi_print('eplenmean', ep_len_mean)
            mpi_print('eprew', rew_mean_10)
            mpi_print('eprew_eval', eval_rew_mean)
            mpi_print('fps', fps)
            mpi_print('total_timesteps', update * nbatch)
            mpi_print([epinfo['r'] for epinfo in epinfobuf10])

            rep_loss = 0
            if len(mblossvals):
                for (lossval, lossname) in zip(lossvals, model.loss_names):
                    mpi_print(lossname, lossval)
                    tb_writer.log_scalar(lossval, lossname, step=step)
            mpi_print('----\n')

            wandb.log({"%s/eprew" % (Config.ENVIRONMENT): rew_mean_10,
                       "%s/eprew_eval" % (Config.ENVIRONMENT): eval_rew_mean,
                       "%s/custom_step" % (Config.ENVIRONMENT): step})

        if can_save:
            if save_interval and (update % save_interval == 0):
                save_model()

            for j, checkpoint in enumerate(checkpoints):
                if (not saved_key_checkpoints[j]) and (step >= (checkpoint * 1e6)):
                    saved_key_checkpoints[j] = True
                    save_model(str(checkpoint) + 'M')

    save_model()

    env.close()

    # import subprocess
    # wandb_files = os.listdir('wandb')
    # file_to_save = ''
    # for fn in wandb_files:
    #     if str(run_id) in fn:
    #         file_to_save = fn
    #         break
    # print(file_to_save)
    # my_env = os.environ.copy()
    # my_env["WANDB_API_KEY"] = "02e3820b69de1b1fcc645edcfc3dd5c5079839a1"
    # subprocess.call(['wandb', 'sync', 'wandb/' + file_to_save], env=my_env)

    return mean_rewards
def main():
    print('Parsing args')
    args = setup_utils.setup_and_load()
    print('Setting up MPI')
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    print('Setting config')

    # coinrun version, allows you to specify how many GPUs you want this run to use
    # utils.setup_mpi_gpus()

    # baselines version, just sets the number of GPUs to the -n flag
    # setup_mpi_gpus()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(Config.NUM_GPUS)

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(25e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(8e6)
    elif Config.VERY_SHORT_TRAINING:
        total_timesteps = int(500e3)
    elif Config.VERY_VERY_SHORT_TRAINING:
        total_timesteps = int(50e3)
    save_interval = args.save_interval

    # env = utils.make_general_env(nenvs, seed=rank)
    # print(env)

    mpi_print(Config.ENVIRONMENT)

    # switch "easy" -> "exploration" halfway through training
    venv, venv_train, venv_adapt = make_env(total_timesteps // 2)
    # import ipdb;ipdb.set_trace()

    observation_space = Dict(rgb=Box(shape=(64, 64, 3), low=0, high=255))
    action_space = DiscreteG(15)

    # baseline_vec_eval = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=0, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
    gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,
                                   num_levels=0, paint_vel_info=Config.PAINT_VEL_INFO,
                                   distribution_mode=Config.FIRST_PHASE)
    venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
    venv_eval = VecExtractDictObs(venv_eval, "rgb")
    venv_eval = VecMonitor(venv=venv_eval, filename=None, keep_buf=100)
    venv_eval = VecNormalize(venv=venv_eval, ob=False)
    venv_eval = wrappers.add_final_wrappers(venv_eval)

    with tf.compat.v1.Session(config=config) as sess:
        if Config.AGENT == 'ppo':
            from coinrun import ppo2 as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_rnd':
            from coinrun import ppo2_rnd as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_diayn':
            from coinrun import ppo2_diayn as agent
            from coinrun import policies
        elif Config.AGENT == 'ppg':
            from coinrun import ppo2_ppg as agent
            from coinrun import policies
        elif Config.AGENT == 'ppg_ssl':
            from coinrun import ppo2_ppg_ssl as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_goal':
            from coinrun import ppo2_goal as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_curl':
            from coinrun import ppo2_curl as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_goal_bogdan' or Config.AGENT == 'ppo_ctrl':
            from coinrun import ppo2_goal_bogdan as agent
            from coinrun import policies_bogdan as policies
        elif Config.AGENT == 'ppg_cluster':
            from coinrun import ppo2_ppg_sinkhorn as agent
            from coinrun import policies_ppg_sinkhorn as policies
        elif Config.AGENT == 'ppo_bisimulation':
            from coinrun import ppo2_bisimulation as agent
            from coinrun import policies_bisimulation as policies
        elif Config.AGENT == 'ppo_pse':
            from coinrun import ppo2_pse as agent
            from coinrun import policies_pse as policies

        policy = policies.get_policy()

        final_eprew_eval = agent.learn(policy=policy,
                                       env=venv,
                                       eval_env=venv_eval,
                                       save_interval=save_interval,
                                       nsteps=Config.NUM_STEPS,
                                       nminibatches=Config.NUM_MINIBATCHES,
                                       lam=0.95,
                                       gamma=Config.GAMMA,
                                       noptepochs=Config.PPO_EPOCHS,
                                       log_interval=1,  # 10,
                                       ent_coef=Config.ENTROPY_COEFF,
                                       lr=lambda f: f * Config.LEARNING_RATE,
                                       lr_ctrl=lambda f: f * Config.LEARNING_RATE_CTRL,
                                       lr_myow=lambda f: f * Config.LEARNING_RATE_MYOW,
                                       cliprange=lambda f: f * 0.2,
                                       total_timesteps=total_timesteps)

    return final_eprew_eval
def make_env():
    # nenvs and rank are expected to be defined in the enclosing (module-level) scope.
    env = utils.make_general_env(nenvs, seed=rank)
    env = wrappers.add_final_wrappers(env)
    return env
def enjoy_env_sess(sess, checkpoint, overlap):
    # base_name = str(8*checkpoint) + 'M'
    # load_file = setup_utils.restore_file(Config.RESTORE_ID, base_name=base_name)
    should_eval = True
    mpi_print('test levels seed', Config.SET_SEED)
    mpi_print('test levels ', Config.NUM_LEVELS)
    rep_count = 50

    env = utils.make_general_env(20)
    env = wrappers.add_final_wrappers(env)
    nenvs = env.num_envs

    sess.run(tf.global_variables_initializer())
    args_now = Config.get_args_dict()
    # args_run = utils.load_args()
    agent = create_act_model(sess, env, nenvs)

    # The load name is specified by Config.RESTORE_ID and the restore returns True/False.
    # Handle checkpoint 0 first (nothing to evaluate), then pick the checkpoint name.
    if checkpoint == 0:
        mean_score = 0.0
        succ_rate = 0.0
        steps_elapsed = 0
        wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate, 'Step_elapsed': steps_elapsed})
        return mean_score, succ_rate
    elif checkpoint != 32:
        base_name = str(8 * checkpoint) + 'M'
    else:
        base_name = None

    sess.run(tf.global_variables_initializer())
    # env init here
    load_file = setup_utils.restore_file(Config.RESTORE_ID, overlap_config=overlap, base_name=base_name)
    is_loaded = utils.load_params_for_scope(sess, 'model')
    if not is_loaded:
        mpi_print('NO SAVED PARAMS LOADED')
        return 0.0, 0.0  # nothing to report if the restore failed

    obs = env.reset()
    t_step = 0

    scores = np.zeros((nenvs, rep_count))
    eplens = np.zeros((nenvs, rep_count))
    # scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    # curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    def rollout(obs, state, done):
        """Roll out rep_count * nenvs episodes and return scores."""
        t = 0
        count = 0
        rews = np.zeros((nenvs, rep_count))
        while should_continue():
            action, values, state, _ = agent.step(obs, state, done)
            obs, rew, done, info = env.step(action)
            rews[:, count] += rew
            t += 1
            for i, d in enumerate(done):
                if d:
                    eplens[i][count] = t
                    if score_counts[i] < rep_count:
                        score_counts[i] += 1
                        count = score_counts[i] - 1
                        # aux score
                        if 'episode' in info[i]:
                            scores[i][count] = info[i].get('episode')['r']
        return scores, rews, eplens

    if is_loaded:
        mpi_print(load_file)
        scores, rews, eplens = rollout(obs, state, done)

    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    if size == 1:
        if rank == 0:
            testset_size = rep_count * nenvs
            utils.save_pickle(scores, Config.LOGDIR + 'scores')
            mean_score = np.sum(scores) / testset_size
            succ_rate = np.sum(scores == 10.0) / testset_size
            mpi_print('cpus ', size)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means an unbounded level set, so the set size is rep_count * nenvs
            # (each episode gets a new seed)
            # mpi_print('score detail', scores.flatten())
            mpi_print('succ_rate', succ_rate)
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            mpi_print('mean score', mean_score)
            wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate, 'Step_elapsed': steps_elapsed})
            # mpi_print('mean score of each env', [np.mean(s) for s in scores])
    else:
        testset_size = rep_count * nenvs
        succ = np.sum(scores == 10.0) / testset_size
        succ_rate = utils.mpi_average([succ])
        mean_score_tmp = np.sum(scores) / testset_size
        mean_score = utils.mpi_average([mean_score_tmp])
        if rank == 0:
            mpi_print('testset size', rep_count * nenvs * size)
            mpi_print('load file name', load_file)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means an unbounded level set, so the set size is rep_count * nenvs
            # (each episode gets a new seed)
            # mpi_print('score detail', scores.flatten())
            mpi_print('succ_rate', succ_rate)
            mpi_print('mean score', mean_score)
            wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate})

    return mean_score, succ_rate
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    # coinrun version, allows you to specify how many GPUs you want this run to use
    # utils.setup_mpi_gpus()

    # baselines version, just sets the number of GPUs to the -n flag
    # setup_mpi_gpus()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(Config.NUM_GPUS)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        # total_timesteps = int(120e6)
        total_timesteps = int(25e6)
    elif Config.VERY_SHORT_TRAINING:
        total_timesteps = int(5e6)
    save_interval = args.save_interval

    # env = utils.make_general_env(nenvs, seed=rank)
    # print(env)

    print(Config.ENVIRONMENT)

    baseline_vec = ProcgenEnv(num_envs=nenvs, env_name=Config.ENVIRONMENT,
                              num_levels=Config.NUM_LEVELS,
                              paint_vel_info=Config.PAINT_VEL_INFO,
                              distribution_mode="easy")
    gym3_env = ProcgenGym3Env(num=nenvs, env_name=Config.ENVIRONMENT,
                              num_levels=Config.NUM_LEVELS,
                              paint_vel_info=Config.PAINT_VEL_INFO,
                              distribution_mode="easy")
    venv = FakeEnv(gym3_env, baseline_vec)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    # sys.exit(0)

    with tf.Session(config=config) as sess:
        # env = wrappers.add_final_wrappers(env)
        venv = wrappers.add_final_wrappers(venv)

        policy = policies.get_policy()

        # sess.run(tf.global_variables_initializer())
        ppo2.learn(policy=policy,
                   env=venv,
                   # env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
from collections import deque
import os
import random

import cv2
import gym

import coinrun.main_utils as utils
from coinrun import setup_utils, policies, wrappers, ppo2
from coinrun.config import Config
# from gym.envs.classic_control import rendering

from image_bco import ImageBCO

utils.setup_mpi_gpus()
# capture the parsed args so the overrides below have a namespace to attach to
args = setup_utils.setup_and_load()

game = utils.make_general_env(1)
game = wrappers.add_final_wrappers(game)
game.reset()

args.checkpoint = 'coin_ilpo'
args.input_dir = 'final_models/coin'
args.exp_dir = 'results/final_coin_bco'
args.n_actions = 4
args.real_actions = 4
args.policy_lr = .0001
args.batch_size = 100
args.ngf = 15

states = []
next_states = []

FINAL_EPSILON = .2    # final value of epsilon
INITIAL_EPSILON = .2  # starting value of epsilon
EXPLORE = 1000
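The three constants above are the usual ingredients of a linear epsilon-annealing schedule. The helper below is an illustrative sketch of that schedule (here INITIAL_EPSILON equals FINAL_EPSILON, so the value stays at 0.2); it is not code from the original script.

def example_epsilon(t, initial=INITIAL_EPSILON, final=FINAL_EPSILON, explore=EXPLORE):
    # Linearly anneal epsilon from `initial` to `final` over `explore` steps.
    frac = min(max(t, 0), explore) / float(explore)
    return initial + frac * (final - initial)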
def enjoy_env_sess(sess, DIR_NAME):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP
    mpi_print = utils.mpi_print

    file_name = '%s/%s.txt' % (DIR_NAME, Config.RESTORE_ID)
    f_io = open(file_name, 'a')

    if should_eval:
        if Config.TEST_NUM_EVAL > -1:
            env = utils.make_general_env(Config.TEST_NUM_EVAL)
        else:
            env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    agent = create_act_model(sess, env, nenvs)
    num_actions = env.action_space.n

    init_rand = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name])
    sess.run(tf.compat.v1.global_variables_initializer())

    soft_numpy = tf.placeholder(tf.float32, [nenvs, num_actions], name='soft_numpy')
    dist = tfp.distributions.Categorical(probs=soft_numpy)
    sampled_action = dist.sample()

    loaded_params = utils.load_params_for_scope(sess, 'model')
    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    sess.run(init_rand)

    while should_continue():
        if Config.USE_LSTM == 8425 or Config.USE_LSTM == 1081:
            q_actions, values, state, _ = agent.step(obs, state, done)
            # e-greedy: with probability 0.1 take a random action instead of the greedy one
            greedy_flag = np.random.rand(q_actions.shape[0])
            greedy_flag = greedy_flag < 0.1
            greedy_flag = greedy_flag.astype(np.int)
            random_actions = np.random.randint(0, num_actions, size=q_actions.shape[0])
            action = random_actions * greedy_flag + (1 - greedy_flag) * q_actions
        else:
            total_soft = agent.get_softmax(obs, state, done)
            action = sess.run([sampled_action], {soft_numpy: total_soft})
            action = action[0]
            # action, values, state, _ = agent.step(obs, state, done)

        obs, rew, done, info = env.step(action)
        # scipy.misc.imsave('raw_inputs.png', obs[0])
        # print(dd)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1
                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)
            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)
        result = mean_score
        f_io.write("{}\n".format(result))

    f_io.close()
    return result