def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package, to visualize result curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run in parallel on a CPU;
    # the VecEnv class allows parallel rollouts
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        # policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
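# Note on the `lr` and `cliprange` lambdas above (and in the other training
# entry points below): baselines-style ppo2 accepts either a constant or a
# callable for these and invokes the callable with the fraction of training
# remaining (1.0 at the first update, approaching 0.0 at the last), so
# `lambda f: f * Config.LEARNING_RATE` is a linear decay to zero.
# Minimal self-contained sketch; the 5e-4 value is illustrative, not from Config:
def linear_schedule(initial_value):
    """Map the remaining-progress fraction f in (0, 1] to a decayed value."""
    return lambda f: f * initial_value

lr_fn = linear_schedule(5e-4)
assert abs(lr_fn(1.0) - 5e-4) < 1e-12    # full value at the first update
assert abs(lr_fn(0.5) - 2.5e-4) < 1e-12  # half the value halfway through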
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load(num_levels=0,
                               starting_level=0,
                               paint_vel_info=1,
                               restore_id='start0numlev250_256mts',
                               train_eval=True,
                               test_eval=False,
                               num_eval=100,
                               high_difficulty=False)
    print("High difficulty: " + str(Config.HIGH_DIFFICULTY))

    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})

    # with tf.Session(config=frac_gpu_config) as sess:
    with tf.Session(config=nogpu_config) as sess:
        enjoy_env_sess(sess)
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()

    DIR_NAME = Config.TEST_LOG_NAME
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.compat.v1.Session(config=config) as sess:
        results = enjoy_env_sess(sess, DIR_NAME)
        print(results)
def main():
    args = setup_utils.setup_and_load(num_levels=250,
                                      starting_level=0,
                                      paint_vel_info=1,
                                      run_id='start0numlev250_256mts_dann_low',
                                      num_envs=32)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True  # pylint: disable=E1101
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})

    nenvs = Config.NUM_ENVS
    print("Num envs: " + str(Config.NUM_ENVS))
    total_timesteps = int(256e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    # with tf.Session(config=nogpu_config):
    with tf.Session(config=frac_gpu_config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package, to visualize result curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(120e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))

    print("load path:")
    print("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))

    act = deepq.learn(
        env,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        total_timesteps=0,
        load_path="{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID)
        # load_path="{}/ckpts/{}/model".format(Config.SAVE_PATH, Config.RUN_ID)
    )

    num_episodes = 500
    # while True:
    episode_rew_ls = []
    for i in range(num_episodes):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if Config.RENDER:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        episode_rew_ls.append(episode_rew)
        print("Episode reward", episode_rew)

    print("Avg episode reward", np.mean(episode_rew_ls))
    print("Std episode reward", np.std(episode_rew_ls))
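# `main_utils.Scalarize` is defined elsewhere in this repo; the sketch below is
# an assumption about its behavior, not the actual implementation. The point:
# deepq's loop above expects the plain single-env gym API, while
# make_general_env(1, ...) returns a VecEnv whose reset/step carry a batch
# dimension of size 1, so the wrapper strips that dimension on the way out and
# re-adds it on the way in.
import numpy as np

class ScalarizeSketch:
    """Hypothetical stand-in for main_utils.Scalarize."""

    def __init__(self, venv):
        assert venv.num_envs == 1
        self.venv = venv
        self.observation_space = venv.observation_space
        self.action_space = venv.action_space

    def reset(self):
        return self.venv.reset()[0]  # drop the batch dimension

    def step(self, action):
        obs, rews, dones, infos = self.venv.step(np.array([action]))
        return obs[0], rews[0], dones[0], infos[0] if infos else {}

    def render(self):
        return self.venv.render()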
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()

    with tf.Session() as sess:
        enjoy_env_sess(sess)
def main(sess):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    if Config.EXTRACT_SEED != -1:
        seed = Config.EXTRACT_SEED
    if Config.EXTRACT_RANK != -1:
        rank = Config.EXTRACT_RANK

    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    use_policy = (Config.RESTORE_ID != '')

    nenvs = Config.NUM_ENVS
    total_timesteps = int(502e6)
    env = utils.make_general_env(nenvs, seed=rank)

    if use_policy:
        agent = create_act_model(sess, env, nenvs)
        sess.run(tf.compat.v1.global_variables_initializer())
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')

    # make the output directory
    DIR_NAME = './VAE/records/'
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME, exist_ok=True)

    # set the output file name
    filename = DIR_NAME + "/" + Config.get_save_file() + "_" + str(seed * 100 + rank) + ".npz"

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        obs = np.zeros((nenv,) + env.observation_space.shape,
                       dtype=env.observation_space.dtype.name)
        obs[:] = env.reset()
        dones = [False for _ in range(nenv)]

        # take one random step first, to remove noisy initial inputs
        actions = np.array([env.action_space.sample() for _ in range(nenv)])
        obs[:], rewards, dones, _ = env.step(actions)

        state = agent.initial_state if use_policy else None
        mb_obs, mb_rewards, mb_actions, mb_next_obs, mb_dones = [], [], [], [], []

        # for each step: given observations, get actions
        # (values and neglogpacs are unused here)
        for _ in range(400):
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = np.array([env.action_space.sample() for _ in range(nenv)])

            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)

            # take the actions in the env and record the results
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)

        # batch of steps -> batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=bool)

        # np.savez_compressed(filename, obs=mb_obs, action=mb_actions,
        #                     next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)

        return filename
if __name__ == '__main__':
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()

    with tf.compat.v1.Session() as sess:
        main(sess)
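# Usage sketch for the .npz produced by main(sess) above: it stores a single
# "obs" array of shape (400, nenvs) + observation_space.shape. A VAE data
# loader would flatten the step and env dimensions before training. The path
# below is hypothetical; in practice use the filename returned by main(sess).
import numpy as np

with np.load('./VAE/records/example.npz') as data:
    obs = data['obs']                         # (steps, nenvs, H, W, C)
    obs = obs.reshape((-1,) + obs.shape[2:])  # (steps * nenvs, H, W, C)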
def main():
    # general setup
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # prepare the working directory
    sub_dir = utils.file_to_path(Config.get_save_file(base_name="tmp"))
    if os.path.isdir(sub_dir):
        shutil.rmtree(path=sub_dir)
    os.mkdir(sub_dir)

    # hyperparams
    nenvs = Config.NUM_ENVS
    total_timesteps = Config.TIMESTEPS
    population_size = Config.POPULATION_SIZE
    timesteps_per_agent = Config.TIMESTEPS_AGENT
    worker_count = Config.WORKER_COUNT
    passthrough_perc = Config.PASSTHROUGH_PERC
    mutating_perc = Config.MUTATING_PERC

    # create environment
    def make_env():
        env = utils.make_general_env(nenvs, seed=rank)
        env = wrappers.add_final_wrappers(env)
        return env

    # set up the session and workers, and therefore the tensorflow ops
    graph = tf.get_default_graph()
    sess = tf.Session(graph=graph)
    policy = policies.get_policy()
    workers = [
        Worker(sess, i, nenvs, make_env, policy, sub_dir)
        for i in range(worker_count)
    ]
    tb_writer = TB_Writer(sess)

    def clean_exit():
        for worker in workers:
            Thread.join(worker.thread)

        utils.mpi_print("")
        utils.mpi_print("== total duration",
                        "{:.1f}".format(time.time() - t_first_start), " s ==")
        utils.mpi_print(" exit...")

        # save the best-performing agent
        population.sort(key=lambda k: k['fit'], reverse=True)
        workers[0].restore_model(name=population[0]["name"])
        workers[0].dump_model()

        # cleanup
        sess.close()
        shutil.rmtree(path=sub_dir)

    # load data from the restore point, if any, and seed the whole population with it
    loaded_name = None
    if workers[0].try_load_model():
        loaded_name = str(uuid.uuid1())
        workers[0].save_model(name=loaded_name)

    # initialise the population:
    # either all random with no mutations pending,
    # or all from the restore point with all but one marked for mutation
    population = [{
        "name": loaded_name or str(uuid.uuid1()),
        "fit": -1,
        "need_mut": loaded_name is not None and i != 0,
        "age": -1,
        "mean_ep_len": -1
    } for i in range(population_size)]

    utils.mpi_print("== population size", population_size,
                    ", t_agent ", timesteps_per_agent, " ==")

    t_first_start = time.time()

    try:
        # main loop
        generation = 0
        timesteps_done = 0
        while timesteps_done < total_timesteps:
            t_generation_start = time.time()
            utils.mpi_print("")
            utils.mpi_print("__ Generation", generation, " __")

            # initialise and evaluate all new agents
            for agent in population:
                # if agent["fit"] < 0:  # only evaluate fresh agents
                if True:  # constant re-evaluation, to dismiss "lucky runs" -> seems good
                    # pick a free worker from the pool and let it work on the agent
                    not_in_work = True
                    while not_in_work:
                        for worker in workers:
                            if worker.can_take_work():
                                worker.work(agent, timesteps_per_agent)
                                not_in_work = False
                                break
                    timesteps_done += timesteps_per_agent * nenvs

            for worker in workers:
                Thread.join(worker.thread)

            # sort by fitness
            population.sort(key=lambda k: k["fit"], reverse=True)

            # print stats
            fitnesses = [agent["fit"] for agent in population]
            ages = [agent["age"] for agent in population]
            ep_lens = [agent["mean_ep_len"] for agent in population]
            utils.mpi_print(*["{:5.3f}".format(f) for f in fitnesses])
            utils.mpi_print(*["{:5}".format(a) for a in ages])
            utils.mpi_print("__ average fit", "{:.1f}".format(np.mean(fitnesses)),
                            ", t_done", timesteps_done,
                            ", took", "{:.1f}".format(time.time() - t_generation_start), "s",
                            ", total", "{:.1f}".format(time.time() - t_first_start), "s __")

            # log stats
            tb_writer.log_scalar(np.mean(fitnesses), "mean_fit", timesteps_done)
            tb_writer.log_scalar(np.median(fitnesses), "median_fit", timesteps_done)
            tb_writer.log_scalar(np.max(fitnesses), "max_fit", timesteps_done)
            tb_writer.log_scalar(np.mean(ages), "mean_age", timesteps_done)
            ep_lens_mean = np.nanmean(ep_lens)
            if ep_lens_mean:
                tb_writer.log_scalar(ep_lens_mean, "mean_ep_lens", timesteps_done)

            # cleanup to prevent disk clutter
            to_be_removed = set(
                re.sub(r'\..*$', '', f) for f in os.listdir(sub_dir)) - set(
                    agent["name"] for agent in population)
            for filename in to_be_removed:
                os.remove(sub_dir + "/" + filename + ".index")
                os.remove(sub_dir + "/" + filename + ".data-00000-of-00001")

            # break when time is up
            if not timesteps_done < total_timesteps:
                break

            # mark weak agents for replacement:
            # the top passthrough_perc survive unchanged, and the top
            # mutating_perc serve round-robin as parents for the rest
            cutoff_passthrough = math.floor(population_size * passthrough_perc)
            cutoff_mutating = math.floor(population_size * mutating_perc)
            source_agents = population[:cutoff_mutating]
            new_population = population[:cutoff_passthrough]
            k = 0
            while len(new_population) < population_size:
                new_agent = {
                    # take the name from the source agent, so mutation knows the parent
                    "name": source_agents[k]["name"],
                    "fit": -1,
                    "need_mut": True,
                    "age": 0,
                    "mean_ep_len": -1
                }
                new_population.append(new_agent)
                k = (k + 1) % len(source_agents)
            population = new_population

            generation += 1

        clean_exit()
    except KeyboardInterrupt:
        clean_exit()

    return 0
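# Illustration of the truncation selection above, run with a toy population of
# 10 and hypothetical passthrough_perc=0.2, mutating_perc=0.4: the top 2 agents
# survive unchanged, and the top 4 serve round-robin as parents for the 8
# mutated offspring that refill the population.
import math

population = [{"name": str(i), "fit": 10 - i} for i in range(10)]
new_population = population[:math.floor(10 * 0.2)]  # elites, kept as-is
source_agents = population[:math.floor(10 * 0.4)]   # mutation parents
k = 0
while len(new_population) < 10:
    new_population.append({"name": source_agents[k]["name"], "need_mut": True})
    k = (k + 1) % len(source_agents)

print([a["name"] for a in new_population])
# -> ['0', '1', '0', '1', '2', '3', '0', '1', '2', '3']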
def train():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))

    print("==================================")
    print("Learning rate: {}, batch size: {}".format(Config.LR, Config.BATCH_SIZE))

    act = deepq.learn(
        env,
        # network=Config.ARCHITECTURE,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        lr=Config.LR,
        batch_size=Config.BATCH_SIZE,
        gamma=0.99,
        total_timesteps=Config.TOTAL_TIMESTEPS,
        buffer_size=Config.BUFFER_SIZE,
        print_freq=10,
        checkpoint_freq=Config.CHECKPOINT_FREQ,
        checkpoint_path="{}/ckpts/{}".format(Config.SAVE_PATH, Config.RUN_ID),
        # render=Config.RENDER,
        callback=None,
        exploration_fraction=0.5,
        exploration_final_eps=0.1,
        prioritized_replay=True,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000)

    # Alternative: the same run with the IMPALA CNN backbone.
    # act = deepq.learn(
    #     env,
    #     # network=Config.ARCHITECTURE,
    #     network="impala_cnn",
    #     lr=Config.LR,
    #     batch_size=Config.BATCH_SIZE,
    #     gamma=0.99,
    #     total_timesteps=Config.TOTAL_TIMESTEPS,
    #     buffer_size=Config.BUFFER_SIZE,
    #     print_freq=10,
    #     checkpoint_freq=Config.CHECKPOINT_FREQ,
    #     checkpoint_path="{}/ckpts/{}".format(Config.SAVE_PATH, Config.RUN_ID),
    #     # render=Config.RENDER,
    #     callback=None,
    #     exploration_fraction=0.6,
    #     exploration_final_eps=0.04,
    #     prioritized_replay=True,
    #     train_freq=4,
    #     learning_starts=10000,
    #     target_network_update_freq=1000)

    print("Saving model to {}/saved_models".format(Config.SAVE_PATH))
    act.save("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))
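# Why exploration_fraction=0.5 and exploration_final_eps=0.1 above: baselines'
# deepq anneals epsilon linearly from 1.0 down to exploration_final_eps over
# the first exploration_fraction * total_timesteps steps, then holds it
# constant. Re-derived as a standalone sketch (the 1e6 horizon is illustrative):
def epsilon_at(t, total_timesteps, exploration_fraction=0.5, final_eps=0.1):
    frac = min(t / (exploration_fraction * total_timesteps), 1.0)
    return 1.0 + frac * (final_eps - 1.0)

assert epsilon_at(0, 1_000_000) == 1.0                    # fully random at the start
assert abs(epsilon_at(250_000, 1_000_000) - 0.55) < 1e-9  # halfway down the ramp
assert abs(epsilon_at(600_000, 1_000_000) - 0.1) < 1e-9   # held after the ramp ends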