Code example #1
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package, to visualize result curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes=" baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
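    # offset by rank so every MPI worker seeds its RNGs differently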
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run in parallel on one CPU
    # the VecEnv class allows parallel rollouts
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        #policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
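               # lr and cliprange take callables of the remaining-progress
               # fraction f (1 at the start, 0 at the end): linear annealing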
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
Code example #2
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load(num_levels=0, starting_level=0, paint_vel_info=1,
                               restore_id='start0numlev250_256mts',
                               train_eval=True, test_eval=False,
                               num_eval=100, high_difficulty=False)
    print("High difficulty: " + str(Config.HIGH_DIFFICULTY))
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
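    # a CPU-only config: device_count={'GPU': 0} hides all GPUs from TensorFlow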
    nogpu_config = tf.ConfigProto(device_count = {'GPU': 0})
    with tf.Session(config=nogpu_config) as sess:
    #with tf.Session(config=frac_gpu_config) as sess:
        enjoy_env_sess(sess)
Code example #3
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()
    DIR_NAME = Config.TEST_LOG_NAME

    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.compat.v1.Session(config=config) as sess:
        results = enjoy_env_sess(sess, DIR_NAME)
        print(results)
Code example #4
def main():
    args = setup_utils.setup_and_load(num_levels=250,
                                      starting_level=0,
                                      paint_vel_info=1,
                                      run_id='start0numlev250_256mts_dann_low',
                                      num_envs=32)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    #config = tf.ConfigProto()
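    # cap this process at 30% of GPU memory so several runs can share one card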
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})
    #config.gpu_options.allow_growth = True # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    print("Num envs: " + str(Config.NUM_ENVS))
    total_timesteps = int(256e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=frac_gpu_config):
    #with tf.Session(config=nogpu_config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
Code example #5
File: nr_train_agent.py    Project: Desein-Yang/GARL
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
Code example #6
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(120e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        
        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                    env=env,
                    save_interval=save_interval,
                    nsteps=Config.NUM_STEPS,
                    nminibatches=Config.NUM_MINIBATCHES,
                    lam=0.95,
                    gamma=Config.GAMMA,
                    noptepochs=Config.PPO_EPOCHS,
                    log_interval=1,
                    ent_coef=Config.ENTROPY_COEFF,
                    lr=lambda f: f * Config.LEARNING_RATE,
                    cliprange=lambda f: f * 0.2,
                    total_timesteps=total_timesteps)
Code example #7
File: evaluate_dqn.py    Project: cyrilli/coinrun
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))
    print("load path:")
    print("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))
    act = deepq.learn(
        env,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
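        # each tuple is (num_filters, kernel_size, stride): the classic Nature-DQN conv stack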
        hiddens=[256],
        total_timesteps=0,
        load_path="{}/saved_models/{}.pkl".format(Config.SAVE_PATH,
                                                  Config.RUN_ID)
        # load_path="{}/ckpts/{}/model".format(Config.SAVE_PATH, Config.RUN_ID)
    )

    num_episodes = 500
    # while True:
    episode_rew_ls = []
    for i in range(num_episodes):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if Config.RENDER:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        episode_rew_ls.append(episode_rew)
        print("Episode reward", episode_rew)
    print("Avg episode reward", np.mean(episode_rew_ls))
    print("Var episode reward", np.std(episode_rew_ls))
Code example #8
File: test_agent.py    Project: Desein-Yang/GARL
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()
    with tf.Session() as sess:
        enjoy_env_sess(sess)
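
enjoy_env_sess is defined elsewhere in each of these projects, so this example is hard to read on its own. As a rough orientation only, here is a minimal, hypothetical sketch of what such an evaluation loop could look like, reusing the create_act_model / load_params_for_scope pattern from example #9 (all names and signatures here are assumptions, not the projects' actual code):

def enjoy_env_sess(sess, num_episodes=10):
    # hypothetical sketch, not the projects' implementation
    env = wrappers.add_final_wrappers(utils.make_general_env(1, seed=0))
    agent = create_act_model(sess, env, 1)       # assumed helper, as in example #9
    sess.run(tf.compat.v1.global_variables_initializer())
    utils.load_params_for_scope(sess, 'model')   # restore trained weights
    obs = env.reset()
    state, dones = agent.initial_state, [False]
    for _ in range(num_episodes):
        episode_rew = 0.0
        while True:
            actions, _, state, _ = agent.step(obs, state, dones)
            obs, rews, dones, _ = env.step(actions)
            episode_rew += rews[0]
            if dones[0]:
                break
        print('episode reward', episode_rew)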
Code example #9
def main(sess):

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000

    if Config.EXTRACT_SEED != -1:
        seed = Config.EXTRACT_SEED
    if Config.EXTRACT_RANK != -1:
        rank = Config.EXTRACT_RANK

    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    use_policy = (Config.RESTORE_ID != '')

    nenvs = Config.NUM_ENVS
    total_timesteps = int(502e6)
    env = utils.make_general_env(nenvs, seed=rank)

    if use_policy:
        agent = create_act_model(sess, env, nenvs)
        sess.run(tf.compat.v1.global_variables_initializer())
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')

    # make directory
    DIR_NAME = './VAE/records/'
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME, exist_ok=True)
    
    # set file name
    filename = os.path.join(DIR_NAME, Config.get_save_file() + "_" + str(seed * 100 + rank) + ".npz")
    
    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
        obs[:] = env.reset()
        dones = [False for _ in range(nenv)]
        
        # take one random step first to flush out noisy initial inputs
        actions = [env.action_space.sample() for _ in range(nenv)]
        actions = np.array(actions)
        obs[:], rewards, dones, _ = env.step(actions)
        state = agent.initial_state if use_policy else None  # None when acting randomly
        
        mb_obs, mb_rewards, mb_actions, mb_next_obs, mb_dones = [], [], [], [], []
        # For n in range number of steps
        for _ in range(400):
            # Given observations, get actions, values, states and neglogpacs
            # (obs is already populated because env.reset() was called above)
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = [env.action_space.sample() for _ in range(nenv)]
            actions = np.array(actions)
            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)
            
            # Take actions in the env and observe the results
            # (the discarded infos contain a ton of useful information)
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts: arrays of shape (nsteps, nenv, ...)
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=bool)  # plain bool: np.bool is removed in modern NumPy
        
        #np.savez_compressed(filename, obs=mb_obs, action=mb_actions, next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)
        return filename
Code example #10
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = [env.action_space.sample() for _ in range(nenv)]
            actions = np.array(actions)
            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)
            
            # Take actions in the env and observe the results
            # (the discarded infos contain a ton of useful information)
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts: arrays of shape (nsteps, nenv, ...)
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=bool)  # plain bool: np.bool is removed in modern NumPy
        
        #np.savez_compressed(filename, obs=mb_obs, action=mb_actions, next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)
        return filename
        
if __name__ == '__main__':
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()
    with tf.compat.v1.Session() as sess:
        main(sess)
Code example #11
def main():
    # general setup

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # prepare directory
    sub_dir = utils.file_to_path(Config.get_save_file(base_name="tmp"))
    if os.path.isdir(sub_dir):
        shutil.rmtree(path=sub_dir)
    os.mkdir(sub_dir)

    # hyperparams
    nenvs = Config.NUM_ENVS
    total_timesteps = Config.TIMESTEPS
    population_size = Config.POPULATION_SIZE
    timesteps_per_agent = Config.TIMESTEPS_AGENT
    worker_count = Config.WORKER_COUNT
    passthrough_perc = Config.PASSTHROUGH_PERC
    mutating_perc = Config.MUTATING_PERC

    # create environment
    def make_env():
        env = utils.make_general_env(nenvs, seed=rank)
        env = wrappers.add_final_wrappers(env)
        return env

    # setup session and workers, and therefore tensorflow ops
    graph = tf.get_default_graph()
    sess = tf.Session(graph=graph)

    policy = policies.get_policy()

    workers = [
        Worker(sess, i, nenvs, make_env, policy, sub_dir)
        for i in range(worker_count)
    ]

    tb_writer = TB_Writer(sess)

    def clean_exit():

        for worker in workers:
            worker.thread.join()

        utils.mpi_print("")
        utils.mpi_print("== total duration",
                        "{:.1f}".format(time.time() - t_first_start), " s ==")
        utils.mpi_print(" exit...")

        # save best performing agent
        population.sort(key=lambda k: k['fit'], reverse=True)
        workers[0].restore_model(name=population[0]["name"])
        workers[0].dump_model()

        # cleanup
        sess.close()
        shutil.rmtree(path=sub_dir)

    # load data from restore point and seed the whole population
    loaded_name = None
    if workers[0].try_load_model():
        loaded_name = str(uuid.uuid1())
        workers[0].save_model(name=loaded_name)

    # initialise population
    # either all random and no mutations pending
    # or all from restore point with all but one to be mutated
    population = [{
        "name": loaded_name or str(uuid.uuid1()),
        "fit": -1,
        "need_mut": loaded_name != None and i != 0,
        "age": -1,
        "mean_ep_len": -1
    } for i in range(population_size)]

    utils.mpi_print("== population size", population_size, ", t_agent ",
                    timesteps_per_agent, " ==")

    t_first_start = time.time()
    try:
        # main loop
        generation = 0
        timesteps_done = 0
        while timesteps_done < total_timesteps:
            t_generation_start = time.time()

            utils.mpi_print("")
            utils.mpi_print("__ Generation", generation, " __")

            # initialise and evaluate all new agents
            for agent in population:
                #if agent["fit"] < 0: # test/
                if True:  # test constant reevaluation, to dismiss "lucky runs" -> seems good

                    # pick worker from pool and let it work on the agent
                    not_in_work = True
                    while not_in_work:
                        for worker in workers:
                            if worker.can_take_work():
                                worker.work(agent, timesteps_per_agent)
                                not_in_work = False
                                break

                    timesteps_done += timesteps_per_agent * nenvs

            for worker in workers:
                worker.thread.join()

            # sort by fitness
            population.sort(key=lambda k: k["fit"], reverse=True)

            # print stuff
            fitnesses = [agent["fit"] for agent in population]
            ages = [agent["age"] for agent in population]
            ep_lens = [agent["mean_ep_len"] for agent in population]

            utils.mpi_print(*["{:5.3f}".format(f) for f in fitnesses])
            utils.mpi_print(*["{:5}".format(a) for a in ages])
            utils.mpi_print("__ average fit", "{:.1f}".format(
                np.mean(fitnesses)), ", t_done", timesteps_done, ", took",
                            "{:.1f}".format(time.time() - t_generation_start),
                            "s", ", total",
                            "{:.1f}".format(time.time() - t_first_start),
                            "s __")

            # log stuff
            tb_writer.log_scalar(np.mean(fitnesses), "mean_fit",
                                 timesteps_done)
            tb_writer.log_scalar(np.median(fitnesses), "median_fit",
                                 timesteps_done)
            tb_writer.log_scalar(np.max(fitnesses), "max_fit", timesteps_done)
            tb_writer.log_scalar(np.mean(ages), "mean_age", timesteps_done)
            ep_lens_mean = np.nanmean(ep_lens)
            if ep_lens_mean:
                tb_writer.log_scalar(ep_lens_mean, "mean_ep_lens",
                                     timesteps_done)

            # cleanup to prevent disk clutter
            to_be_removed = set(
                re.sub(r'\..*$', '', f) for f in os.listdir(sub_dir)) - set(
                    [agent["name"] for agent in population])
            for filename in to_be_removed:
                os.remove(sub_dir + "/" + filename + ".index")
                os.remove(sub_dir + "/" + filename + ".data-00000-of-00001")

            # break when times up
            if not timesteps_done < total_timesteps:
                break

            # mark weak agents for replacement
            cutoff_passthrough = math.floor(population_size * passthrough_perc)
            cutoff_mutating = math.floor(population_size * mutating_perc)
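            # elitism: the top passthrough_perc agents survive unchanged, while
            # the top mutating_perc serve as parents for the mutated replacements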
            source_agents = population[:cutoff_mutating]

            new_population = population[:cutoff_passthrough]

            k = 0
            while len(new_population) < population_size:
                new_agent = {
                    # take the name from the source agent, so mutation knows the parent
                    "name": source_agents[k]["name"],
                    "fit": -1,
                    "need_mut": True,
                    "age": 0
                }
                new_population.append(new_agent)
                k = (k + 1) % len(source_agents)

            population = new_population
            generation += 1

        clean_exit()
    except KeyboardInterrupt:
        clean_exit()

    return 0
Code example #12
def train():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))
    print("==================================")
    print("Learning rate :{}, batch size: {}".format(Config.LR,
                                                     Config.BATCH_SIZE))

    act = deepq.learn(
        env,
        # network=Config.ARCHITECTURE,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        lr=Config.LR,
        batch_size=Config.BATCH_SIZE,
        gamma=0.99,
        total_timesteps=Config.TOTAL_TIMESTEPS,
        buffer_size=Config.BUFFER_SIZE,
        print_freq=10,
        checkpoint_freq=Config.CHECKPOINT_FREQ,
        checkpoint_path="{}/ckpts/{}".format(Config.SAVE_PATH, Config.RUN_ID),
        # render=Config.RENDER,
        callback=None,
        exploration_fraction=0.5,
        exploration_final_eps=0.1,
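        # epsilon is annealed linearly over the first half of training
        # (exploration_fraction=0.5), ending at exploration_final_eps=0.1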
        prioritized_replay=True,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000)
    # act = deepq.learn(
    #                 env,
    #                 # network=Config.ARCHITECTURE,
    #                 network="impala_cnn",
    #                 lr=Config.LR,
    #                 batch_size=Config.BATCH_SIZE,
    #                 gamma=0.99,
    #                 total_timesteps=Config.TOTAL_TIMESTEPS,
    #                 buffer_size=Config.BUFFER_SIZE,
    #                 print_freq=10,
    #                 checkpoint_freq=Config.CHECKPOINT_FREQ,
    #                 checkpoint_path="{}/ckpts/{}".format(Config.SAVE_PATH, Config.RUN_ID),
    #                 # render=Config.RENDER,
    #                 callback=None,
    #                 exploration_fraction=0.6,
    #                 exploration_final_eps=0.04,
    #                 prioritized_replay=True,
    #                 train_freq=4,
    #                 learning_starts=10000,
    #                 target_network_update_freq=1000
    #                 )
    print("Saving model to {}/saved_models".format(Config.SAVE_PATH))
    act.save("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))