Example No. 1
def generate_level_replay(ppo,mdp_id,wandb_save_dir,nbatch_train, nsteps, max_grad_norm, ob_space, ac_space, nsteps_rollout=782):
	ppo_graph = tf.Graph()
	print('Created graph')
	observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
	action_space = DiscreteG(15)

	gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_id), 
									 paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
	venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
	venv_eval = VecExtractDictObs(venv_eval, "rgb")
	venv_eval = VecMonitor(
		venv=venv_eval, filename=None, keep_buf=100,
	)
	venv_eval = VecNormalize(venv=venv_eval, ob=False)
	venv_eval = wrappers.add_final_wrappers(venv_eval)
	print('Created env')
	graph_one_vars = ppo_graph.get_all_collection_keys()

	model_path = wandb_save_dir+'/%d/ppo-1'%mdp_id

	with tf.compat.v1.Session(graph=ppo_graph,config=tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)) as sess_1:
		with tf.compat.v1.variable_scope("model_%d"%np.random.randint(0,100000,1).item()):
			ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
			initialize = tf.compat.v1.global_variables_initializer()
			sess_1.run(initialize)
			print('Inited session')
		model_saver = tf.train.import_meta_graph(model_path+'.meta')
		model_saver.restore(sess_1, save_path=model_path)
		print('Restored PPO')
		mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(ppo_model_1,venv_eval,nsteps=nsteps_rollout, param_vals='pretrained')
		print('Collected level data')

	venv_eval.close()

	return mb_obs_1, mb_actions_1, mb_rewards_1
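The helper collect_data is called here (and again in Example No. 22) but is not defined in these snippets; the following is only a minimal sketch of such a rollout collector, assuming a PPO-style model with a step(obs) method, a baselines-style vectorized env, and the 'random' mode passing a plain policy function:

import numpy as np

def collect_data(model, venv, nsteps, param_vals='pretrained'):
    # Roll the policy out for nsteps and stack observations, actions and rewards.
    obs = venv.reset()
    mb_obs, mb_actions, mb_rewards = [], [], []
    for _ in range(nsteps):
        if param_vals == 'random':
            actions = model(obs)                 # plain function of the observations
        else:
            actions, _, _, _ = model.step(obs)   # PPO-style model object
        mb_obs.append(obs)
        mb_actions.append(actions)
        obs, rewards, dones, infos = venv.step(actions)
        mb_rewards.append(rewards)
    return np.asarray(mb_obs), np.asarray(mb_actions), np.asarray(mb_rewards)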
Example No. 2
def run_state_test(env_name):
    env_kwargs = dict(num=2, env_name=env_name, rand_seed=0)
    env = ProcgenGym3Env(**env_kwargs)
    rng = np.random.RandomState(0)
    actions = [
        gym3.types_np.sample(env.ac_space, bshape=(env.num, ), rng=rng)
        for _ in range(NUM_STEPS)
    ]
    ref_rollouts = run_in_subproc(gather_rollouts,
                                  env_kwargs=env_kwargs,
                                  actions=actions)
    assert len(ref_rollouts) == NUM_STEPS + 1

    # run the same thing a second time
    basic_rollouts = run_in_subproc(gather_rollouts,
                                    env_kwargs=env_kwargs,
                                    actions=actions)
    assert_rollouts_identical(ref_rollouts, basic_rollouts)

    # run but save states
    state_rollouts = run_in_subproc(gather_rollouts,
                                    env_kwargs=env_kwargs,
                                    actions=actions,
                                    get_state=True)
    assert_rollouts_identical(ref_rollouts, state_rollouts)

    # make sure states are the same
    state_rollouts_2 = run_in_subproc(gather_rollouts,
                                      env_kwargs=env_kwargs,
                                      actions=actions,
                                      get_state=True)
    assert_rollouts_identical(ref_rollouts, state_rollouts_2)
    assert_rollouts_identical(state_rollouts, state_rollouts_2)

    # save and restore at each timestep
    state_rollouts_3 = run_in_subproc(
        gather_rollouts,
        env_kwargs=env_kwargs,
        actions=actions,
        get_state=True,
        set_state_every_step=True,
    )
    assert_rollouts_identical(ref_rollouts, state_rollouts_3)
    assert_rollouts_identical(state_rollouts, state_rollouts_3)

    # restore a point in the middle of the rollout and make sure that the remainder of the data looks the same
    offset = NUM_STEPS // 2
    state_restore_rollouts = run_in_subproc(
        gather_rollouts,
        env_kwargs={
            **env_kwargs, "rand_seed": 1
        },
        actions=actions[offset:],
        state=state_rollouts[offset]["state"],
        get_state=True,
    )
    assert_rollouts_identical(ref_rollouts[offset:], state_restore_rollouts)
    assert_rollouts_identical(state_rollouts[offset:], state_restore_rollouts)
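run_in_subproc and assert_rollouts_identical are not defined in this example; below is a minimal sketch of run_in_subproc, assuming its only job is to run each rollout in a fresh process so no environment state leaks between runs (this multiprocessing version is an assumption, not the original helper):

import multiprocessing as mp

def _subproc_entry(queue, fn, kwargs):
    queue.put(fn(**kwargs))

def run_in_subproc(fn, **kwargs):
    # Run fn(**kwargs) in a spawned child process and return its (picklable) result.
    ctx = mp.get_context("spawn")
    queue = ctx.Queue()
    proc = ctx.Process(target=_subproc_entry, args=(queue, fn, kwargs))
    proc.start()
    result = queue.get()
    proc.join()
    return result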
Example No. 3
def make_env(steps_per_env):
	observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
	action_space = DiscreteG(15)
	if Config.FIRST_PHASE == 'exploration':
		# baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE, start_level=Config.START_LEVEL)
	else:
		# baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE, start_level=Config.START_LEVEL)
	if Config.SECOND_PHASE == 'exploration':
		# baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,  paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
		gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,  paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE, start_level=Config.START_LEVEL)
	elif Config.SECOND_PHASE != "None":
		# baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
		gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE, start_level=Config.START_LEVEL)
	else:
		baseline_vec_adapt = gym3_env_adapt = None
	
	venv_train = FakeEnv(gym3_env_train, observation_space, action_space)
	venv_train = VecExtractDictObs(venv_train, "rgb")
	if Config.SECOND_PHASE != "None":
		venv_adapt = FakeEnv(gym3_env_adapt, observation_space, action_space)   
		venv_adapt = VecExtractDictObs(venv_adapt, "rgb")
	venv_train = VecMonitor(
		venv=venv_train, filename=None, keep_buf=100,
	)
	if Config.SECOND_PHASE != "None":
		venv_adapt = VecMonitor(
			venv=venv_adapt, filename=None, keep_buf=100,
		)

	venv_train = VecNormalize(venv=venv_train, ob=False)
	venv_train = wrappers.add_final_wrappers(venv_train)
	if Config.SECOND_PHASE != "None":
		venv_adapt = VecNormalize(venv=venv_adapt, ob=False)
		venv_adapt = wrappers.add_final_wrappers(venv_adapt)

		venv = wrappers.DistributionShiftWrapperVec(env_list=[venv_train, venv_adapt], steps_per_env=steps_per_env) 
	else:
		venv = venv_train
		venv_adapt = venv_train = None
		venv.current_env_steps_left = steps_per_env

	return venv, venv_train, venv_adapt
Example No. 4
def run_episode_gym3_vec_env(u):

    env = ProcgenGym3Env(num=population_size, env_name="heist")
    rewards = np.zeros(population_size)
    for _ in range(number_env_steps):
        env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num,)))
        rew, obs, first = env.observe()

        rewards += rew
    return rewards
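population_size and number_env_steps are module-level globals in the original script; a minimal, hypothetical way to exercise the function (the values are arbitrary, and the u argument is unused inside the body):

population_size = 4
number_env_steps = 256
print(run_episode_gym3_vec_env(u=None))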
Example No. 5
def run():
    env = ProcgenGym3Env(num=2, env_name="coinrun", render_mode="rgb_array")
    env = gym3.ViewerWrapper(env, info_key="rgb")
    step = 0
    for i in range(100):

        env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num, )))
        rew, obs, first = env.observe()
        print(f"step {step} reward {rew} first {first}")
        step += 1
Example No. 6
def get_procgen_venv(*, env_id, num_envs, rendering=False, **env_kwargs):
    if rendering:
        env_kwargs["render_human"] = True

    env = ProcgenGym3Env(num=num_envs, env_name=env_id, **env_kwargs)
    print(env)
    env = gym3.ExtractDictObWrapper(env, "rgb")

    if rendering:
        env = gym3.ViewerWrapper(env, info_key="rgb")
    return env
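A brief usage sketch for the helper above; because of ExtractDictObWrapper, observe() returns the "rgb" array directly rather than a dict (the env_id and shapes below are assumptions):

import gym3

env = get_procgen_venv(env_id="coinrun", num_envs=4)
rew, obs, first = env.observe()     # obs is a (4, 64, 64, 3) uint8 array
env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num,)))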
Example No. 7
def test_multi_speed(env_name, num_envs, benchmark):
    env = ProcgenGym3Env(num=num_envs, env_name=env_name)

    actions = np.zeros([env.num])

    def rollout(max_steps):
        step_count = 0
        while step_count < max_steps:
            env.act(actions)
            env.observe()
            step_count += 1

    benchmark(lambda: rollout(1000))
Example No. 8
def collect_observations():
    rng = np.random.RandomState(0)
    env = ProcgenGym3Env(num=2, env_name=env_name, rand_seed=23)
    _, obs, _ = env.observe()
    obses = [obs["rgb"]]
    for _ in range(128):
        env.act(
            rng.randint(low=0,
                        high=env.ac_space.eltype.n,
                        size=(env.num, ),
                        dtype=np.int32))
        _, obs, _ = env.observe()
        obses.append(obs["rgb"])
    return np.array(obses)
def run_experiment(
    experiment_name,
    environment_name,
    log,
    graph,
    random_seeds,
    n_episodes,
    n_steps,
    n_envs,
    epsilon,
    batch_sz,
    critic_lr,
    actor_lr,
    gamma,
    critic_epochs,
):

    exp_path = create_exp_dir(experiment_name)

    agent = PPO(
        actor_lr=actor_lr,
        critic_lr=critic_lr,
        batch_sz=batch_sz,
        gamma=gamma,
        epsilon=epsilon,
        critic_epochs=critic_epochs,
    )

    # agent = RandomAgent(n_envs=n_envs)

    env = ProcgenGym3Env(
        num=n_envs,
        env_name="coinrun",
        render_mode="rgb_array",
        center_agent=False,
        num_levels=1,
        start_level=2,
    )

    train(agent, env, n_episodes, n_steps)
    generate_graphs(agent, exp_path)

    print(len(agent.buffer.mean_reward))
    print(np.array(agent.buffer.mean_reward).shape)
    print(np.stack(agent.buffer.mean_reward).shape)
    print(agent.buffer.mean_reward)

    plt.plot(agent.buffer.mean_reward)
    plt.show()
    """
Example No. 10
def gather_rollouts(env_kwargs,
                    actions,
                    state=None,
                    get_state=False,
                    set_state_every_step=False):
    env = ProcgenGym3Env(**env_kwargs)
    if state is not None:
        env.callmethod("set_state", state)
    result = [dict(ob=env.observe(), info=env.get_info())]
    if get_state:
        result[-1]["state"] = env.callmethod("get_state")
    if set_state_every_step:
        env.callmethod("set_state", result[-1]["state"])
    for act in actions:
        env.act(act)
        result.append(dict(ob=env.observe(), info=env.get_info()))
        if get_state:
            result[-1]["state"] = env.callmethod("get_state")
        if set_state_every_step:
            env.callmethod("set_state", result[-1]["state"])
    return result
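A short usage sketch for gather_rollouts, mirroring the save/restore test in Example No. 2 (the number of actions and the restore offset are arbitrary):

import gym3
import numpy as np
from procgen import ProcgenGym3Env

env_kwargs = dict(num=2, env_name="coinrun", rand_seed=0)
tmp_env = ProcgenGym3Env(**env_kwargs)
rng = np.random.RandomState(0)
actions = [gym3.types_np.sample(tmp_env.ac_space, bshape=(tmp_env.num,), rng=rng)
           for _ in range(8)]

rollout = gather_rollouts(env_kwargs, actions, get_state=True)
# Resume from the state saved after step 4 and replay the remaining actions.
resumed = gather_rollouts(env_kwargs, actions[4:], state=rollout[4]["state"])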
Example No. 11
def make_interactive(vision, record_dir, **kwargs):
    info_key = None
    ob_key = None
    if vision == "human":
        info_key = "rgb"
        kwargs["render_mode"] = "rgb_array"
    else:
        ob_key = "rgb"

    env = ProcgenGym3Env(num=1, **kwargs)
    if record_dir is not None:
        env = VideoRecorderWrapper(
            env=env, directory=record_dir, ob_key=ob_key, info_key=info_key
        )
    h, w, _ = env.ob_space["rgb"].shape
    return ProcgenInteractive(
        env,
        ob_key=ob_key,
        info_key=info_key,
        width=w * 12,
        height=h * 12,
    )
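A usage sketch, assuming the extra keyword arguments are forwarded to ProcgenGym3Env and that ProcgenInteractive exposes the usual gym3 Interactive run() loop:

ia = make_interactive("human", record_dir=None, env_name="coinrun")
ia.run()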
Example No. 12
def make_env(level_num):
    venv = ProcgenGym3Env(num=num_envs,
                          env_name=env_name,
                          num_levels=1,
                          start_level=level_num,
                          distribution_mode="easy")
    return venv

states = []
for level_num in range(num_levels):
    env = make_env(level_num)
    #env = gym3.ViewerWrapper(env, info_key="rgb")
    #env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num,)))
    rew, obs, first = env.observe()
    states.append(obs)
    #print(f"step {step} reward {rew} first {first}")
    #step += 1

print(len(states))
"""
env = ProcgenGym3Env(
    num=1,
    env_name="coinrun",
    render_mode="rgb_array",
    center_agent=False,
    num_levels=1,
    start_level=2,
)

env = gym3.ViewerWrapper(env, info_key="rgb")

for i in tqdm(range(100)):

    env.act(gym3.types_np.sample(env.ac_space, bshape=(env.num, )))
    rew, obs, first = env.observe()
    #states.append(obs)
    #print(f"step {step} reward {rew} first {first}")
    #step += 1
Example No. 14
def ProcgenEnv(num_envs, env_name, **kwargs):
    return ToBaselinesVecEnv(
        ProcgenGym3Env(num=num_envs, env_name=env_name, **kwargs))
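A usage sketch of the baselines-style wrapper above: unlike gym3's act/observe API, the returned VecEnv exposes reset() and step(), and observations stay a dict until a wrapper such as VecExtractDictObs unwraps them (shapes are assumptions based on the other examples):

import numpy as np

venv = ProcgenEnv(num_envs=4, env_name="coinrun", num_levels=0)
obs = venv.reset()                                   # obs["rgb"] has shape (4, 64, 64, 3)
obs, rews, dones, infos = venv.step(np.zeros(4, dtype=np.int32))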
env_name = "heist"

distribution_mode = config.environment["distribution_mode"]
episode_steps = config.environment["episode_steps"]

reward_sum = 0
number_validation_runs = 100

num_levels_solved = 0

for env_seed in range(number_validation_runs):

    env = ProcgenGym3Env(num=1,
                         env_name=env_name,
                         use_backgrounds=False,
                         distribution_mode=distribution_mode,
                         num_levels=1,
                         start_level=env_seed,
                         render_mode="rgb_array")
    env = gym3.ViewerWrapper(env, info_key="rgb")

    _, ob, _ = env.observe()
    observations = ob["rgb"]
    ob = ep_runner.transform_ob(observations)

    reward = 0

    brain.reset()

    for i in range(episode_steps):
        action = brain.step(ob.flatten())
    def eval_fitness(self,
                     evaluations,
                     episode_steps: int = 500,
                     break_all_episodes: bool = False):
        """

        :param evaluations: List of 3-tuples (individual, env_seed, number_of_rounds)
        :param episode_steps: Number of steps per episode
        :param break_all_episodes: When one episode is done, break all episodes
        :return:
        """
        # Extract parameters, this list of lists is necessary since pool.map only accepts a single argument
        # See here: http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments
        # individual = evaluations[0]
        env_seed = evaluations[0][1]
        number_of_rounds = evaluations[0][2]

        brains = []
        for single_evaluation in evaluations:
            brains.append(
                self.brain_class(input_size=self.input_size,
                                 output_size=self.output_size,
                                 individual=single_evaluation[0],
                                 configuration=self.brain_configuration,
                                 brain_state=self.brain_state))

        fitness_total = 0
        times_episodes = []
        for i in range(number_of_rounds):
            # num_threads=8 can be set here, don't know how it effects performance yet
            env = ProcgenGym3Env(num=len(evaluations),
                                 env_name="heist",
                                 use_backgrounds=False,
                                 distribution_mode=self.distribution_mode,
                                 num_levels=1,
                                 start_level=env_seed + i)

            rew, ob, first = env.observe()
            observations = ob["rgb"]
            ob = self.transform_ob(observations)

            # print(torch.cuda.memory_summary(device=self.device))
            # print("Memory: {}".format(torch.cuda.memory_allocated(device=self.device)))

            # pool = mp.get_context("spawn").Pool(processes=os.cpu_count())

            fitness_current = np.zeros(len(evaluations))
            # times_actions = []

            time_s = time.time()
            for i in range(episode_steps):

                # actions = pool.starmap(self.get_actions, zip(brains, ob))
                # time_actions_s = time.time()
                actions = self.calculate_actions_trivial(brains, ob)
                # times_actions.append(time.time() - time_actions_s)
                actions = np.argmax(actions, axis=1)

                env.act(actions)
                rew, ob, first = env.observe()

                if any(first) and break_all_episodes:
                    print(
                        "break_episodes: One or more environments are done, stopping all episodes"
                    )
                    break

                observations = ob["rgb"]
                ob = self.transform_ob(observations)
                # print(torch.cuda.memory_summary(device=self.device))
                # print("Memory: {}".format(torch.cuda.memory_allocated(device=self.device)))

                # if i > 10:
                #     break

                fitness_current += rew
            print("Episodes with VecEnv finished")
            # print("Times actions Mean {}".format(np.mean(times_actions)))
            # print("Times actions Std {}".format(np.std(times_actions)))
            # print("Times actions Max {}".format(np.max(times_actions)))
            # print("Times actions Min {}".format(np.min(times_actions)))
            times_episodes.append(time.time() - time_s)
            # break
            fitness_total += fitness_current

        return fitness_total / number_of_rounds, times_episodes
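A call sketch that follows the docstring above: evaluations is a list of (individual, env_seed, number_of_rounds) tuples, and the names population, individual and runner are hypothetical stand-ins for whatever owns this method:

# Evaluate a whole population on the same level seed, three rounds per individual.
evaluations = [(individual, 0, 3) for individual in population]
fitness_per_individual, episode_times = runner.eval_fitness(evaluations, episode_steps=500)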
Example No. 17
def main():
    print('Parsing args')
    args = setup_utils.setup_and_load()
    print('Setting up MPI')
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    print('Setting config')
    # coinrun version, allows you to specify how many GPUs you want this run to use
    #utils.setup_mpi_gpus()

    # baselines version, just sets the number of GPUs to the -n flag 
    #setup_mpi_gpus()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(Config.NUM_GPUS)
    
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101
    
    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(25e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(8e6)
    elif Config.VERY_SHORT_TRAINING:
        total_timesteps = int(500e3)
    elif Config.VERY_VERY_SHORT_TRAINING:
        total_timesteps = int(50e3)
    save_interval = args.save_interval

    #env = utils.make_general_env(nenvs, seed=rank)
    #print (env)

    mpi_print(Config.ENVIRONMENT)
    venv, venv_train, venv_adapt = make_env(total_timesteps//2) #switch "easy" -> "exploration" halfway
    # import ipdb;ipdb.set_trace()
    observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
    action_space = DiscreteG(15)
    
    # baseline_vec_eval = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=0, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
    gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=0, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)

    venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
    venv_eval = VecExtractDictObs(venv_eval, "rgb")
    venv_eval = VecMonitor(
        venv=venv_eval, filename=None, keep_buf=100,
    )
    venv_eval = VecNormalize(venv=venv_eval, ob=False)
    venv_eval = wrappers.add_final_wrappers(venv_eval)

    
    with tf.compat.v1.Session(config=config) as sess:
        
        if Config.AGENT == 'ppo':
            from coinrun import ppo2 as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_rnd':
            from coinrun import ppo2_rnd as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_diayn':
            from coinrun import ppo2_diayn as agent
            from coinrun import policies
        elif Config.AGENT == 'ppg':
            from coinrun import ppo2_ppg as agent
            from coinrun import policies
        elif Config.AGENT == 'ppg_ssl':
            from coinrun import ppo2_ppg_ssl as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_goal':
            from coinrun import ppo2_goal as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_curl':
            from coinrun import ppo2_curl as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_goal_bogdan' or Config.AGENT == 'ppo_ctrl':
            from coinrun import ppo2_goal_bogdan as agent
            from coinrun import policies_bogdan as policies
        elif Config.AGENT == 'ppg_cluster':
            from coinrun import ppo2_ppg_sinkhorn as agent
            from coinrun import policies_ppg_sinkhorn as policies
        elif Config.AGENT == 'ppo_bisimulation':
            from coinrun import ppo2_bisimulation as agent
            from coinrun import policies_bisimulation as policies
        elif Config.AGENT == 'ppo_pse':
            from coinrun import ppo2_pse as agent
            from coinrun import policies_pse as policies
        policy = policies.get_policy()

        final_eprew_eval = agent.learn(policy=policy,
                    env=venv,
                    eval_env=venv_eval,
                    save_interval=save_interval,
                    nsteps=Config.NUM_STEPS,
                    nminibatches=Config.NUM_MINIBATCHES,
                    lam=0.95,
                    gamma=Config.GAMMA,
                    noptepochs=Config.PPO_EPOCHS,
                    log_interval=1, #10,
                    ent_coef=Config.ENTROPY_COEFF,
                    lr=lambda f : f * Config.LEARNING_RATE,
                    lr_ctrl=lambda f : f * Config.LEARNING_RATE_CTRL,
                    lr_myow=lambda f : f * Config.LEARNING_RATE_MYOW,
                    cliprange=lambda f : f * 0.2,
                    total_timesteps=total_timesteps)

        return final_eprew_eval
Example No. 18
"""
Example random agent script using the gym3 API to demonstrate that procgen works
"""

from gym3 import types_np
from procgen import ProcgenGym3Env

env = ProcgenGym3Env(num=1, env_name="coinrun")
step = 0
while True:
    env.act(types_np.sample(env.ac_space, bshape=(env.num, )))
    rew, obs, first = env.observe()
    print(f"step {step} reward {rew} first {first}")
    if step > 0 and first:
        break
    step += 1
Example No. 19
def eval(*,
         network,
         seed=None,
         nsteps=2048,
         ent_coef=0.0,
         vf_coef=0.5,
         max_grad_norm=0.5,
         gamma=0.99,
         lam=0.95,
         log_interval=10,
         nminibatches=4,
         noptepochs=4,
         load_path=None,
         model_fn=None,
         update_fn=None,
         init_fn=None,
         mpi_rank_weight=1,
         comm=None,
         policy=None,
         nenvs=None,
         ob_space=None,
         ac_space=None,
         nbatch=None,
         nbatch_train=None,
         model=None,
         num_trials=3,
         num_levels=500,
         start_level=0,
         gui=False,
         args=None,
         **network_kwargs):
    if load_path is not None:
        model.load(load_path)

    if init_fn is not None:
        init_fn()

    for trial in range(num_trials):
        # Start total timer
        tfirststart = time.perf_counter()

        logger.info('Stepping environment...')

        avg_reward = 0
        avg_steps = 0

        for num_level in tqdm(range(start_level, start_level + num_levels)):
            if gui:
                env = ViewerWrapper(ProcgenGym3Env(num=1,
                                                   env_name="fruitbot",
                                                   num_levels=1,
                                                   start_level=num_level,
                                                   distribution_mode='easy',
                                                   render_mode="rgb_array"),
                                    info_key='rgb')
            else:
                env = ProcgenGym3Env(num=1,
                                     env_name="fruitbot",
                                     num_levels=1,
                                     start_level=num_level,
                                     distribution_mode='easy')
            _, obs, _ = env.observe()
            step = 0
            total_reward = 0
            while True:
                actions, _, _, _ = model.step(obs['rgb'])
                env.act(actions)
                rew, obs, first = env.observe()
                total_reward += rew
                if step > 0 and first:
                    break
                step += 1
            avg_reward += total_reward
            avg_steps += step

        avg_reward = avg_reward / num_levels
        avg_steps = avg_steps / num_levels

        logger.info('Done.')

        # End timer
        tnow = time.perf_counter()

        logger.logkv('eval_eprewmean', avg_reward)
        logger.logkv('eval_eplenmean', avg_steps)
        logger.logkv('misc/time_elapsed', tnow - tfirststart)

        logger.dumpkvs()

    return model
Example No. 20
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    # coinrun version, allows you to specify how many GPUs you want this run to use
    #utils.setup_mpi_gpus()

    # baselines version, just sets the number of GPUs to the -n flag
    #setup_mpi_gpus()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(Config.NUM_GPUS)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        #total_timesteps = int(120e6)
        total_timesteps = int(25e6)
    elif Config.VERY_SHORT_TRAINING:
        total_timesteps = int(5e6)
    save_interval = args.save_interval

    #env = utils.make_general_env(nenvs, seed=rank)
    #print (env)

    print(Config.ENVIRONMENT)

    baseline_vec = ProcgenEnv(num_envs=nenvs,
                              env_name=Config.ENVIRONMENT,
                              num_levels=Config.NUM_LEVELS,
                              paint_vel_info=Config.PAINT_VEL_INFO,
                              distribution_mode="easy")
    gym3_env = ProcgenGym3Env(num=nenvs,
                              env_name=Config.ENVIRONMENT,
                              num_levels=Config.NUM_LEVELS,
                              paint_vel_info=Config.PAINT_VEL_INFO,
                              distribution_mode="easy")
    venv = FakeEnv(gym3_env, baseline_vec)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    #sys.exit(0)
    with tf.Session(config=config) as sess:
        #env = wrappers.add_final_wrappers(env)
        venv = wrappers.add_final_wrappers(venv)

        policy = policies.get_policy()

        #sess.run(tf.global_variables_initializer())
        ppo2.learn(
            policy=policy,
            env=venv,
            #env=env,
            save_interval=save_interval,
            nsteps=Config.NUM_STEPS,
            nminibatches=Config.NUM_MINIBATCHES,
            lam=0.95,
            gamma=Config.GAMMA,
            noptepochs=Config.PPO_EPOCHS,
            log_interval=1,
            ent_coef=Config.ENTROPY_COEFF,
            lr=lambda f: f * Config.LEARNING_RATE,
            cliprange=lambda f: f * 0.2,
            total_timesteps=total_timesteps)
Example No. 21
def main():
    from procgen import ProcgenGym3Env

    env = ProcgenGym3Env(num=1, env_name="coinrun", render_mode="rgb_array")
    ia = Interactive(env, info_key="rgb", width=768, height=768)
    ia.run()
Example No. 22
def learn(*, policy, env, eval_env, nsteps, total_timesteps, ent_coef, lr,
			 vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
			log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
			save_interval=0, load_path=None):
	comm = MPI.COMM_WORLD
	rank = comm.Get_rank()
	mpi_size = comm.Get_size()

	#tf.compat.v1.disable_v2_behavior()
	sess = tf.compat.v1.get_default_session()

	if isinstance(lr, float): lr = constfn(lr)
	else: assert callable(lr)
	if isinstance(cliprange, float): cliprange = constfn(cliprange)
	else: assert callable(cliprange)
	total_timesteps = int(total_timesteps)
	
	nenvs = env.num_envs
	ob_space = env.observation_space
	ac_space = env.action_space
	nbatch = nenvs * nsteps
	
	nbatch_train = nbatch // nminibatches
	model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
					nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
					max_grad_norm=max_grad_norm)
	utils.load_all_params(sess)

	runner = Runner(env=env, eval_env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

	epinfobuf10 = deque(maxlen=10)
	epinfobuf100 = deque(maxlen=100)
	eval_epinfobuf100 = deque(maxlen=100)
	tfirststart = time.time()
	active_ep_buf = epinfobuf100
	eval_active_ep_buf = eval_epinfobuf100

	nupdates = total_timesteps//nbatch
	mean_rewards = []
	datapoints = []

	run_t_total = 0
	train_t_total = 0

	can_save = False
	checkpoints = [32, 64]
	saved_key_checkpoints = [False] * len(checkpoints)

	if Config.SYNC_FROM_ROOT and rank != 0:
		can_save = False

	def save_model(base_name=None):
		base_dict = {'datapoints': datapoints}
		utils.save_params_in_scopes(sess, ['model'], Config.get_save_file(base_name=base_name), base_dict)

	# For logging purposes, allow restoring of update
	start_update = 0
	if Config.RESTORE_STEP is not None:
		start_update = Config.RESTORE_STEP // nbatch

	z_iter = 0
	curr_z = np.random.randint(0, high=Config.POLICY_NHEADS)
	tb_writer = TB_Writer(sess)
	import os
	os.environ["WANDB_API_KEY"] = "02e3820b69de1b1fcc645edcfc3dd5c5079839a1"
	os.environ["WANDB_SILENT"] = "true"
	run_id = np.random.randint(100000000)
	os.environ["WANDB_RUN_ID"] = str(run_id)
	group_name = "%s__%s__%f__%f" %(Config.ENVIRONMENT,Config.RUN_ID,Config.REP_LOSS_WEIGHT, Config.TEMP)
	name = "%s__%s__%f__%f__%d" %(Config.ENVIRONMENT,Config.RUN_ID,Config.REP_LOSS_WEIGHT, Config.TEMP, run_id)
	wandb.init(project='ising_generalization' if Config.ENVIRONMENT == 'ising' else 'procgen_generalization' ,
			  entity='ssl_rl', config=Config.args_dict,
			  group=group_name, name=name,
			  mode="disabled" if Config.DISABLE_WANDB else "online")

	api = wandb.Api()
	list_runs = api.runs("ssl_rl/procgen_generalization")
	single_level_runs=[run for run in list_runs if 'ppo_per_level' in run.name]
	non_crashed = [run for run in single_level_runs if run.state in ['running','finished']]
	game_runs = [run for run in non_crashed if Config.ENVIRONMENT in run.name]
	wandb_save_dir = '%s/%s'%(Config.RESTORE_PATH,Config.ENVIRONMENT)
	print('Save dir: %s'%wandb_save_dir)
	if not os.path.isdir(wandb_save_dir):
		import requests
		for run in game_runs:
			level_id = run.name.split('__')[-1]
			run_save_dir = wandb_save_dir + '/' + level_id
			if not os.path.isdir(run_save_dir):
				os.makedirs(run_save_dir)

			def save_wandb_file(name):
				url = "https://api.wandb.ai/files/ssl_rl/procgen_generalization/%s/%s"%(run.id,name)
				r = requests.get(url)
				with open(run_save_dir+'/%s'%name , 'wb') as fh:
					fh.write(r.content)

			save_wandb_file('checkpoint')
			save_wandb_file('ppo-1.data-00000-of-00001')
			save_wandb_file('ppo-1.index')
			save_wandb_file('ppo-1.meta')

			print('Downloaded level id %s to %s (run id: %s)' % (level_id,run_save_dir,run.id) )
			print(os.listdir(run_save_dir))
			# wandb.restore(wandb_save_dir+"/checkpoint",run_path='/'.join(run.path))

	# load in just the graph and model parameters outside for-loop
	from coinrun import policies as policies_ppo
	ppo = policies_ppo.get_policy()
	ppo_graph_1, ppo_graph_2 = tf.Graph(), tf.Graph()

	PSE_policy = Config.PSE_POLICY

	if PSE_policy == 'ppo_2':
		levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
		if Config.ENVIRONMENT == 'bigfish':
			levels = np.setdiff1d(levels,np.array([4]))

		pse_replay = []
		for mdp_id in levels:
			print('Collecting MDP %d'%mdp_id)
			mb_obs_i, mb_actions_i, mb_rewards_i = generate_level_replay(ppo,mdp_id,wandb_save_dir,nbatch_train, nsteps, max_grad_norm, ob_space, ac_space, nsteps_rollout=782)
			pse_replay.append([mb_obs_i, mb_actions_i, mb_rewards_i])

		
	for update in range(start_update+1, nupdates+1):
		assert nbatch % nminibatches == 0
		nbatch_train = nbatch // nminibatches
		tstart = time.time()
		frac = 1.0 - (update - 1.0) / nupdates
		lrnow = lr(frac)
		cliprangenow = cliprange(frac)

		# mpi_print('collecting rollouts...')
		run_tstart = time.time()

		packed = runner.run(update_frac=update/nupdates)
	
		obs, returns, masks, actions, values, neglogpacs, infos, rewards, epinfos, eval_epinfos = packed
		values_i = returns_i = states_nce = anchors_nce = labels_nce = actions_nce = neglogps_nce = rewards_nce = infos_nce = None

		"""
		PSE data re-collection

		1. Make 2 envs for respective policies for 2 random levels
		"""
		
		levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
		if Config.ENVIRONMENT == 'bigfish':
			levels = np.setdiff1d(levels,np.array([4]))
		mdp_1,mdp_2 = np.random.choice(levels,size=2,replace=False)
		# import ipdb;ipdb.set_trace()
		observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
		action_space = DiscreteG(15)

		gym3_env_eval_1 = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_1), paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		venv_eval_1 = FakeEnv(gym3_env_eval_1, observation_space, action_space)
		venv_eval_1 = VecExtractDictObs(venv_eval_1, "rgb")
		venv_eval_1 = VecMonitor(
			venv=venv_eval_1, filename=None, keep_buf=100,
		)
		venv_eval_1 = VecNormalize(venv=venv_eval_1, ob=False)
		venv_eval_1 = wrappers.add_final_wrappers(venv_eval_1)

		gym3_env_eval_2 = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_2), paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		venv_eval_2 = FakeEnv(gym3_env_eval_2, observation_space, action_space)
		venv_eval_2 = VecExtractDictObs(venv_eval_2, "rgb")
		venv_eval_2 = VecMonitor(
			venv=venv_eval_2, filename=None, keep_buf=100,
		)
		venv_eval_2 = VecNormalize(venv=venv_eval_2, ob=False)
		venv_eval_2 = wrappers.add_final_wrappers(venv_eval_2)

		def random_policy(states):
			actions = np.random.randint(0,15,Config.NUM_ENVS)
			return actions

		# print('Loading weights from %s'%(wandb_save_dir+'/%d/ppo-1'%mdp_1))
		# with ppo_graph.as_default():
		#     ppo_model = ppo(sess, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
		#import ipdb;ipdb.set_trace()
		# NOTE: this is recreating a graph within the updates, I'm moving them outside the training loop

		if PSE_policy == 'ppo':
			print('Using pretrained PPO policy')
			model1_path = wandb_save_dir+'/%d/ppo-1'%mdp_1
			model2_path = wandb_save_dir+'/%d/ppo-1'%mdp_2
			graph_one_vars = ppo_graph_1.get_all_collection_keys()

			with tf.compat.v1.Session(graph=ppo_graph_1,config=tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)) as sess_1:
				with tf.compat.v1.variable_scope("model_1"):
					ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
					initialize = tf.compat.v1.global_variables_initializer()
					sess_1.run(initialize)
				model_saver = tf.train.import_meta_graph(model1_path+'.meta')
				model_saver.restore(sess_1, save_path=model1_path)
				mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(ppo_model_1,venv_eval_1,nsteps=32, param_vals='pretrained')

			with tf.compat.v1.Session(graph=ppo_graph_2,config=tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)) as sess_2:
				with tf.compat.v1.variable_scope("model_2"):
					ppo_model_2 = ppo(sess_2, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
					initialize = tf.compat.v1.global_variables_initializer()
					sess_2.run(initialize)
				model_saver = tf.train.import_meta_graph(model2_path+'.meta')
				model_saver.restore(sess_2, save_path=model2_path)

				mb_obs_2, mb_actions_2, mb_rewards_2 = collect_data(ppo_model_2,venv_eval_2,nsteps=32, param_vals='pretrained')
		elif PSE_policy == 'random':
			print('Using random uniform policy')
			mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(random_policy,venv_eval_1,nsteps=32, param_vals='random')
			mb_obs_2, mb_actions_2, mb_rewards_2 = collect_data(random_policy,venv_eval_2,nsteps=32, param_vals='random')
		elif PSE_policy == 'ppo_2':
			mdp_1,mdp_2 = np.random.choice(np.arange(len(pse_replay)),size=2,replace=False)
			mb_obs_1, mb_actions_1, mb_rewards_1 = pse_replay[mdp_1]
			mb_obs_2, mb_actions_2, mb_rewards_2 = pse_replay[mdp_2]
		# reshape our augmented state vectors to match first dim of observation array
		# (mb_size*num_envs, 64*64*RGB)
		# (mb_size*num_envs, num_actions)
		avg_value = np.mean(values)
		epinfobuf10.extend(epinfos)
		epinfobuf100.extend(epinfos)
		eval_epinfobuf100.extend(eval_epinfos)

		run_elapsed = time.time() - run_tstart
		run_t_total += run_elapsed
		# mpi_print('rollouts complete')

		mblossvals = []

		# mpi_print('updating parameters...')
		train_tstart = time.time()

		mean_cust_loss = 0
		inds = np.arange(nbatch)
		inds_pse = np.arange(1024)
		inds_nce = np.arange(nbatch//runner.nce_update_freq)
		for _ in range(noptepochs):
			np.random.shuffle(inds)
			np.random.shuffle(inds_nce)
			for start in range(0, nbatch, nbatch_train):
				sess.run([model.train_model.train_dropout_assign_ops])
				end = start + nbatch_train
				mbinds = inds[start:end]

				
				slices = (arr[mbinds] for arr in (obs, returns, masks, actions, infos, values, neglogpacs, rewards))

				slices_pse_1 = (arr[inds_pse] for arr in (mb_obs_1, mb_actions_1, mb_rewards_1))
				slices_pse_2 = (arr[inds_pse] for arr in (mb_obs_2, mb_actions_2, mb_rewards_2))
				
				mblossvals.append(model.train(lrnow, cliprangenow, *slices, *slices_pse_1, *slices_pse_2, train_target='policy'))

				slices = (arr[mbinds] for arr in (obs, returns, masks, actions, infos, values, neglogpacs, rewards))

			np.random.shuffle(inds_pse)
			slices_pse_1 = (arr[inds_pse] for arr in (mb_obs_1, mb_actions_1, mb_rewards_1))
			slices_pse_2 = (arr[inds_pse] for arr in (mb_obs_2, mb_actions_2, mb_rewards_2))
            
			model.train(lrnow, cliprangenow, *slices, *slices_pse_1, *slices_pse_2, train_target='pse')
		# update the dropout mask
		sess.run([model.train_model.train_dropout_assign_ops])
		sess.run([model.train_model.run_dropout_assign_ops])

		train_elapsed = time.time() - train_tstart
		train_t_total += train_elapsed
		# mpi_print('update complete')

		lossvals = np.mean(mblossvals, axis=0)
		tnow = time.time()
		fps = int(nbatch / (tnow - tstart))

		if update % log_interval == 0 or update == 1:
			step = update*nbatch
			eval_rew_mean = utils.process_ep_buf(eval_active_ep_buf, tb_writer=tb_writer, suffix='_eval', step=step)
			rew_mean_10 = utils.process_ep_buf(active_ep_buf, tb_writer=tb_writer, suffix='', step=step)
			
			ep_len_mean = np.nanmean([epinfo['l'] for epinfo in active_ep_buf])
			
			mpi_print('\n----', update)

			mean_rewards.append(rew_mean_10)
			datapoints.append([step, rew_mean_10])
			tb_writer.log_scalar(ep_len_mean, 'ep_len_mean', step=step)
			tb_writer.log_scalar(fps, 'fps', step=step)
			tb_writer.log_scalar(avg_value, 'avg_value', step=step)
			tb_writer.log_scalar(mean_cust_loss, 'custom_loss', step=step)


			mpi_print('time_elapsed', tnow - tfirststart, run_t_total, train_t_total)
			mpi_print('timesteps', update*nsteps, total_timesteps)

			# eval_rew_mean = episode_rollouts(eval_env,model,step,tb_writer)

			mpi_print('eplenmean', ep_len_mean)
			mpi_print('eprew', rew_mean_10)
			mpi_print('eprew_eval', eval_rew_mean)
			mpi_print('fps', fps)
			mpi_print('total_timesteps', update*nbatch)
			mpi_print([epinfo['r'] for epinfo in epinfobuf10])

			rep_loss = 0
			if len(mblossvals):
				for (lossval, lossname) in zip(lossvals, model.loss_names):
					mpi_print(lossname, lossval)
					tb_writer.log_scalar(lossval, lossname, step=step)
			mpi_print('----\n')

			wandb.log({"%s/eprew"%(Config.ENVIRONMENT):rew_mean_10,
						"%s/eprew_eval"%(Config.ENVIRONMENT):eval_rew_mean,
						"%s/custom_step"%(Config.ENVIRONMENT):step})
		if can_save:
			if save_interval and (update % save_interval == 0):
				save_model()

			for j, checkpoint in enumerate(checkpoints):
				if (not saved_key_checkpoints[j]) and (step >= (checkpoint * 1e6)):
					saved_key_checkpoints[j] = True
					save_model(str(checkpoint) + 'M')

	save_model()

	env.close()
	# import subprocess
	# wandb_files = os.listdir('wandb')
	# file_to_save = ''
	# for fn in wandb_files:
	# 	if str(run_id) in fn:
	# 		file_to_save = fn
	# 		break
	# print(file_to_save)
	# my_env = os.environ.copy()
	# my_env["WANDB_API_KEY"] = "02e3820b69de1b1fcc645edcfc3dd5c5079839a1"
	# subprocess.call(['wandb','sync','wandb/'+ file_to_save],env=my_env)
	return mean_rewards
Example No. 23
def create_env(
    num_envs,
    *,
    env_kind="procgen",
    epsilon_greedy=0.0,
    reward_scale=1.0,
    frame_stack=1,
    use_sticky_actions=0,
    coinrun_old_extra_actions=0,
    **kwargs,
):
    if env_kind == "procgen":
        env_kwargs = {k: v for k, v in kwargs.items() if v is not None}
        env_name = env_kwargs.pop("env_name")

        if env_name == "coinrun_old":
            import coinrun
            from coinrun.config import Config

            Config.initialize_args(use_cmd_line_args=False, **env_kwargs)
            global coinrun_initialized
            if not coinrun_initialized:
                coinrun.init_args_and_threads()
                coinrun_initialized = True
            venv = coinrun.make("standard", num_envs)
            if coinrun_old_extra_actions > 0:
                venv = VecExtraActions(
                    venv, extra_actions=coinrun_old_extra_actions, default_action=0
                )

        else:
            from procgen import ProcgenGym3Env
            import gym3

            env_kwargs = {
                k: v for k, v in env_kwargs.items() if k in PROCGEN_KWARG_KEYS
            }
            env = ProcgenGym3Env(num_envs, env_name=env_name, **env_kwargs)
            env = gym3.ExtractDictObWrapper(env, "rgb")
            venv = gym3.ToBaselinesVecEnv(env)

    elif env_kind == "atari":
        game_version = "v0" if use_sticky_actions == 1 else "v4"

        def make_atari_env(lower_env_id, num_env):
            env_id = ATARI_ENV_DICT[lower_env_id] + f"NoFrameskip-{game_version}"

            def make_atari_env_fn():
                env = make_atari(env_id)
                env = wrap_deepmind(env, frame_stack=False, clip_rewards=False)

                return env

            return SubprocVecEnv([make_atari_env_fn for i in range(num_env)])

        lower_env_id = kwargs["env_id"]

        venv = make_atari_env(lower_env_id, num_envs)

    else:
        raise ValueError(f"Unsupported env_kind: {env_kind}")

    if frame_stack > 1:
        venv = VecFrameStack(venv=venv, nstack=frame_stack)

    if reward_scale != 1:
        venv = VecRewardScale(venv, reward_scale)

    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if epsilon_greedy > 0:
        venv = EpsilonGreedy(venv, epsilon_greedy)

    venv = VecShallowCopy(venv)

    return venv
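A usage sketch for create_env, assuming the Procgen keyword arguments survive the PROCGEN_KWARG_KEYS filter; the resulting wrapper stack behaves like a standard baselines VecEnv:

venv = create_env(8, env_name="coinrun", frame_stack=4, epsilon_greedy=0.05)
obs = venv.reset()      # (8, 64, 64, 12) after stacking four 64x64 RGB frames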