Code example #1
File: ppo2_pse.py  Project: ahmeda14960/IBAC-SNI
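# Builds a single-level Procgen env for level `mdp_id`, restores the per-level
# pretrained PPO checkpoint from `wandb_save_dir` in its own graph/session, and
# rolls it out for `nsteps_rollout` steps to collect (obs, actions, rewards).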
def generate_level_replay(ppo,mdp_id,wandb_save_dir,nbatch_train, nsteps, max_grad_norm, ob_space, ac_space, nsteps_rollout=782):
	ppo_graph = tf.Graph()
	print('Created graph')
	observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
	action_space = DiscreteG(15)

	gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_id), 
									 paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
	venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
	venv_eval = VecExtractDictObs(venv_eval, "rgb")
	venv_eval = VecMonitor(
		venv=venv_eval, filename=None, keep_buf=100,
	)
	venv_eval = VecNormalize(venv=venv_eval, ob=False)
	venv_eval = wrappers.add_final_wrappers(venv_eval)
	print('Created env')
	graph_one_vars = ppo_graph.get_all_collection_keys()

	model_path = wandb_save_dir+'/%d/ppo-1'%mdp_id

	with tf.compat.v1.Session(graph=ppo_graph,config=tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)) as sess_1:
		with tf.compat.v1.variable_scope("model_%d"%np.random.randint(0,100000,1).item()):
			ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
			initialize = tf.compat.v1.global_variables_initializer()
			sess_1.run(initialize)
			print('Inited session')
		model_saver = tf.train.import_meta_graph(model_path+'.meta')
		model_saver.restore(sess_1, save_path=model_path)
		print('Restored PPO')
		mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(ppo_model_1,venv_eval,nsteps=nsteps_rollout, param_vals='pretrained')
		print('Collected level data')

	venv_eval.close()

	return mb_obs_1, mb_actions_1, mb_rewards_1
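Code example #12 below calls this helper once per level to pre-fill its pse_replay buffer before training (see the PSE_policy == 'ppo_2' branch).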
Code example #2
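# Builds the first-phase training env and, when Config.SECOND_PHASE is set, a
# second adaptation env; both get the usual vec wrappers, and a
# DistributionShiftWrapperVec switches between them every `steps_per_env` steps.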
def make_env(steps_per_env):
	observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
	action_space = DiscreteG(15)
	if Config.FIRST_PHASE == 'exploration':
		# baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE, start_level=Config.START_LEVEL)
	else:
		# baseline_vec_train = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		gym3_env_train = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE, start_level=Config.START_LEVEL)
	if Config.SECOND_PHASE == 'exploration':
		# baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,  paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
		gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT,  paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE, start_level=Config.START_LEVEL)
	elif Config.SECOND_PHASE != "None":
		# baseline_vec_adapt = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE)
		gym3_env_adapt = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=Config.NUM_LEVELS, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.SECOND_PHASE, start_level=Config.START_LEVEL)
	else:
		baseline_vec_adapt = gym3_env_adapt = None
	
	venv_train = FakeEnv(gym3_env_train, observation_space, action_space)
	venv_train = VecExtractDictObs(venv_train, "rgb")
	if Config.SECOND_PHASE != "None":
		venv_adapt = FakeEnv(gym3_env_adapt, observation_space, action_space)   
		venv_adapt = VecExtractDictObs(venv_adapt, "rgb")
	venv_train = VecMonitor(
		venv=venv_train, filename=None, keep_buf=100,
	)
	if Config.SECOND_PHASE != "None":
		venv_adapt = VecMonitor(
			venv=venv_adapt, filename=None, keep_buf=100,
		)

	venv_train = VecNormalize(venv=venv_train, ob=False)
	venv_train = wrappers.add_final_wrappers(venv_train)
	if Config.SECOND_PHASE != "None":
		venv_adapt = VecNormalize(venv=venv_adapt, ob=False)
		venv_adapt = wrappers.add_final_wrappers(venv_adapt)

		venv = wrappers.DistributionShiftWrapperVec(env_list=[venv_train, venv_adapt], steps_per_env=steps_per_env) 
	else:
		venv = venv_train
		venv_adapt = venv_train = None
		venv.current_env_steps_left = steps_per_env

	return venv, venv_train, venv_adapt
Code example #3
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes=" baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run parallel on a cpu
    # VenEnv class allows parallel rollout
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        #policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
Code example #4
def main():
    args = setup_utils.setup_and_load(num_levels=250,
                                      starting_level=0,
                                      paint_vel_info=1,
                                      run_id='start0numlev250_256mts_dann_low',
                                      num_envs=32)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    #config = tf.ConfigProto()
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})
    #config.gpu_options.allow_growth = True # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    print("Num envs: " + str(Config.NUM_ENVS))
    total_timesteps = int(256e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=frac_gpu_config):
        #with tf.Session(config=nogpu_config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
Code example #5
File: nr_train_agent.py  Project: Desein-Yang/GARL
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
Code example #6
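    # Builds an act model on a single env and wraps the selected action's logit
    # with the saliency library's GradientSaliency, so input gradients can be
    # taken w.r.t. the observation placeholder agent.X; on load failure the
    # entry in `models` is cleared and [None]*3 is returned.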
    def create_saliency(model_idx, sess):
        graph = tf.get_default_graph()
        env = utils.make_general_env(1)
        env = wrappers.add_final_wrappers(env)
        agent = create_act_model(sess, env, 1)
        action_selector = tf.placeholder(tf.int32)
        gradient_saliency = saliency.GradientSaliency(graph, sess, agent.pd.logits[0][action_selector], agent.X)
        sess.run(tf.compat.v1.global_variables_initializer())

        # setup_utils.restore_file(models[model_idx])
        try:
            loaded_params = utils.load_params_for_scope(sess, 'model')
            if not loaded_params:
                print('NO SAVED PARAMS LOADED')
        except AssertionError as e:
            models[model_idx] = None
            return [None]*3
        return agent, gradient_saliency, action_selector
Code example #7
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(120e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        
        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                    env=env,
                    save_interval=save_interval,
                    nsteps=Config.NUM_STEPS,
                    nminibatches=Config.NUM_MINIBATCHES,
                    lam=0.95,
                    gamma=Config.GAMMA,
                    noptepochs=Config.PPO_EPOCHS,
                    log_interval=1,
                    ent_coef=Config.ENTROPY_COEFF,
                    lr=lambda f : f * Config.LEARNING_RATE,
                    cliprange=lambda f : f * 0.2,
                    total_timesteps=total_timesteps)
Code example #8
File: test_agent.py  Project: Desein-Yang/GARL
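# Evaluates a restored agent: rolls out until every env has finished rep_count
# episodes, accumulates episode rewards into `scores`, and (in eval mode) logs
# the mean score and the success rate (episodes scoring 10.0) to wandb.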
def test(sess, load_path, env, should_render=False, rep_count=Config.REP):
    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    if should_eval:
        #env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    model = load_model(sess, load_path)

    agent = create_act_model(sess, env, nenvs)

    sess.run(tf.global_variables_initializer())
    loaded_params = utils.load_params_for_scope(sess, 'model')

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    while should_continue():
        action, values, state, _ = agent.step(obs, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        if t_step % 100 == 0:
            mpi_print('t', t_step, values[0], done[0], rew[0], curr_rews[0],
                      np.shape(obs))

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)

            curr_rews[:] = 0

    steps_elapsed = t_step  # vectorized env steps taken during this evaluation
    result = {
        'steps_elapsed': steps_elapsed,
    }

    if should_eval:
        testset_size = rep_count * nenvs
        mean_score = np.sum(scores) / testset_size
        succ_rate = np.sum(scores == 10.0) / testset_size
        max_idx = np.argmax(scores)
        mpi_print('max idx', max_idx)
        mpi_print('steps_elapsed', steps_elapsed)
        if size > 1:
            mean_score = utils.mpi_average([mean_score])
        mpi_print('mpi_mean', mean_score)
        wandb.log({'Test_Rew_mean': mean_score, 'Test_Succ_rate': succ_rate})
        result['scores'] = scores
        result['testset_size'] = testset_size
        result['test_rew_mean'] = mean_score
        result['test_succ_rate'] = succ_rate

    return result
Code example #9
def enjoy_env_sess(sess):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP

    if should_eval:
        env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    agent = create_act_model(sess, env, nenvs)

    sess.run(tf.global_variables_initializer())
    loaded_params = utils.load_params_for_scope(sess, 'model')

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    while should_continue():
        action, values, state, _ = agent.step(obs, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        if t_step % 100 == 0:
            mpi_print('t', t_step, values[0], done[0], rew[0], curr_rews[0],
                      np.shape(obs))

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)

            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)
        mpi_print('scores', scores / rep_count)
        print('mean_score', mean_score)
        mpi_print('max idx', max_idx)

        mpi_mean_score = utils.mpi_average([mean_score])
        mpi_print('mpi_mean', mpi_mean_score)

        result = mean_score

    return result
Code example #10
def enjoy_env_sess(sess, DIR_NAME):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP

    file_name = '%s/%s.txt' % (DIR_NAME, Config.RESTORE_ID)
    f_io = open(file_name, 'a')

    if should_eval:
        if Config.TEST_NUM_EVAL > -1:
            env = utils.make_general_env(Config.TEST_NUM_EVAL)
        else:
            env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    vae = ConvVAE(z_size=Config.VAE_Z_SIZE,
                  batch_size=nenvs,
                  is_training=False,
                  reuse=False,
                  gpu_mode=True,
                  use_coord_conv=True)
    agent = create_act_model(sess, env, nenvs, Config.VAE_Z_SIZE)
    num_actions = env.action_space.n

    init_rand = tf.variables_initializer(
        [v for v in tf.global_variables() if 'randcnn' in v.name])
    sess.run(tf.compat.v1.global_variables_initializer())

    soft_numpy = tf.placeholder(tf.float32, [nenvs, num_actions],
                                name='soft_numpy')
    dist = tfp.distributions.Categorical(probs=soft_numpy)
    sampled_action = dist.sample()

    loaded_params = utils.load_params_for_scope(sess, 'model')
    vae.load_json_full(Config.VAE_PATH)

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    actions = [env.action_space.sample() for _ in range(nenvs)]
    actions = np.array(actions)
    obs, _, _, _ = env.step(actions)

    sess.run(init_rand)
    while should_continue():

        #scipy.misc.imsave('raw_inputs.png', obs[0])
        encoder_in = obs.astype(np.float32) / 255.0
        batch_z = vae.encode(encoder_in)
        #reconstruct = vae.decode(batch_z)
        #scipy.misc.imsave('recon.png', reconstruct[0])

        action, values, state, _ = agent.step(batch_z, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)

            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)

        result = mean_score

        f_io.write("{}\n".format(result))
        f_io.close()

    return result
Code example #11
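# Collects 400 steps of observations, using the restored policy when
# Config.RESTORE_ID is set and uniform random actions otherwise, and saves the
# observation batch as a compressed .npz under ./VAE/records/.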
def main(sess):

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000

    if Config.EXTRACT_SEED != -1:
        seed = Config.EXTRACT_SEED
    if Config.EXTRACT_RANK != -1:
        rank = Config.EXTRACT_RANK

    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    use_policy = (Config.RESTORE_ID != '')

    nenvs = Config.NUM_ENVS
    total_timesteps = int(502e6)
    env = utils.make_general_env(nenvs, seed=rank)

    if use_policy:
        agent = create_act_model(sess, env, nenvs)
        sess.run(tf.compat.v1.global_variables_initializer())
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')

    # make directory
    DIR_NAME = './VAE/records/'
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME, exist_ok=True)
    
    # set file name
    filename = DIR_NAME+"/"+Config.get_save_file()+"_"+str(seed * 100 + rank)+".npz"
    
    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
        obs[:] = env.reset()
        dones = [False for _ in range(nenv)]
        
        # remove noisy inputs
        actions = [env.action_space.sample() for _ in range(nenv)]
        actions = np.array(actions)
        obs[:], rewards, dones, _ = env.step(actions)
        state = agent.initial_state if use_policy else None
        
        mb_obs, mb_rewards, mb_actions, mb_next_obs, mb_dones = [],[],[],[],[]
        # For n in range number of steps
        for _ in range(400):
            # Given observations, get action value and neglopacs
            # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = [env.action_space.sample() for _ in range(nenv)]
            actions = np.array(actions)
            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)
            
            # Take actions in env and look the results
            # Infos contains a ton of useful informations
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        
        #np.savez_compressed(filename, obs=mb_obs, action=mb_actions, next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)
        return filename
Code example #12
File: ppo2_pse.py  Project: ahmeda14960/IBAC-SNI
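# PPO training loop with an auxiliary PSE objective: each update it collects
# extra rollouts from two randomly chosen levels, using pretrained per-level PPO
# policies, a random policy, or the pre-collected pse_replay buffer depending on
# Config.PSE_POLICY, and feeds them to model.train(..., train_target='pse');
# missing per-level checkpoints are first downloaded from the wandb run files.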
def learn(*, policy, env, eval_env, nsteps, total_timesteps, ent_coef, lr,
			 vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
			log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
			save_interval=0, load_path=None):
	comm = MPI.COMM_WORLD
	rank = comm.Get_rank()
	mpi_size = comm.Get_size()

	#tf.compat.v1.disable_v2_behavior()
	sess = tf.compat.v1.get_default_session()

	if isinstance(lr, float): lr = constfn(lr)
	else: assert callable(lr)
	if isinstance(cliprange, float): cliprange = constfn(cliprange)
	else: assert callable(cliprange)
	total_timesteps = int(total_timesteps)
	
	nenvs = env.num_envs
	ob_space = env.observation_space
	ac_space = env.action_space
	nbatch = nenvs * nsteps
	
	nbatch_train = nbatch // nminibatches
	model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
					nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
					max_grad_norm=max_grad_norm)
	utils.load_all_params(sess)

	runner = Runner(env=env, eval_env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

	epinfobuf10 = deque(maxlen=10)
	epinfobuf100 = deque(maxlen=100)
	eval_epinfobuf100 = deque(maxlen=100)
	tfirststart = time.time()
	active_ep_buf = epinfobuf100
	eval_active_ep_buf = eval_epinfobuf100

	nupdates = total_timesteps//nbatch
	mean_rewards = []
	datapoints = []

	run_t_total = 0
	train_t_total = 0

	can_save = False
	checkpoints = [32, 64]
	saved_key_checkpoints = [False] * len(checkpoints)

	if Config.SYNC_FROM_ROOT and rank != 0:
		can_save = False

	def save_model(base_name=None):
		base_dict = {'datapoints': datapoints}
		utils.save_params_in_scopes(sess, ['model'], Config.get_save_file(base_name=base_name), base_dict)

	# For logging purposes, allow restoring of update
	start_update = 0
	if Config.RESTORE_STEP is not None:
		start_update = Config.RESTORE_STEP // nbatch

	z_iter = 0
	curr_z = np.random.randint(0, high=Config.POLICY_NHEADS)
	tb_writer = TB_Writer(sess)
	import os
	os.environ["WANDB_API_KEY"] = "02e3820b69de1b1fcc645edcfc3dd5c5079839a1"
	os.environ["WANDB_SILENT"] = "true"
	run_id = np.random.randint(100000000)
	os.environ["WANDB_RUN_ID"] = str(run_id)
	group_name = "%s__%s__%f__%f" %(Config.ENVIRONMENT,Config.RUN_ID,Config.REP_LOSS_WEIGHT, Config.TEMP)
	name = "%s__%s__%f__%f__%d" %(Config.ENVIRONMENT,Config.RUN_ID,Config.REP_LOSS_WEIGHT, Config.TEMP, run_id)
	wandb.init(project='ising_generalization' if Config.ENVIRONMENT == 'ising' else 'procgen_generalization' ,
			  entity='ssl_rl', config=Config.args_dict,
			  group=group_name, name=name,
			  mode="disabled" if Config.DISABLE_WANDB else "online")

	api = wandb.Api()
	list_runs = api.runs("ssl_rl/procgen_generalization")
	single_level_runs=[run for run in list_runs if 'ppo_per_level' in run.name]
	non_crashed = [run for run in single_level_runs if run.state in ['running','finished']]
	game_runs = [run for run in non_crashed if Config.ENVIRONMENT in run.name]
	wandb_save_dir = '%s/%s'%(Config.RESTORE_PATH,Config.ENVIRONMENT)
	print('Save dir: %s'%wandb_save_dir)
	if not os.path.isdir(wandb_save_dir):
		import requests
		for run in game_runs:
			level_id = run.name.split('__')[-1]
			run_save_dir = wandb_save_dir + '/' + level_id
			if not os.path.isdir(run_save_dir):
				os.makedirs(run_save_dir)

			def save_wandb_file(name):
				url = "https://api.wandb.ai/files/ssl_rl/procgen_generalization/%s/%s"%(run.id,name)
				r = requests.get(url)
				with open(run_save_dir+'/%s'%name , 'wb') as fh:
					fh.write(r.content)

			save_wandb_file('checkpoint')
			save_wandb_file('ppo-1.data-00000-of-00001')
			save_wandb_file('ppo-1.index')
			save_wandb_file('ppo-1.meta')

			print('Downloaded level id %s to %s (run id: %s)' % (level_id,run_save_dir,run.id) )
			print(os.listdir(run_save_dir))
			# wandb.restore(wandb_save_dir+"/checkpoint",run_path='/'.join(run.path))

	# load in just the graph and model parameters outside for-loop
	from coinrun import policies as policies_ppo
	ppo = policies_ppo.get_policy()
	ppo_graph_1, ppo_graph_2 = tf.Graph(), tf.Graph()

	PSE_policy = Config.PSE_POLICY

	if PSE_policy == 'ppo_2':
		levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
		if Config.ENVIRONMENT == 'bigfish':
			levels = np.setdiff1d(levels,np.array([4]))

		pse_replay = []
		for mdp_id in levels:
			print('Collecting MDP %d'%mdp_id)
			mb_obs_i, mb_actions_i, mb_rewards_i = generate_level_replay(ppo,mdp_id,wandb_save_dir,nbatch_train, nsteps, max_grad_norm, ob_space, ac_space, nsteps_rollout=782)
			pse_replay.append([mb_obs_i, mb_actions_i, mb_rewards_i])

		
	for update in range(start_update+1, nupdates+1):
		assert nbatch % nminibatches == 0
		nbatch_train = nbatch // nminibatches
		tstart = time.time()
		frac = 1.0 - (update - 1.0) / nupdates
		lrnow = lr(frac)
		cliprangenow = cliprange(frac)

		# mpi_print('collecting rollouts...')
		run_tstart = time.time()

		packed = runner.run(update_frac=update/nupdates)
	
		obs, returns, masks, actions, values, neglogpacs, infos, rewards, epinfos, eval_epinfos = packed
		values_i = returns_i = states_nce = anchors_nce = labels_nce = actions_nce = neglogps_nce = rewards_nce = infos_nce = None

		"""
		PSE data re-collection

		1. Make 2 envs for respective policies for 2 random levels
		"""
		
		levels = np.unique(os.listdir(wandb_save_dir)).astype(int)
		if Config.ENVIRONMENT == 'bigfish':
			levels = np.setdiff1d(levels,np.array([4]))
		mdp_1,mdp_2 = np.random.choice(levels,size=2,replace=False)
		# import ipdb;ipdb.set_trace()
		observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
		action_space = DiscreteG(15)

		gym3_env_eval_1 = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_1), paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		venv_eval_1 = FakeEnv(gym3_env_eval_1, observation_space, action_space)
		venv_eval_1 = VecExtractDictObs(venv_eval_1, "rgb")
		venv_eval_1 = VecMonitor(
			venv=venv_eval_1, filename=None, keep_buf=100,
		)
		venv_eval_1 = VecNormalize(venv=venv_eval_1, ob=False)
		venv_eval_1 = wrappers.add_final_wrappers(venv_eval_1)

		gym3_env_eval_2 = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=1, start_level=int(mdp_2), paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
		venv_eval_2 = FakeEnv(gym3_env_eval_2, observation_space, action_space)
		venv_eval_2 = VecExtractDictObs(venv_eval_2, "rgb")
		venv_eval_2 = VecMonitor(
			venv=venv_eval_2, filename=None, keep_buf=100,
		)
		venv_eval_2 = VecNormalize(venv=venv_eval_2, ob=False)
		venv_eval_2 = wrappers.add_final_wrappers(venv_eval_2)

		def random_policy(states):
			actions = np.random.randint(0,15,Config.NUM_ENVS)
			return actions

		# print('Loading weights from %s'%(wandb_save_dir+'/%d/ppo-1'%mdp_1))
		# with ppo_graph.as_default():
		#     ppo_model = ppo(sess, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
		#import ipdb;ipdb.set_trace()
		# NOTE: this is recreating a graph within the updates, I'm moving them outside the training loop

		if PSE_policy == 'ppo':
			print('Using pretrained PPO policy')
			model1_path = wandb_save_dir+'/%d/ppo-1'%mdp_1
			model2_path = wandb_save_dir+'/%d/ppo-1'%mdp_2
			graph_one_vars = ppo_graph_1.get_all_collection_keys()

			with tf.compat.v1.Session(graph=ppo_graph_1,config=tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)) as sess_1:
				with tf.compat.v1.variable_scope("model_1"):
					ppo_model_1 = ppo(sess_1, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
					initialize = tf.compat.v1.global_variables_initializer()
					sess_1.run(initialize)
				model_saver = tf.train.import_meta_graph(model1_path+'.meta')
				model_saver.restore(sess_1, save_path=model1_path)
				mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(ppo_model_1,venv_eval_1,nsteps=32, param_vals='pretrained')

			with tf.compat.v1.Session(graph=ppo_graph_2,config=tf.ConfigProto(inter_op_parallelism_threads=1,intra_op_parallelism_threads=1)) as sess_2:
				with tf.compat.v1.variable_scope("model_2"):
					ppo_model_2 = ppo(sess_2, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm, override_agent='ppo')
					initialize = tf.compat.v1.global_variables_initializer()
					sess_2.run(initialize)
				model_saver = tf.train.import_meta_graph(model2_path+'.meta')
				model_saver.restore(sess_2, save_path=model2_path)

				mb_obs_2, mb_actions_2, mb_rewards_2 = collect_data(ppo_model_2,venv_eval_2,nsteps=32, param_vals='pretrained')
		elif PSE_policy == 'random':
			print('Using random uniform policy')
			mb_obs_1, mb_actions_1, mb_rewards_1 = collect_data(random_policy,venv_eval_1,nsteps=32, param_vals='random')
			mb_obs_2, mb_actions_2, mb_rewards_2 = collect_data(random_policy,venv_eval_2,nsteps=32, param_vals='random')
		elif PSE_policy == 'ppo_2':
			mdp_1,mdp_2 = np.random.choice(np.arange(len(pse_replay)),size=2,replace=False)
			mb_obs_1, mb_actions_1, mb_rewards_1 = pse_replay[mdp_1]
			mb_obs_2, mb_actions_2, mb_rewards_2 = pse_replay[mdp_2]
		# reshape our augmented state vectors to match first dim of observation array
		# (mb_size*num_envs, 64*64*RGB)
		# (mb_size*num_envs, num_actions)
		avg_value = np.mean(values)
		epinfobuf10.extend(epinfos)
		epinfobuf100.extend(epinfos)
		eval_epinfobuf100.extend(eval_epinfos)

		run_elapsed = time.time() - run_tstart
		run_t_total += run_elapsed
		# mpi_print('rollouts complete')

		mblossvals = []

		# mpi_print('updating parameters...')
		train_tstart = time.time()

		mean_cust_loss = 0
		inds = np.arange(nbatch)
		inds_pse = np.arange(1024)
		inds_nce = np.arange(nbatch//runner.nce_update_freq)
		for _ in range(noptepochs):
			np.random.shuffle(inds)
			np.random.shuffle(inds_nce)
			for start in range(0, nbatch, nbatch_train):
				sess.run([model.train_model.train_dropout_assign_ops])
				end = start + nbatch_train
				mbinds = inds[start:end]

				
				slices = (arr[mbinds] for arr in (obs, returns, masks, actions, infos, values, neglogpacs, rewards))

				slices_pse_1 = (arr[inds_pse] for arr in (mb_obs_1, mb_actions_1, mb_rewards_1))
				slices_pse_2 = (arr[inds_pse] for arr in (mb_obs_2, mb_actions_2, mb_rewards_2))
				
				mblossvals.append(model.train(lrnow, cliprangenow, *slices, *slices_pse_1, *slices_pse_2, train_target='policy'))

				slices = (arr[mbinds] for arr in (obs, returns, masks, actions, infos, values, neglogpacs, rewards))

			np.random.shuffle(inds_pse)
			slices_pse_1 = (arr[inds_pse] for arr in (mb_obs_1, mb_actions_1, mb_rewards_1))
			slices_pse_2 = (arr[inds_pse] for arr in (mb_obs_2, mb_actions_2, mb_rewards_2))
            
			model.train(lrnow, cliprangenow, *slices, *slices_pse_1, *slices_pse_2, train_target='pse')
		# update the dropout mask
		sess.run([model.train_model.train_dropout_assign_ops])
		sess.run([model.train_model.run_dropout_assign_ops])

		train_elapsed = time.time() - train_tstart
		train_t_total += train_elapsed
		# mpi_print('update complete')

		lossvals = np.mean(mblossvals, axis=0)
		tnow = time.time()
		fps = int(nbatch / (tnow - tstart))

		if update % log_interval == 0 or update == 1:
			step = update*nbatch
			eval_rew_mean = utils.process_ep_buf(eval_active_ep_buf, tb_writer=tb_writer, suffix='_eval', step=step)
			rew_mean_10 = utils.process_ep_buf(active_ep_buf, tb_writer=tb_writer, suffix='', step=step)
			
			ep_len_mean = np.nanmean([epinfo['l'] for epinfo in active_ep_buf])
			
			mpi_print('\n----', update)

			mean_rewards.append(rew_mean_10)
			datapoints.append([step, rew_mean_10])
			tb_writer.log_scalar(ep_len_mean, 'ep_len_mean', step=step)
			tb_writer.log_scalar(fps, 'fps', step=step)
			tb_writer.log_scalar(avg_value, 'avg_value', step=step)
			tb_writer.log_scalar(mean_cust_loss, 'custom_loss', step=step)


			mpi_print('time_elapsed', tnow - tfirststart, run_t_total, train_t_total)
			mpi_print('timesteps', update*nsteps, total_timesteps)

			# eval_rew_mean = episode_rollouts(eval_env,model,step,tb_writer)

			mpi_print('eplenmean', ep_len_mean)
			mpi_print('eprew', rew_mean_10)
			mpi_print('eprew_eval', eval_rew_mean)
			mpi_print('fps', fps)
			mpi_print('total_timesteps', update*nbatch)
			mpi_print([epinfo['r'] for epinfo in epinfobuf10])

			rep_loss = 0
			if len(mblossvals):
				for (lossval, lossname) in zip(lossvals, model.loss_names):
					mpi_print(lossname, lossval)
					tb_writer.log_scalar(lossval, lossname, step=step)
			mpi_print('----\n')

			wandb.log({"%s/eprew"%(Config.ENVIRONMENT):rew_mean_10,
						"%s/eprew_eval"%(Config.ENVIRONMENT):eval_rew_mean,
						"%s/custom_step"%(Config.ENVIRONMENT):step})
		if can_save:
			if save_interval and (update % save_interval == 0):
				save_model()

			for j, checkpoint in enumerate(checkpoints):
				if (not saved_key_checkpoints[j]) and (step >= (checkpoint * 1e6)):
					saved_key_checkpoints[j] = True
					save_model(str(checkpoint) + 'M')

	save_model()

	env.close()
	# import subprocess
	# wandb_files = os.listdir('wandb')
	# file_to_save = ''
	# for fn in wandb_files:
	# 	if str(run_id) in fn:
	# 		file_to_save = fn
	# 		break
	# print(file_to_save)
	# my_env = os.environ.copy()
	# my_env["WANDB_API_KEY"] = "02e3820b69de1b1fcc645edcfc3dd5c5079839a1"
	# subprocess.call(['wandb','sync','wandb/'+ file_to_save],env=my_env)
	return mean_rewards
Code example #13
def main():
    print('Parsing args')
    args = setup_utils.setup_and_load()
    print('Setting up MPI')
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    print('Setting config')
    # coinrun version, allows you to specify how many GPUs you want this run to use
    #utils.setup_mpi_gpus()

    # baselines version, just sets the number of GPUs to the -n flag 
    #setup_mpi_gpus()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(Config.NUM_GPUS)
    
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101
    
    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(25e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(8e6)
    elif Config.VERY_SHORT_TRAINING:
        total_timesteps = int(500e3)
    elif Config.VERY_VERY_SHORT_TRAINING:
        total_timesteps = int(50e3)
    save_interval = args.save_interval

    #env = utils.make_general_env(nenvs, seed=rank)
    #print (env)

    mpi_print(Config.ENVIRONMENT)
    venv, venv_train, venv_adapt = make_env(total_timesteps//2) #switch "easy" -> "exploration" halfway
    # import ipdb;ipdb.set_trace()
    observation_space = Dict(rgb=Box(shape=(64,64,3),low=0,high=255))
    action_space = DiscreteG(15)
    
    # baseline_vec_eval = ProcgenEnv(num_envs=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=0, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)
    gym3_env_eval = ProcgenGym3Env(num=Config.NUM_ENVS, env_name=Config.ENVIRONMENT, num_levels=0, paint_vel_info=Config.PAINT_VEL_INFO, distribution_mode=Config.FIRST_PHASE)

    venv_eval = FakeEnv(gym3_env_eval, observation_space, action_space)
    venv_eval = VecExtractDictObs(venv_eval, "rgb")
    venv_eval = VecMonitor(
        venv=venv_eval, filename=None, keep_buf=100,
    )
    venv_eval = VecNormalize(venv=venv_eval, ob=False)
    venv_eval = wrappers.add_final_wrappers(venv_eval)

    
    with tf.compat.v1.Session(config=config) as sess:
        
        if Config.AGENT == 'ppo':
            from coinrun import ppo2 as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_rnd':
            from coinrun import ppo2_rnd as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_diayn':
            from coinrun import ppo2_diayn as agent
            from coinrun import policies
        elif Config.AGENT == 'ppg':
            from coinrun import ppo2_ppg as agent
            from coinrun import policies
        elif Config.AGENT == 'ppg_ssl':
            from coinrun import ppo2_ppg_ssl as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_goal':
            from coinrun import ppo2_goal as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_curl':
            from coinrun import ppo2_curl as agent
            from coinrun import policies
        elif Config.AGENT == 'ppo_goal_bogdan' or Config.AGENT == 'ppo_ctrl':
            from coinrun import ppo2_goal_bogdan as agent
            from coinrun import policies_bogdan as policies
        elif Config.AGENT == 'ppg_cluster':
            from coinrun import ppo2_ppg_sinkhorn as agent
            from coinrun import policies_ppg_sinkhorn as policies
        elif Config.AGENT == 'ppo_bisimulation':
            from coinrun import ppo2_bisimulation as agent
            from coinrun import policies_bisimulation as policies
        elif Config.AGENT == 'ppo_pse':
            from coinrun import ppo2_pse as agent
            from coinrun import policies_pse as policies
        policy = policies.get_policy()

        final_eprew_eval = agent.learn(policy=policy,
                    env=venv,
                    eval_env=venv_eval,
                    save_interval=save_interval,
                    nsteps=Config.NUM_STEPS,
                    nminibatches=Config.NUM_MINIBATCHES,
                    lam=0.95,
                    gamma=Config.GAMMA,
                    noptepochs=Config.PPO_EPOCHS,
                    log_interval=1, #10,
                    ent_coef=Config.ENTROPY_COEFF,
                    lr=lambda f : f * Config.LEARNING_RATE,
                    lr_ctrl=lambda f : f * Config.LEARNING_RATE_CTRL,
                    lr_myow=lambda f : f * Config.LEARNING_RATE_MYOW,
                    cliprange=lambda f : f * 0.2,
                    total_timesteps=total_timesteps)

        return final_eprew_eval
Code example #14
def make_env():
    env = utils.make_general_env(nenvs, seed=rank)
    env = wrappers.add_final_wrappers(env)
    return env
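A minimal usage sketch for this helper (not from the original sources): it assumes the coinrun imports from the examples above, that nenvs and rank are defined in the surrounding scope, and that the wrapped env exposes the VecEnv reset/step API used throughout these snippets.

import numpy as np

env = make_env()
obs = env.reset()
for _ in range(10):
    # one random action per parallel env, as in the rollout loops above
    actions = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    obs, rews, dones, infos = env.step(actions)
env.close()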
Code example #15
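# Restores the run identified by Config.RESTORE_ID at a given checkpoint,
# evaluates it for rep_count (50) episodes per env across 20 envs, and logs the
# mean reward and success rate (score == 10.0) to wandb, MPI-averaging the
# statistics when running on more than one rank.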
def enjoy_env_sess(sess, checkpoint, overlap):
    #base_name = str(8*checkpoint)  + 'M'
    #load_file = setup_utils.restore_file(Config.RESTORE_ID,base_name=base_name)
    should_eval = True
    mpi_print('test levels seed', Config.SET_SEED)
    mpi_print('test levels ', Config.NUM_LEVELS)
    rep_count = 50

    env = utils.make_general_env(20)
    env = wrappers.add_final_wrappers(env)
    nenvs = env.num_envs

    sess.run(tf.global_variables_initializer())
    args_now = Config.get_args_dict()
    #args_run = utils.load_args()
    agent = create_act_model(sess, env, nenvs)

    # load name is specified by Config.RESTORE_ID and returns True/False
    if checkpoint == 0:
        mean_score = 0.0
        succ_rate = 0.0
        steps_elapsed = 0
        wandb.log({
            'Rew_mean': mean_score,
            'Succ_rate': succ_rate,
            'Step_elapsed': steps_elapsed
        })
        return mean_score, succ_rate
    elif checkpoint != 32:
        base_name = str(8 * checkpoint) + 'M'
    else:
        base_name = None

    sess.run(tf.global_variables_initializer())
    # env init here
    load_file = setup_utils.restore_file(Config.RESTORE_ID,
                                         overlap_config=overlap,
                                         base_name=base_name)

    is_loaded = utils.load_params_for_scope(sess, 'model')
    if not is_loaded:
        mpi_print('NO SAVED PARAMS LOADED')
        return 0.0, 0.0

    obs = env.reset()
    t_step = 0

    scores = np.zeros((nenvs, rep_count))
    eplens = np.zeros((nenvs, rep_count))
    #scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)

    # curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    def rollout(obs, state, done):
        """rollout for rep * nenv times and return scores"""
        t = 0
        count = 0
        rews = np.zeros((nenvs, rep_count))
        while should_continue():
            action, values, state, _ = agent.step(obs, state, done)
            obs, rew, done, info = env.step(action)
            rews[:, count] += rew
            t += 1

            for i, d in enumerate(done):
                if d:
                    eplens[i][count] = t
                    if score_counts[i] < rep_count:
                        score_counts[i] += 1
                        count = score_counts[i] - 1
                        # aux score
                        if 'episode' in info[i]:
                            scores[i][count] = info[i].get('episode')['r']

        return scores, rews, eplens

    if is_loaded:
        mpi_print(load_file)
        scores, rews, eplens = rollout(obs, state, done)

    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    if size == 1:
        if rank == 0:
            testset_size = rep_count * nenvs
            utils.save_pickle(scores, Config.LOGDIR + 'scores')
            mean_score = np.sum(scores) / testset_size
            succ_rate = np.sum(scores == 10.0) / testset_size
            mpi_print('cpus ', size)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means unbounded set so the set size is rep_counts * nenvs
            # each one has a new seed(maybe counted)
            # mpi_print('score detail',scores.flatten())
            mpi_print('succ_rate', succ_rate)
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            mpi_print('mean score', mean_score)
            wandb.log({
                'Rew_mean': mean_score,
                'Succ_rate': succ_rate,
                'Step_elapsed': steps_elapsed
            })
            #mpi_print('mean score of each env',[np.mean(s) for s in scores])
    else:
        testset_size = rep_count * nenvs
        succ = np.sum(scores == 10.0) / testset_size
        succ_rate = utils.mpi_average([succ])
        mean_score_tmp = np.sum(scores) / testset_size
        mean_score = utils.mpi_average([mean_score_tmp])
        if rank == 0:
            mpi_print('testset size', rep_count * nenvs * size)
            mpi_print('load file name', load_file)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means unbounded set so the set size is rep_counts * nenvs
            # each one has a new seed(maybe counted)
            # mpi_print('score detail',scores.flatten())
            mpi_print('succ_rate', succ_rate)
            mpi_print('mean score', mean_score)
            wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate})

    return mean_score, succ_rate
Code example #16
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    # coinrun version, allows you to specify how many GPUs you want this run to use
    #utils.setup_mpi_gpus()

    # baselines version, just sets the number of GPUs to the -n flag
    #setup_mpi_gpus()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(Config.NUM_GPUS)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        #total_timesteps = int(120e6)
        total_timesteps = int(25e6)
    elif Config.VERY_SHORT_TRAINING:
        total_timesteps = int(5e6)
    save_interval = args.save_interval

    #env = utils.make_general_env(nenvs, seed=rank)
    #print (env)

    print(Config.ENVIRONMENT)

    baseline_vec = ProcgenEnv(num_envs=nenvs,
                              env_name=Config.ENVIRONMENT,
                              num_levels=Config.NUM_LEVELS,
                              paint_vel_info=Config.PAINT_VEL_INFO,
                              distribution_mode="easy")
    gym3_env = ProcgenGym3Env(num=nenvs,
                              env_name=Config.ENVIRONMENT,
                              num_levels=Config.NUM_LEVELS,
                              paint_vel_info=Config.PAINT_VEL_INFO,
                              distribution_mode="easy")
    venv = FakeEnv(gym3_env, baseline_vec)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)

    #sys.exit(0)
    with tf.Session(config=config) as sess:
        #env = wrappers.add_final_wrappers(env)
        venv = wrappers.add_final_wrappers(venv)

        policy = policies.get_policy()

        #sess.run(tf.global_variables_initializer())
        ppo2.learn(
            policy=policy,
            env=venv,
            #env=env,
            save_interval=save_interval,
            nsteps=Config.NUM_STEPS,
            nminibatches=Config.NUM_MINIBATCHES,
            lam=0.95,
            gamma=Config.GAMMA,
            noptepochs=Config.PPO_EPOCHS,
            log_interval=1,
            ent_coef=Config.ENTROPY_COEFF,
            lr=lambda f: f * Config.LEARNING_RATE,
            cliprange=lambda f: f * 0.2,
            total_timesteps=total_timesteps)
Code example #17
File: image_policy_bco.py  Project: zhenchangXia/ILPO
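# Module-level setup for an ImageBCO baseline (behavioral cloning from
# observations) on CoinRun; hyperparameters are assigned directly onto the
# parsed args object.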
from collections import deque
import gym
import cv2
import os
import coinrun.main_utils as utils
from coinrun import setup_utils, policies, wrappers, ppo2
from coinrun.config import Config
#from gym.envs.classic_control import rendering
import random
from image_bco import ImageBCO

utils.setup_mpi_gpus()
args = setup_utils.setup_and_load()
game = utils.make_general_env(1)
game = wrappers.add_final_wrappers(game)
game.reset()

args.checkpoint = 'coin_ilpo'
args.input_dir = 'final_models/coin'
args.exp_dir = 'results/final_coin_bco'
args.n_actions = 4
args.real_actions = 4
args.policy_lr = .0001
args.batch_size = 100
args.ngf = 15
states = []
next_states = []
FINAL_EPSILON = .2 # final value of epsilon
INITIAL_EPSILON = .2 # starting value of epsilon
EXPLORE = 1000
Code example #18
def enjoy_env_sess(sess, DIR_NAME):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP
    mpi_print = utils.mpi_print

    file_name = '%s/%s.txt' % (DIR_NAME, Config.RESTORE_ID)
    f_io = open(file_name, 'a')

    if should_eval:
        if Config.TEST_NUM_EVAL > -1:
            env = utils.make_general_env(Config.TEST_NUM_EVAL)
        else:
            env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    agent = create_act_model(sess, env, nenvs)
    num_actions = env.action_space.n

    init_rand = tf.variables_initializer(
        [v for v in tf.global_variables() if 'randcnn' in v.name])
    sess.run(tf.compat.v1.global_variables_initializer())

    soft_numpy = tf.placeholder(tf.float32, [nenvs, num_actions],
                                name='soft_numpy')
    dist = tfp.distributions.Categorical(probs=soft_numpy)
    sampled_action = dist.sample()

    loaded_params = utils.load_params_for_scope(sess, 'model')

    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    sess.run(init_rand)
    while should_continue():
        if Config.USE_LSTM == 8425 or Config.USE_LSTM == 1081:
            q_actions, values, state, _ = agent.step(obs, state, done)
            # e-greedy
            greedy_flag = np.random.rand(q_actions.shape[0])
            greedy_flag = greedy_flag < 0.1
            greedy_flag = greedy_flag.astype(int)
            random_actions = np.random.randint(0,
                                               num_actions,
                                               size=q_actions.shape[0])
            action = random_actions * greedy_flag + (1 -
                                                     greedy_flag) * q_actions
        else:
            total_soft = agent.get_softmax(obs, state, done)
            action = sess.run([sampled_action], {soft_numpy: total_soft})
            action = action[0]
            #action, values, state, _ = agent.step(obs, state, done)

        obs, rew, done, info = env.step(action)
        #scipy.misc.imsave('raw_inputs.png', obs[0])
        #print(dd)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)

            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)

        result = mean_score

        f_io.write("{}\n".format(result))
        f_io.close()

    return result