Example #1
def Train():
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    # env=make("platform",num_envs=8)
    env = make("platform", num_envs=8)
    env = CourierWrapper(env, True)
    env = MyReward(env)
    # env = VecMonitor(env)
    learning_rate = 3e-4
    clip_range = 0.2
    n_timesteps = int(1e8)
    hyperparams = {
        'nsteps': 256,
        'noptepochs': 4,
        'nminibatches': 8,
        'lr': learning_rate,
        'cliprange': clip_range,
        'vf_coef': 0.5,
        'ent_coef': 0.01
    }

    act = ppo2.learn(
        network=MyPolicy,
        env=env,
        total_timesteps=n_timesteps,
        **hyperparams,
        save_interval=100,
        log_interval=20,

        # value_network="copy"
    )
Example #2
def create_env(env_name, flags):
    if env_name.startswith('Coin'):
        from coinrun import coinrunenv
        from coinrun import setup_utils as coinrun_setup_utils
        coinrun_setup_utils.setup_and_load(
            use_cmd_line_args=False,
            set_statics=flags.set_statics,
            set_dynamics=flags.set_dynamics,
            num_levels=flags.num_levels,
            any_custom_game=flags.any_custom_game,
            use_pytorch=True,
            paint_vel_info=0,
            is_high_res=flags.is_high_res,
            default_zoom=flags.default_zoom,
            float_obs=False)  # torchbeast divides by 255
        return CoinRunOneEnv('platform',
                             1,
                             default_zoom=flags.default_zoom,
                             float_obs=False)
    else:
        return atari_wrappers.wrap_pytorch(
            atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                         clip_rewards=False,
                                         frame_stack=True,
                                         scale=False))
Example #3
def test_coinrun():
    setup_utils.setup_and_load(use_cmd_line_args=False)
    env = make('CoinRun-v0', num_envs=16)
    for _ in range(100):
        acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
        _obs, _rews, _dones, _infos = env.step(acts)
    env.close()
Example #4
def main():
    setup_utils.setup_and_load(paint_vel_info=0, use_cmd_line_args=True)
    print("""Control with arrow keys,
F1, F2 -- switch resolution,
F5, F6, F7, F8 -- zoom,
F9  -- switch reconstruction target picture,
F10 -- switch lasers
    """)
    lib.test_main_loop()
Example #5
def random_agent(num_envs=1, max_steps=100000):
    setup_utils.setup_and_load(use_cmd_line_args=False)
    env = make('standard', num_envs=num_envs)
    for step in range(max_steps):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, rews, _dones, _infos = env.step(acts)
        print("step", step, "rews", rews)
    env.close()
Example #6
def __init__(self):
    self.AE = AutoEncoder(args,
                          latent_dim=args.latent_dim).double().to(device)
    self.AE.train()
    self.counter = 0
    self.buffer = np.empty(args.buffer_capacity, dtype=transition)
    setup_utils.setup_and_load(use_cmd_line_args=False)
    self.env = make('standard', num_envs=args.num_envs)
    self.optimizer = optim.Adam(self.AE.parameters(), lr=args.lr)
    self.criterion = nn.MSELoss()
    self.step = 0
Example #7
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load(num_levels=0, starting_level=0, paint_vel_info=1,
                               restore_id='start0numlev250_256mts', train_eval=True,
                               test_eval=False, num_eval=100, high_difficulty=False)
    print("High difficulty: " + str(Config.HIGH_DIFFICULTY))
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})
    with tf.Session(config=nogpu_config) as sess:
    #with tf.Session(config=frac_gpu_config) as sess:
        enjoy_env_sess(sess)
Example #8
def create_coinrun_env(num_levels, task_id, random_seed_list):
    # setup_utils.setup_and_load(use_cmd_line_args=False, is_high_res=True, num_levels=num_levels, set_seed=seed)
    try:
        random_seed = random_seed_list[task_id]
    except Exception:  # fall back to a fixed seed if task_id cannot be looked up
        random_seed = 123
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=random_seed)
    env = make('standard', num_envs=1)
    return env
Example #9
def testing():
    setup_utils.setup_and_load()
    episodes = 10
    env = Scalarize(make('standard', num_envs=1))
    for i in range(episodes):
        env.reset()
        while True:
            env.render()
            action = np.random.randint(0, env.action_space.n)
            next_state, reward, done, info = env.step(action)
            if done or reward > 0:
                break
Example #10
def random_agent(num_envs=1, max_steps=100000):
    setup_utils.setup_and_load(use_cmd_line_args=True)
    print(Config.IS_HIGH_RES)
    env = make('standard', num_envs=num_envs)
    env.render()
    viewer = rendering.SimpleImageViewer()
    for step in range(max_steps):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, rews, _dones, _infos = env.step(acts)
        print("step", step, "rews", rews)
        env.render()
    env.close()
Example #11
def make_coinrun():
    from coinrun import setup_utils, make
    from coinrun_wrapper import CourierWrapper, MyReward
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    # env=make("platform",num_envs=8)
    env = make("platform", num_envs=256)
    env = CourierWrapper(env, False)
    env = MyReward(env)
    return env
Example #12
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()
    DIR_NAME = Config.TEST_LOG_NAME

    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.compat.v1.Session(config=config) as sess:
        results = enjoy_env_sess(sess, DIR_NAME)
        print(results)
Example #13
def main():

    setup_utils.setup_and_load(use_cmd_line_args=False)

    # Make the base environment that we will train the agent on first; this creates 1 gym environment.
    # A different environment could be chosen after each epoch, but currently we only use 1 environment
    # because it works better with the DQN algorithm.
    base_env = make('standard', num_envs=1)
    base_env = CoinRunVecEnvWrapper(base_env)

    #base_env = wrappers.add_final_wrappers(base_env)
    # Make the environment that we will attempt to transfer to
    transfer_enviroment = make('standard', num_envs=1)
    transfer_enviroment = CoinRunVecEnvWrapper(transfer_enviroment)

    t = int(5e3)
    with tf.Session():
        model = make_model()

        print("-----\ntraining base model on training enviroment\n-----")
        base_statistics = run_deepq(model if model else 'cnn',
                                    base_env,
                                    total_timesteps=t,
                                    name="base")

        print('mean reward: ', np.mean(np.array(base_statistics['rewards'])))

        print("-----\ntraining transfer model on test enviroment\n-----")
        transfer_statistics = run_deepq(model if model else 'cnn',
                                        transfer_enviroment,
                                        total_timesteps=t,
                                        name="transfer")
        print('mean reward: ',
              np.mean(np.array(transfer_statistics['rewards'])))

        model = make_model()
        print("-----\ntraining non-transfer model on test enviroment\n-----")
        transfer_enviroment_base_model_statistics = run_deepq(
            model if model else 'cnn',
            transfer_enviroment,
            total_timesteps=t,
            name="transfer")
        print(
            'mean reward: ',
            np.mean(
                np.array(
                    transfer_enviroment_base_model_statistics['rewards'])))
        plot_stats(base_statistics, transfer_statistics,
                   transfer_enviroment_base_model_statistics)
Example #14
def random_agent(num_envs=1, max_steps=100000):
    # random environment
    # setup_utils.setup_and_load(use_cmd_line_args=False)
    # just test on level 1 with config: --run-id myrun --num-levels 1
    setup_utils.setup_and_load()
    env = make('standard', num_envs=num_envs)
    imgNum = 0
    for step in range(max_steps):
        env.render()
        #acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])

        foo = [1, 3]
        acts = np.array([random.choice(foo)])
        # Action indices (see the pair mapping below; a named-index sketch follows this example):
        # 0: no move
        # 1: right
        # 2: left
        # 3: jump
        # 4: right + jump
        # 5: left + jump
        # 6: down

        # 0, 0,
        # +1, 0, // right
        # -1, 0, // left
        # 0, +1, // jump
        # +1, +1, // right - jump
        # -1, +1, // left - jump
        # 0, -1, // down(step down from a crate)

        print("python input action: ", acts)
        print("\n env.step(acts): \n")
        _obs, rews, _dones, _infos = env.step(acts)
        # TODO: return distance (change _obs to distance), then condition on it

        img_input = img.imgbuffer_process(_obs, (256, 256))

        if step % 50 == 0:
            # convert to grayscale
            # TODO: make coinrunMOXCS consume the gray image
            #plt.imsave('%i.jpg' % (imgNum), img_input.mean(axis=2), cmap = "gray")
            # plt.imsave('%i.jpg' % (imgNum), img_input)
            #plt.imshow(img_input.mean(axis=2), cmap="gray")
            imgNum = imgNum + 1
            print("imgNum:%i" % (imgNum))

        print("step", step, "rews", rews)
    env.close()
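
The hard-coded `foo = [1, 3]` above picks actions by raw index. The names below are inferred from the action-pair comment in the example and are only an illustrative convention, not an official CoinRun API; a small helper that samples from a restricted subset might look like this:

import numpy as np

# Indices inferred from the pair mapping above (assumption, for readability only):
NOOP, RIGHT, LEFT, JUMP, RIGHT_JUMP, LEFT_JUMP, DOWN = range(7)

def sample_restricted_actions(allowed=(RIGHT, JUMP), num_envs=1):
    # One action per parallel environment, drawn only from the allowed subset.
    return np.array([np.random.choice(allowed) for _ in range(num_envs)])

# acts = sample_restricted_actions()  # equivalent to the foo = [1, 3] choice above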
Example #15
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes=" baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many environments run in parallel on one CPU;
    # the VecEnv class allows parallel rollouts (a short batched-step sketch follows this example)
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        #policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
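
As the nenvs/VecEnv comment in this example notes, a vectorized environment steps all of its copies in a single call and returns batched arrays. A minimal sketch of that batched interface, reusing the `setup_and_load`/`make` pattern from the other examples (the printed shapes are an assumption about the default observation size):

import numpy as np
from coinrun import setup_utils, make

setup_utils.setup_and_load(use_cmd_line_args=False)
venv = make('standard', num_envs=4)  # 4 environments rolled out in parallel
acts = np.array([venv.action_space.sample() for _ in range(venv.num_envs)])
obs, rews, dones, infos = venv.step(acts)  # leading dimension of each array == num_envs
print(obs.shape, rews.shape)               # e.g. (4, 64, 64, 3) and (4,)
venv.close()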
Example #16
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  device,
                  allow_early_resets,
                  num_frame_stack=None,
                  coin_run_level=0,
                  coin_run_seed=-1,
                  difficulty=False):
    # coinrun environments need to be treated differently.
    coinrun_envs = {
        'CoinRun': 'standard',
        'CoinRun-Platforms': 'platform',
        'Random-Mazes': 'maze'
    }
    if env_name in coinrun_envs:
        coin_run_args = setup_utils.setup_and_load(use_cmd_line_args=False)
        Coinrun_Config.GAME_TYPE = coinrun_envs[env_name]
        Coinrun_Config.NUM_LEVELS = coin_run_level
        Coinrun_Config.SET_SEED = coin_run_seed
        # If SET_SEED = -1, this seed is not used and level seeds will be drawn from the
        # range [0, NUM_LEVELS). Use SET_SEED = -1 and NUM_LEVELS = 500 to train with the
        # same levels as in the paper (a minimal sketch follows this example).
        Coinrun_Config.NUM_ENVS = num_processes
        Coinrun_Config.HIGH_DIFFICULTY = difficulty
        envs = coinrun_utils.make_general_env(num_processes)
        envs.spec = Coinrun_Config.GAME_TYPE
        envs = CoinRunVecPyTorch(envs, device)
        envs = add_final_pytorch_wrappers(envs)

    else:
        envs = [
            make_env(env_name, seed, i, log_dir, allow_early_resets)
            for i in range(num_processes)
        ]

        if len(envs) > 1:
            envs = ShmemVecEnv(envs, context='fork')
        else:
            envs = DummyVecEnv(envs)

        if len(envs.observation_space.shape) == 1:
            if gamma is None:
                envs = VecNormalize(envs, ret=False)
            else:
                envs = VecNormalize(envs, gamma=gamma)

        envs = VecPyTorch(envs, device)

        if num_frame_stack is not None:
            envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
        elif len(envs.observation_space.shape) == 3:
            envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
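
The comment about SET_SEED and NUM_LEVELS above describes how to reproduce the fixed level set from the CoinRun paper. A minimal sketch of that configuration, using the same `setup_and_load` keyword arguments that appear in the other examples (treat the exact values as an assumption, not a verified reproduction recipe):

from coinrun import setup_utils, make

# With set_seed=-1 the seed is ignored and level seeds are drawn from [0, num_levels),
# so num_levels=500 trains on the paper's fixed set of 500 levels.
setup_utils.setup_and_load(use_cmd_line_args=False,
                           num_levels=500,
                           set_seed=-1)
env = make('standard', num_envs=32)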
Example #17
def main():
    """The main function."""

    setup_utils.setup_and_load(is_high_res=True)

    config, unparsed = dqnconfig.get_config()
    # ----------------------------------------
    # Parse configuration
    # If we have unparsed arguments, print them as a warning
    if len(unparsed) > 0:
        print("unparsed for DQN :", unparsed)
        # input("Press Enter to continue...")

    if config.mode == "train":
        train(config)
    elif config.mode == "test":
        test(config)
    else:
        raise ValueError("Unknown run mode \"{}\"".format(config.mode))
Example #18
  def __init__(self, hparams):
    # only support 1 environment currently
    super().__init__(hparams)
    try:
      from coinrun import setup_utils, make
      setup_utils.setup_and_load(use_cmd_line_args=False)

      self._env = make('standard', num_envs=1)
    except ImportError as e:
      print(e)
      print("please check README for CoinRun installation instruction")
      exit()
    self.seed(1234)
    self._observation_space = self._env.observation_space
    self._action_space = self._env.action_space
    self._hparams.num_states = self._observation_space.shape[0]
    self._hparams.num_actions = self._action_space.n
    self._hparams.state_shape = list(self._observation_space.shape)
    self._hparams.action_space_type = self._action_space.__class__.__name__
    self._hparams.pixel_input = True
    if self._hparams.reward_augmentation is not None:
      self._reward_augmentation = get_reward_augmentation(
          self._hparams.reward_augmentation)
Example #19
def main():
    args = setup_utils.setup_and_load(num_levels=250,
                                      starting_level=0,
                                      paint_vel_info=1,
                                      run_id='start0numlev250_256mts_dann_low',
                                      num_envs=32)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    #config = tf.ConfigProto()
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})
    #config.gpu_options.allow_growth = True # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    print("Num envs: " + str(Config.NUM_ENVS))
    total_timesteps = int(256e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=frac_gpu_config):
        #with tf.Session(config=nogpu_config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
Example #20
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For wandb package to visualize results curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
Example #21
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(120e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        
        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
Example #22
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))
    print("load path:")
    print("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))
    act = deepq.learn(
        env,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        total_timesteps=0,
        load_path="{}/saved_models/{}.pkl".format(Config.SAVE_PATH,
                                                  Config.RUN_ID)
        # load_path="{}/ckpts/{}/model".format(Config.SAVE_PATH, Config.RUN_ID)
    )

    num_episodes = 500
    # while True:
    episode_rew_ls = []
    for i in range(num_episodes):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if Config.RENDER:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        episode_rew_ls.append(episode_rew)
        print("Episode reward", episode_rew)
    print("Avg episode reward", np.mean(episode_rew_ls))
    print("Var episode reward", np.std(episode_rew_ls))
Example #23
def main():
    # load from restore file
    args_dict = utils.load_args()
    # train args of restore id
    test_args = setup_utils.setup_and_load()
    if 'NR' in Config.RESTORE_ID:
        Config.USE_LSTM = 2
    if 'dropout' in Config.RESTORE_ID:
        Config.DROPOUT = 0
        Config.USE_BATCH_NORM = 0

    wandb.init(project="coinrun",
               notes="test",
               tags=["baseline", "test"],
               config=Config.get_args_dict())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    seed = np.random.randint(100000)
    Config.SET_SEED = seed

    overlap = {
        'set_seed': Config.SET_SEED,
        'rep': Config.REP,
        'highd': Config.HIGH_DIFFICULTY,
        'num_levels': Config.NUM_LEVELS,
        'use_lstm': Config.USE_LSTM,
        'dropout': Config.DROPOUT,
        'use_batch_norm': Config.USE_BATCH_NORM
    }

    load_file = Config.get_load_filename(restore_id=Config.RESTORE_ID)
    mpi_print('load file name', load_file)
    mpi_print('seed', seed)
    mpi_print("---------------------------------------")
    for checkpoint in range(1, 33):
        with tf.Session() as sess:
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            enjoy_env_sess(sess, checkpoint, overlap)
Example #24
from utils import *
from collections import deque
import gym
import cv2
import os
import coinrun.main_utils as utils
from coinrun import setup_utils, policies, wrappers, ppo2
from coinrun.config import Config
#from gym.envs.classic_control import rendering
import random
from image_bco import ImageBCO

utils.setup_mpi_gpus()
setup_utils.setup_and_load()
game = utils.make_general_env(1)
game = wrappers.add_final_wrappers(game)
game.reset()

args.checkpoint = 'coin_ilpo'
args.input_dir = 'final_models/coin'
args.exp_dir = 'results/final_coin_bco'
args.n_actions = 4
args.real_actions = 4
args.policy_lr = .0001
args.batch_size = 100
args.ngf = 15
states = []
next_states = []
FINAL_EPSILON = .2 # final value of epsilon
Example #25
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()
    with tf.Session() as sess:
        enjoy_env_sess(sess)
Example #26
def enjoy_env_sess():
    # utils.setup_mpi_gpus()
    # setup_utils.setup_and_load({'restore_id': collecting_model})

    directory = './images/'
    directory_saliency = "./images_saliency"

    def create_saliency(model_idx, sess):
        graph = tf.get_default_graph()
        env = utils.make_general_env(1)
        env = wrappers.add_final_wrappers(env)
        agent = create_act_model(sess, env, 1)
        action_selector = tf.placeholder(tf.int32)
        gradient_saliency = saliency.GradientSaliency(
            graph, sess, agent.pd.logits[0][action_selector], agent.X)
        sess.run(tf.global_variables_initializer())

        # setup_utils.restore_file(models[model_idx])
        try:
            loaded_params = utils.load_params_for_scope(sess, 'model')
            if not loaded_params:
                print('NO SAVED PARAMS LOADED')
        except AssertionError as e:
            models[model_idx] = None
            return [None] * 3
        return agent, gradient_saliency, action_selector

    orig_images_low = []
    orig_images_high = []
    filenames = []

    print("Loading files...")
    for idx, filename in enumerate(os.listdir(directory)):
        if len(filename) > 15 or os.path.isdir(
                os.path.join(directory, filename)):
            continue
        print('.', end='')
        img = imageio.imread(os.path.join(directory, filename))
        img = img.astype(np.float32)
        if filename.startswith('img_') and len(filename) < 15:
            filenames.append(filename)
            list_to_append = orig_images_low
        if filename.startswith('imgL_') and len(filename) < 15:
            list_to_append = orig_images_high
        list_to_append.append(img)

    list_of_images_lists = []  # First one for 0
    list_of_vmax_lists = []

    for idx, model_name in enumerate(models):
        if model_name is None:
            list_of_images_lists.append(None)
            list_of_vmax_lists.append(None)
            continue

        model_images = []
        vmaxs = []
        config.Config = config.ConfigSingle()
        setup_utils.setup_and_load(use_cmd_line_args=False,
                                   restore_id=model_name,
                                   replay=True)
        print("\nComputing saliency for Model {}\{}: {}...".format(
            idx,
            len(models) - 1, names[model_name]))

        with tf.compat.v1.Session() as sess:
            agent, gradient_saliency, action_selector = create_saliency(
                idx, sess)
            for img in orig_images_low:
                print('.', end='')
                sys.stdout.flush()
                action, values, state, _ = agent.step(np.expand_dims(img, 0),
                                                      agent.initial_state,
                                                      False)
                s_vanilla_mask_3d = gradient_saliency.GetSmoothedMask(
                    img,
                    feed_dict={
                        'model/is_training_:0': False,
                        action_selector: action[0]
                    })
                s_vanilla_mask_grayscale, vmax = saliency.VisualizeImageGrayscale(
                    s_vanilla_mask_3d)
                model_images.append(s_vanilla_mask_grayscale)
                vmaxs.append(vmax)

            list_of_images_lists.append(model_images)
            list_of_vmax_lists.append(vmaxs)

    print("\nMaking pretty images..")
    for idx, filename in enumerate(filenames):
        print('.', end='')
        sys.stdout.flush()
        P.figure(figsize=(COLS * UPSCALE_FACTOR, ROWS * UPSCALE_FACTOR))
        ShowImage(orig_images_high[idx] / 255,
                  title="Original",
                  ax=P.subplot(ROWS, COLS, 1))
        for row in range(ROWS):
            for col in range(COLS):
                model_idx = col + row * COLS
                if models[model_idx] is None:
                    continue
                ShowGrayscaleImage(list_of_images_lists[model_idx][idx],
                                   title=names[models[model_idx]] +
                                   "     Vmax: {:.2E}".format(
                                       list_of_vmax_lists[model_idx][idx]),
                                   ax=P.subplot(ROWS, COLS, model_idx + 1))
        P.savefig(
            os.path.join(directory_saliency, filename[:-4] + "_saliency.png"))
        P.close()
    print("\nDone")
Example #27
import numpy as np
from coinrun import setup_utils, make

config_args = setup_utils.setup_and_load(use_cmd_line_args=False)
env = make('standard', num_envs=4)
for _ in range(1000):
    env.render()
    acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    _obs, _rews, _dones, _infos = env.step(acts)
env.close()
Example #28
def train(num_episodes=NUM_EPISODES,
          load_filename=None,
          save_filename=None,
          eval_interval=EVAL_INTERVAL,
          replay_capacity=REPLAY_CAPACITY,
          bootstrap_threshold=BOOTSTRAP,
          epsilon=EPSILON,
          eval_epsilon=EVAL_EPSILON,
          gamma=GAMMA,
          batch_size=BATCH_SIZE,
          target_update=TARGET_UPDATE,
          random_seed=RANDOM_SEED,
          num_levels=NUM_LEVELS,
          seed=SEED):
    # Set the random seed
    if random_seed is not None:
        random.seed(random_seed)
        torch.manual_seed(random_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(random_seed)
    # Set up the environment
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=seed)
    env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get screen size so that we can initialize layers correctly based on shape returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    print("screen size: ", screen_height, screen_width)

    # Are we resuming from an existing model?
    policy_net = None
    if load_filename is not None and os.path.isfile(
            os.path.join(MODEL_PATH, load_filename)):
        print("Loading model...")
        policy_net = load_model(load_filename)
        policy_net = policy_net.to(DEVICE)
        print("Done loading.")
    else:
        print("Making new model.")
        policy_net = DQN(screen_height, screen_width,
                         env.NUM_ACTIONS).to(DEVICE)
    # Make a copy of the policy network for evaluation purposes
    eval_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    eval_net.load_state_dict(policy_net.state_dict())
    eval_net.eval()
    # Target network is a snapshot of the policy network that lags behind (for stability)
    target_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Instantiate the optimizer
    optimizer = None
    if len(list(policy_net.parameters())) > 0:
        optimizer = initializeOptimizer(policy_net.parameters())

    # Instantiate the replay memory
    replay_memory = ReplayMemory(replay_capacity)

    steps_done = 0  # How many steps have been run
    best_eval = float('inf')  # The best model evaluation to date

    # Do training until episodes complete
    print("training...")
    i_episode = 0  # The episode number

    # Stop when we reach max episodes
    while i_episode < num_episodes:
        print("episode:", i_episode, "epsilon:", epsilon)
        max_reward = 0  # The best reward we've seen this episode
        done = False  # Has the game ended (timed out or got the coin)
        episode_steps = 0  # Number of steps performed in this episode
        # Initialize the environment and state
        env.reset()

        # Current screen. There is no last screen because we get velocity on the screen itself.
        state = get_screen(env)

        # Do forever until the loop breaks
        while not done:
            # Select and perform an action
            action, epsilon = select_action(state, policy_net, env.NUM_ACTIONS,
                                            epsilon, steps_done,
                                            bootstrap_threshold)
            steps_done = steps_done + 1
            episode_steps = episode_steps + 1

            # for debugging
            if RENDER_SCREEN and not IN_PYNB:
                env.render()

            # Run the action in the environment
            if action is not None:
                _, reward, done, _ = env.step(np.array([action.item()]))

                # Record if this was the best reward we've seen so far
                max_reward = max(reward, max_reward)

                # Turn the reward into a tensor
                reward = torch.tensor([reward], device=DEVICE)

                # Observe new state
                current_screen = get_screen(env)

                # Did the game end?
                if not done:
                    next_state = current_screen
                else:
                    next_state = None

                # Store the transition in memory
                replay_memory.push(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                # If we are past bootstrapping we should perform one step of the optimization
                if steps_done > bootstrap_threshold:
                    optimize_model(
                        policy_net,
                        target_net if target_update > 0 else policy_net,
                        replay_memory, optimizer, batch_size, gamma)
            else:
                # Do nothing if select_action() is not implemented and returning None
                env.step(np.array([0]))

            # If we are done, print some statistics
            if done:
                print("duration:", episode_steps)
                print("max reward:", max_reward)
                status, _ = episode_status(episode_steps, max_reward)
                print("result:", status)
                print("total steps:", steps_done, '\n')

            # Should we update the target network?
            if target_update > 0 and i_episode % target_update == 0:
                target_net.load_state_dict(policy_net.state_dict())

        # Should we evaluate?
        if steps_done > bootstrap_threshold and i_episode > 0 and i_episode % eval_interval == 0:
            test_average_duration = 0  # Track the average eval duration
            test_average_max_reward = 0  # Track the average max reward
            # copy all the weights into the evaluation network
            eval_net.load_state_dict(policy_net.state_dict())
            # Evaluate EVAL_COUNT times
            for _ in range(EVAL_COUNT):
                # Call the evaluation function
                test_duration, test_max_reward = evaluate(
                    eval_net, eval_epsilon, env)
                status, score = episode_status(test_duration, test_max_reward)
                test_duration = score  # Set test_duration to score to factor in death-penalty
                test_average_duration = test_average_duration + test_duration
                test_average_max_reward = test_average_max_reward + test_max_reward
            test_average_duration = test_average_duration / EVAL_COUNT
            test_average_max_reward = test_average_max_reward / EVAL_COUNT
            print("Average duration:", test_average_duration)
            print("Average max reward:", test_average_max_reward)
            # If this is the best window average we've seen, save the model
            if test_average_duration < best_eval:
                best_eval = test_average_duration
                if save_filename is not None:
                    save_model(policy_net, save_filename, i_episode)
            print(' ')
        # Only increment episode number if we are done with bootstrapping
        if steps_done > bootstrap_threshold:
            i_episode = i_episode + 1
    print('Training complete')
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    env.close()
    return policy_net
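
The training and evaluation loops above rely on a `select_action` helper that is not shown. Below is a minimal epsilon-greedy sketch compatible with the call sites (it returns a 1x1 action tensor plus an updated epsilon); the decay rate, the 0.05 floor, and the assumption that `state` is a torch tensor on the right device are illustrative choices, not taken from the original:

import random
import torch

def select_action(state, policy_net, num_actions, epsilon,
                  steps_done, bootstrap_threshold, epsilon_decay=0.999):
    # Act uniformly at random while bootstrapping the replay buffer or when exploring.
    if steps_done < bootstrap_threshold or random.random() < epsilon:
        action = torch.tensor([[random.randrange(num_actions)]], device=state.device)
    else:
        with torch.no_grad():
            # Greedy action: highest predicted Q-value for the current screen.
            action = policy_net(state).max(1)[1].view(1, 1)
    # Decay epsilon only once learning has started.
    if steps_done >= bootstrap_threshold:
        epsilon = max(0.05, epsilon * epsilon_decay)
    return action, epsilon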
Example #29
def evaluate(policy_net, epsilon=EVAL_EPSILON, env=None, test_seed=SEED):
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=NUM_LEVELS,
                               set_seed=test_seed)

    # Make an environment if we don't already have one
    if env is None:
        env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape

    # Get the network ready for evaluation (turns off some things like dropout if used)
    policy_net.eval()

    # Current screen. There is no last screen
    state = get_screen(env)

    steps_done = 0  # Number of steps executed
    max_reward = 0  # Max reward seen
    done = False  # Is the game over?

    print("Evaluating...")
    while not done:
        # Select and perform an action
        action, _ = select_action(state,
                                  policy_net,
                                  env.NUM_ACTIONS,
                                  epsilon,
                                  steps_done=0,
                                  bootstrap_threshold=0)
        steps_done = steps_done + 1

        if RENDER_SCREEN and not IN_PYNB:
            env.render()

        # Execute the action
        if action is not None:
            _, reward, done, _ = env.step(np.array([action.item()]))

            # Is this the best reward we've seen?
            max_reward = max(reward, max_reward)

            # Observe new state
            state = get_screen(env)
        else:
            # Do nothing if select_action() is not implemented and returning None
            env.step(np.array([0]))

    print("duration:", steps_done)
    print("max reward:", max_reward)
    status, _ = episode_status(steps_done, max_reward)
    print("result:", status, '\n')
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    return steps_done, max_reward
Example #30
def train(num_episodes=NUM_EPISODES,
          load_filename=None,
          save_filename=None,
          eval_interval=EVAL_INTERVAL,
          replay_capacity=REPLAY_CAPACITY,
          bootstrap_threshold=BOOTSTRAP,
          epsilon=EPSILON,
          eval_epsilon=EVAL_EPSILON,
          gamma=GAMMA,
          batch_size=BATCH_SIZE,
          num_levels=NUM_LEVELS,
          seed=SEED):
    # Set up the environment
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=seed)
    env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get screen size so that we can initialize layers correctly based on shape returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    print("screen size: ", screen_height, screen_width)

    # Are we resuming from an existing model?
    policy_net = None
    if load_filename is not None and os.path.isfile(load_filename):
        print("Loading model...")
        policy_net = torch.load(load_filename)
        policy_net = policy_net.to(DEVICE)
        print("Done loading.")
    else:
        print("Making new model.")
        policy_net = DQN(screen_height, screen_width,
                         env.NUM_ACTIONS).to(DEVICE)
    # Make a copy of the policy network for evaluation purposes
    eval_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    eval_net.load_state_dict(policy_net.state_dict())
    eval_net.eval()

    # Instantiate the optimizer
    optimizer = None
    if len(list(policy_net.parameters())) > 0:
        optimizer = initializeOptimizer(policy_net.parameters())

    # Instantiate the replay memory
    replay_memory = ReplayMemory(replay_capacity)

    steps_done = 0  # How many steps have been run
    eval_window = []  # Keep the last 5 episode durations
    best_window = float('inf')  # The best average window duration to date

    ### Do training until episodes complete or until ^C is pressed
    try:
        print("training...")
        i_episode = 0  # The episode number

        # Stop when we reach max episodes
        while i_episode < num_episodes:
            print("episode:", i_episode, "epsilon:", epsilon)
            max_reward = 0  # The best reward we've seen this episode
            done = False  # Has the game ended (timed out or got the coin)
            episode_steps = 0  # Number of steps performed in this episode
            # Initialize the environment and state
            env.reset()

            # Current screen. There is no last screen because we get velocity on the screen itself.
            state = get_screen(env)

            # Do forever until the loop breaks
            while not done:
                # Select and perform an action
                action, epsilon = select_action(state, policy_net,
                                                env.NUM_ACTIONS, epsilon,
                                                steps_done,
                                                bootstrap_threshold)
                steps_done = steps_done + 1
                episode_steps = episode_steps + 1

                # for debugging
                if RENDER_SCREEN and not IN_PYNB:
                    env.render()

                # Run the action in the environment
                if action is not None:
                    _, reward, done, _ = env.step(np.array([action.item()]))

                    # Record if this was the best reward we've seen so far
                    max_reward = max(reward, max_reward)

                    # Turn the reward into a tensor
                    reward = torch.tensor([reward], device=DEVICE)

                    # Observe new state
                    current_screen = get_screen(env)

                    # Did the game end?
                    if not done:
                        next_state = current_screen
                    else:
                        next_state = None

                    # Store the transition in memory
                    replay_memory.push(state, action, next_state, reward)

                    # Move to the next state
                    state = next_state

                    # If we are past bootstrapping we should perform one step of the optimization
                    if steps_done > bootstrap_threshold:
                        optimize_model(policy_net, replay_memory, optimizer,
                                       batch_size, gamma)
                else:
                    # Do nothing if select_action() is not implemented and returning None
                    env.step(np.array([0]))

                # If we are done, print some statistics
                if done:
                    print("duration:", episode_steps)
                    print("max reward:", max_reward)
                    print("total steps:", steps_done)

            # Should we evaluate?
            if steps_done > bootstrap_threshold and i_episode > 0 and i_episode % eval_interval == 0:
                test_average_duration = 0  # Track the average eval duration
                test_average_max_reward = 0  # Track the average max reward
                # copy all the weights into the evaluation network
                eval_net.load_state_dict(policy_net.state_dict())
                # Evaluate 10 times
                for _ in range(10):
                    # Call the evaluation function
                    test_duration, test_max_reward = evaluate(
                        eval_net, eval_epsilon, env)
                    test_average_duration = test_average_duration + test_duration
                    test_average_max_reward = test_average_max_reward + test_max_reward
                test_average_duration = test_average_duration / 10
                test_average_max_reward = test_average_max_reward / 10
                print("Average duration:", test_average_duration)
                print("Average max reward:", test_average_max_reward)
                # Append to the evaluation window
                if len(eval_window) < 5:
                    eval_window.append(test_average_duration)
                else:
                    eval_window = eval_window[1:] + [test_average_duration]
                # Compute window average
                window_average = sum(eval_window) / len(eval_window)
                print("evaluation window:", eval_window, "window average:",
                      window_average)
                # If this is the best window average we've seen, save the model
                if len(eval_window) >= 5 and window_average < best_window:
                    best_window = window_average
                    if save_filename is not None:
                        print("Saving model...")
                        torch.save(policy_net, save_filename)
                        print("Done saving.")
            # Only increment episode number if we are done with bootstrapping
            if steps_done > bootstrap_threshold:
                i_episode = i_episode + 1
        print('Training complete')
    except KeyboardInterrupt:
        print("Training interrupted")
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    env.close()
    return policy_net