def Train():
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    env = make("platform", num_envs=8)
    env = CourierWrapper(env, True)
    env = MyReward(env)
    # env = VecMonitor(env)

    learning_rate = 3e-4
    clip_range = 0.2
    n_timesteps = int(1e8)
    hyperparams = {
        'nsteps': 256,
        'noptepochs': 4,
        'nminibatches': 8,
        'lr': learning_rate,
        'cliprange': clip_range,
        'vf_coef': 0.5,
        'ent_coef': 0.01
    }
    act = ppo2.learn(network=MyPolicy,
                     env=env,
                     total_timesteps=n_timesteps,
                     **hyperparams,
                     save_interval=100,
                     log_interval=20,
                     # value_network="copy"
                     )
def create_env(env_name, flags):
    if env_name.startswith('Coin'):
        from coinrun import coinrunenv
        from coinrun import setup_utils as coinrun_setup_utils
        coinrun_setup_utils.setup_and_load(
            use_cmd_line_args=False,
            set_statics=flags.set_statics,
            set_dynamics=flags.set_dynamics,
            num_levels=flags.num_levels,
            any_custom_game=flags.any_custom_game,
            use_pytorch=True,
            paint_vel_info=0,
            is_high_res=flags.is_high_res,
            default_zoom=flags.default_zoom,
            float_obs=False)  # torchbeast divides by 255
        return CoinRunOneEnv('platform', 1,
                             default_zoom=flags.default_zoom,
                             float_obs=False)
    else:
        return atari_wrappers.wrap_pytorch(
            atari_wrappers.wrap_deepmind(
                atari_wrappers.make_atari(env_name),
                clip_rewards=False,
                frame_stack=True,
                scale=False))
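A minimal usage sketch for the snippet above: create_env() only reads the flag attributes referenced in its body, so a SimpleNamespace can stand in for the real flags object; the values below are placeholders, not the original configuration.

from types import SimpleNamespace

flags = SimpleNamespace(set_statics=False,
                        set_dynamics=False,
                        num_levels=500,
                        any_custom_game=False,
                        is_high_res=False,
                        default_zoom=5.0)  # placeholder values
env = create_env('CoinRun', flags)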
def test_coinrun():
    setup_utils.setup_and_load(use_cmd_line_args=False)
    env = make('CoinRun-v0', num_envs=16)
    for _ in range(100):
        acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
        _obs, _rews, _dones, _infos = env.step(acts)
    env.close()
def main():
    setup_utils.setup_and_load(paint_vel_info=0, use_cmd_line_args=True)
    print("""Control with arrow keys,
F1, F2 -- switch resolution,
F5, F6, F7, F8 -- zoom,
F9 -- switch reconstruction target picture,
F10 -- switch lasers
""")
    lib.test_main_loop()
def random_agent(num_envs=1, max_steps=100000):
    setup_utils.setup_and_load(use_cmd_line_args=False)
    env = make('standard', num_envs=num_envs)
    for step in range(max_steps):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, rews, _dones, _infos = env.step(acts)
        print("step", step, "rews", rews)
    env.close()
def __init__(self):
    self.AE = AutoEncoder(args, latent_dim=args.latent_dim).double().to(device)
    self.AE.train()
    self.counter = 0
    self.buffer = np.empty(args.buffer_capacity, dtype=transition)
    setup_utils.setup_and_load(use_cmd_line_args=False)
    self.env = make('standard', num_envs=args.num_envs)
    self.optimizer = optim.Adam(self.AE.parameters(), lr=args.lr)
    self.criterion = nn.MSELoss()
    self.step = 0
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load(num_levels=0,
                               starting_level=0,
                               paint_vel_info=1,
                               restore_id='start0numlev250_256mts',
                               train_eval=True,
                               test_eval=False,
                               num_eval=100,
                               high_difficulty=False)
    print("High difficulty: " + str(Config.HIGH_DIFFICULTY))

    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})

    with tf.Session(config=nogpu_config) as sess:
    # with tf.Session(config=frac_gpu_config) as sess:
        enjoy_env_sess(sess)
def create_coinrun_env(num_levels, task_id, random_seed_list):
    # Pick the seed for this task; fall back to a default if the
    # list is missing an entry for task_id.
    try:
        random_seed = random_seed_list[task_id]
    except (IndexError, TypeError):
        random_seed = 123
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=random_seed)
    env = make('standard', num_envs=1)
    return env
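A minimal usage sketch (hypothetical values): build a single high-res CoinRun environment whose level seed is chosen per task id.

env = create_coinrun_env(num_levels=500, task_id=0, random_seed_list=[7, 21, 42])
obs = env.reset()
env.close()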
def testing():
    setup_utils.setup_and_load()
    episodes = 10
    env = Scalarize(make('standard', num_envs=1))
    for i in range(episodes):
        env.reset()
        while True:
            env.render()
            action = np.random.randint(0, env.action_space.n)
            next_state, reward, done, info = env.step(action)
            if done or reward > 0:
                break
def random_agent(num_envs=1, max_steps=100000):
    setup_utils.setup_and_load(use_cmd_line_args=True)
    print(Config.IS_HIGH_RES)
    env = make('standard', num_envs=num_envs)
    env.render()
    viewer = rendering.SimpleImageViewer()
    for step in range(max_steps):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, rews, _dones, _infos = env.step(acts)
        print("step", step, "rews", rews)
        env.render()
    env.close()
def make_coinrun():
    from coinrun import setup_utils, make
    from coinrun_wrapper import CourierWrapper, MyReward
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    env = make("platform", num_envs=256)
    env = CourierWrapper(env, False)
    env = MyReward(env)
    return env
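A minimal rollout sketch for the factory above: make_coinrun() returns a vectorized environment, so actions are sampled per sub-environment and stepped as a batch. This assumes CourierWrapper and MyReward preserve the VecEnv step interface.

import numpy as np

env = make_coinrun()
obs = env.reset()
for _ in range(10):
    acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    obs, rews, dones, infos = env.step(acts)
env.close()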
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()

    DIR_NAME = Config.TEST_LOG_NAME
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.compat.v1.Session(config=config) as sess:
        results = enjoy_env_sess(sess, DIR_NAME)
        print(results)
def main():
    setup_utils.setup_and_load(use_cmd_line_args=False)
    # Make the base environment that we will train the agent on first. This makes
    # one gym environment; after each epoch a different environment would be chosen.
    # Currently we only use one environment because it works better with the DQN algorithm.
    base_env = make('standard', num_envs=1)
    base_env = CoinRunVecEnvWrapper(base_env)
    # base_env = wrappers.add_final_wrappers(base_env)

    # Make the environment that we will attempt to transfer to
    transfer_environment = make('standard', num_envs=1)
    transfer_environment = CoinRunVecEnvWrapper(transfer_environment)

    t = int(5e3)
    with tf.Session():
        model = make_model()
        print("-----\ntraining base model on training environment\n-----")
        base_statistics = run_deepq(model if model else 'cnn',
                                    base_env,
                                    total_timesteps=t,
                                    name="base")
        print('mean reward: ', np.mean(np.array(base_statistics['rewards'])))

        print("-----\ntraining transfer model on test environment\n-----")
        transfer_statistics = run_deepq(model if model else 'cnn',
                                        transfer_environment,
                                        total_timesteps=t,
                                        name="transfer")
        print('mean reward: ', np.mean(np.array(transfer_statistics['rewards'])))

        model = make_model()
        print("-----\ntraining non-transfer model on test environment\n-----")
        transfer_environment_base_model_statistics = run_deepq(
            model if model else 'cnn',
            transfer_environment,
            total_timesteps=t,
            name="non_transfer")
        print('mean reward: ',
              np.mean(np.array(
                  transfer_environment_base_model_statistics['rewards'])))

        plot_stats(base_statistics, transfer_statistics,
                   transfer_environment_base_model_statistics)
def random_agent(num_envs=1, max_steps=100000):
    # Random environment:
    # setup_utils.setup_and_load(use_cmd_line_args=False)
    # Just test in level 1 with the config: --run-id myrun --num-levels 1
    setup_utils.setup_and_load()
    env = make('standard', num_envs=num_envs)
    imgNum = 0
    for step in range(max_steps):
        env.render()
        # acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
        # Action meanings (dx, dy):
        #   0: ( 0,  0)  no move
        #   1: (+1,  0)  right
        #   2: (-1,  0)  left
        #   3: ( 0, +1)  jump
        #   4: (+1, +1)  right-jump
        #   5: (-1, +1)  left-jump
        #   6: ( 0, -1)  down (step down from a crate)
        foo = [1, 3]  # restrict the agent to "right" and "jump"
        acts = np.array([random.choice(foo)])
        print("python input action: ", acts)
        _obs, rews, _dones, _infos = env.step(acts)
        # TODO: return distance (change _obs to distance), then condition on it
        img_input = img.imgbuffer_process(_obs, (256, 256))
        if step % 50 == 0:
            # TODO: make coinrunMOXCS consume gray images
            # plt.imsave('%i.jpg' % imgNum, img_input.mean(axis=2), cmap="gray")
            # plt.imsave('%i.jpg' % imgNum, img_input)
            imgNum = imgNum + 1
            print("imgNum:%i" % imgNum)
        print("step", step, "rews", rews)
    env.close()
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package, to visualize result curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run in parallel on a CPU;
    # the VecEnv class allows parallel rollouts.
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        # policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  device,
                  allow_early_resets,
                  num_frame_stack=None,
                  coin_run_level=0,
                  coin_run_seed=-1,
                  difficulty=False):
    # CoinRun environments need to be treated differently.
    coinrun_envs = {
        'CoinRun': 'standard',
        'CoinRun-Platforms': 'platform',
        'Random-Mazes': 'maze'
    }
    if env_name in coinrun_envs:
        coin_run_args = setup_utils.setup_and_load(use_cmd_line_args=False)
        Coinrun_Config.GAME_TYPE = coinrun_envs[env_name]
        Coinrun_Config.NUM_LEVELS = coin_run_level
        # If SET_SEED = -1, this seed is not used and level seeds are drawn from
        # the range [0, NUM_LEVELS). Use SET_SEED = -1 and NUM_LEVELS = 500 to
        # train with the same levels as in the paper.
        Coinrun_Config.SET_SEED = coin_run_seed
        Coinrun_Config.NUM_ENVS = num_processes
        Coinrun_Config.HIGH_DIFFICULTY = difficulty

        envs = coinrun_utils.make_general_env(num_processes)
        envs.spec = Coinrun_Config.GAME_TYPE
        envs = CoinRunVecPyTorch(envs, device)
        envs = add_final_pytorch_wrappers(envs)
    else:
        envs = [
            make_env(env_name, seed, i, log_dir, allow_early_resets)
            for i in range(num_processes)
        ]
        if len(envs) > 1:
            envs = ShmemVecEnv(envs, context='fork')
        else:
            envs = DummyVecEnv(envs)

        if len(envs.observation_space.shape) == 1:
            if gamma is None:
                envs = VecNormalize(envs, ret=False)
            else:
                envs = VecNormalize(envs, gamma=gamma)

        envs = VecPyTorch(envs, device)

        if num_frame_stack is not None:
            envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
        elif len(envs.observation_space.shape) == 3:
            envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
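A minimal usage sketch (hypothetical values): build 8 parallel CoinRun environments on the available device. log_dir and gamma are only used by the non-CoinRun branch above, so placeholders are fine here.

import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
envs = make_vec_envs('CoinRun', seed=0, num_processes=8, gamma=0.99,
                     log_dir=None, device=device, allow_early_resets=False,
                     coin_run_level=500, coin_run_seed=-1)
obs = envs.reset()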
def main(): """The main function.""" setup_utils.setup_and_load(is_high_res=True) config, unparsed = dqnconfig.get_config() # ---------------------------------------- # Parse configuration # If we have unparsed arguments, print usage and exit if len(unparsed) > 0: print("unparsed for DQN :", unparsed) # input("Press Enter to continue...") if config.mode == "train": train(config) elif config.mode == "test": test(config) else: raise ValueError("Unknown run mode \"{}\"".format(config.mode))
def __init__(self, hparams):
    # Only one environment is currently supported.
    super().__init__(hparams)

    try:
        from coinrun import setup_utils, make
        setup_utils.setup_and_load(use_cmd_line_args=False)
        self._env = make('standard', num_envs=1)
    except ImportError as e:
        print(e)
        print("please check README for CoinRun installation instruction")
        exit()

    self.seed(1234)

    self._observation_space = self._env.observation_space
    self._action_space = self._env.action_space

    self._hparams.num_states = self._observation_space.shape[0]
    self._hparams.num_actions = self._action_space.n
    self._hparams.state_shape = list(self._observation_space.shape)
    self._hparams.action_space_type = self._action_space.__class__.__name__
    self._hparams.pixel_input = True

    if self._hparams.reward_augmentation is not None:
        self._reward_augmentation = get_reward_augmentation(
            self._hparams.reward_augmentation)
def main():
    args = setup_utils.setup_and_load(num_levels=250,
                                      starting_level=0,
                                      paint_vel_info=1,
                                      run_id='start0numlev250_256mts_dann_low',
                                      num_envs=32)
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True  # pylint: disable=E1101
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})

    nenvs = Config.NUM_ENVS
    print("Num envs: " + str(Config.NUM_ENVS))
    total_timesteps = int(256e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=frac_gpu_config):
    # with tf.Session(config=nogpu_config):
        env = wrappers.add_final_wrappers(env)
        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # For the wandb package, to visualize result curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS

    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(120e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))
    print("load path:")
    print("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))

    act = deepq.learn(
        env,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        total_timesteps=0,
        load_path="{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID)
        # load_path="{}/ckpts/{}/model".format(Config.SAVE_PATH, Config.RUN_ID)
    )

    num_episodes = 500
    episode_rew_ls = []
    for i in range(num_episodes):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if Config.RENDER:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        episode_rew_ls.append(episode_rew)
        print("Episode reward", episode_rew)
    print("Avg episode reward", np.mean(episode_rew_ls))
    print("Std episode reward", np.std(episode_rew_ls))
def main():
    # Load the training args of the restore id from the restore file.
    args_dict = utils.load_args()
    test_args = setup_utils.setup_and_load()

    if 'NR' in Config.RESTORE_ID:
        Config.USE_LSTM = 2
    if 'dropout' in Config.RESTORE_ID:
        Config.DROPOUT = 0
        Config.USE_BATCH_NORM = 0

    wandb.init(project="coinrun",
               notes="test",
               tags=["baseline", "test"],
               config=Config.get_args_dict())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    seed = np.random.randint(100000)
    Config.SET_SEED = seed

    overlap = {
        'set_seed': Config.SET_SEED,
        'rep': Config.REP,
        'highd': Config.HIGH_DIFFICULTY,
        'num_levels': Config.NUM_LEVELS,
        'use_lstm': Config.USE_LSTM,
        'dropout': Config.DROPOUT,
        'use_batch_norm': Config.USE_BATCH_NORM
    }

    load_file = Config.get_load_filename(restore_id=Config.RESTORE_ID)
    mpi_print('load file name', load_file)
    mpi_print('seed', seed)
    mpi_print("---------------------------------------")

    for checkpoint in range(1, 33):
        with tf.Session() as sess:
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            enjoy_env_sess(sess, checkpoint, overlap)
from utils import *
from collections import deque
import gym
import cv2
import os
import random

import coinrun.main_utils as utils
from coinrun import setup_utils, policies, wrappers, ppo2
from coinrun.config import Config
# from gym.envs.classic_control import rendering
from image_bco import ImageBCO

utils.setup_mpi_gpus()
setup_utils.setup_and_load()
game = utils.make_general_env(1)
game = wrappers.add_final_wrappers(game)
game.reset()

args.checkpoint = 'coin_ilpo'
args.input_dir = 'final_models/coin'
args.exp_dir = 'results/final_coin_bco'
args.n_actions = 4
args.real_actions = 4
args.policy_lr = .0001
args.batch_size = 100
args.ngf = 15

states = []
next_states = []
FINAL_EPSILON = .2  # final value of epsilon
def main():
    utils.setup_mpi_gpus()
    setup_utils.setup_and_load()
    with tf.Session() as sess:
        enjoy_env_sess(sess)
def enjoy_env_sess():
    # utils.setup_mpi_gpus()
    # setup_utils.setup_and_load({'restore_id': collecting_model})
    directory = './images/'
    directory_saliency = "./images_saliency"

    def create_saliency(model_idx, sess):
        graph = tf.get_default_graph()
        env = utils.make_general_env(1)
        env = wrappers.add_final_wrappers(env)
        agent = create_act_model(sess, env, 1)
        action_selector = tf.placeholder(tf.int32)
        gradient_saliency = saliency.GradientSaliency(
            graph, sess, agent.pd.logits[0][action_selector], agent.X)
        sess.run(tf.global_variables_initializer())
        # setup_utils.restore_file(models[model_idx])
        try:
            loaded_params = utils.load_params_for_scope(sess, 'model')
            if not loaded_params:
                print('NO SAVED PARAMS LOADED')
        except AssertionError as e:
            models[model_idx] = None
            return [None] * 3
        return agent, gradient_saliency, action_selector

    orig_images_low = []
    orig_images_high = []
    filenames = []
    print("Loading files...")
    for idx, filename in enumerate(os.listdir(directory)):
        if len(filename) > 15 or os.path.isdir(os.path.join(directory, filename)):
            continue
        print('.', end='')
        img = imageio.imread(os.path.join(directory, filename))
        img = img.astype(np.float32)
        if filename.startswith('img_') and len(filename) < 15:
            filenames.append(filename)
            list_to_append = orig_images_low
        if filename.startswith('imgL_') and len(filename) < 15:
            list_to_append = orig_images_high
        list_to_append.append(img)

    list_of_images_lists = []  # First one for 0
    list_of_vmax_lists = []
    for idx, model_name in enumerate(models):
        if model_name is None:
            list_of_images_lists.append(None)
            list_of_vmax_lists.append(None)
            continue
        model_images = []
        vmaxs = []
        config.Config = config.ConfigSingle()
        setup_utils.setup_and_load(use_cmd_line_args=False,
                                   restore_id=model_name,
                                   replay=True)
        print("\nComputing saliency for Model {}/{}: {}...".format(
            idx, len(models) - 1, names[model_name]))
        with tf.compat.v1.Session() as sess:
            agent, gradient_saliency, action_selector = create_saliency(idx, sess)
            for img in orig_images_low:
                print('.', end='')
                sys.stdout.flush()
                action, values, state, _ = agent.step(
                    np.expand_dims(img, 0), agent.initial_state, False)
                s_vanilla_mask_3d = gradient_saliency.GetSmoothedMask(
                    img,
                    feed_dict={
                        'model/is_training_:0': False,
                        action_selector: action[0]
                    })
                s_vanilla_mask_grayscale, vmax = saliency.VisualizeImageGrayscale(
                    s_vanilla_mask_3d)
                model_images.append(s_vanilla_mask_grayscale)
                vmaxs.append(vmax)
        list_of_images_lists.append(model_images)
        list_of_vmax_lists.append(vmaxs)

    print("\nMaking pretty images..")
    for idx, filename in enumerate(filenames):
        print('.', end='')
        sys.stdout.flush()
        P.figure(figsize=(COLS * UPSCALE_FACTOR, ROWS * UPSCALE_FACTOR))
        ShowImage(orig_images_high[idx] / 255,
                  title="Original",
                  ax=P.subplot(ROWS, COLS, 1))
        for row in range(ROWS):
            for col in range(COLS):
                model_idx = col + row * COLS
                if models[model_idx] is None:
                    continue
                ShowGrayscaleImage(
                    list_of_images_lists[model_idx][idx],
                    title=names[models[model_idx]] +
                    " Vmax: {:.2E}".format(list_of_vmax_lists[model_idx][idx]),
                    ax=P.subplot(ROWS, COLS, model_idx + 1))
        P.savefig(os.path.join(directory_saliency,
                               filename[:-4] + "_saliency.png"))
        P.close()
    print("\nDone")
import numpy as np
from coinrun import setup_utils, make

config_args = setup_utils.setup_and_load(use_cmd_line_args=False)
env = make('standard', num_envs=4)
for _ in range(1000):
    env.render()
    acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    _obs, _rews, _dones, _infos = env.step(acts)
env.close()
def train(num_episodes=NUM_EPISODES,
          load_filename=None,
          save_filename=None,
          eval_interval=EVAL_INTERVAL,
          replay_capacity=REPLAY_CAPACITY,
          bootstrap_threshold=BOOTSTRAP,
          epsilon=EPSILON,
          eval_epsilon=EVAL_EPSILON,
          gamma=GAMMA,
          batch_size=BATCH_SIZE,
          target_update=TARGET_UPDATE,
          random_seed=RANDOM_SEED,
          num_levels=NUM_LEVELS,
          seed=SEED):
    # Set the random seed
    if random_seed is not None:
        random.seed(random_seed)
        torch.manual_seed(random_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(random_seed)

    # Set up the environment
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=seed)
    env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get the screen size so that we can initialize layers correctly based on
    # the shape returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    print("screen size: ", screen_height, screen_width)

    # Are we resuming from an existing model?
    policy_net = None
    if load_filename is not None and os.path.isfile(
            os.path.join(MODEL_PATH, load_filename)):
        print("Loading model...")
        policy_net = load_model(load_filename)
        policy_net = policy_net.to(DEVICE)
        print("Done loading.")
    else:
        print("Making new model.")
        policy_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)

    # Make a copy of the policy network for evaluation purposes
    eval_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    eval_net.load_state_dict(policy_net.state_dict())
    eval_net.eval()

    # The target network is a snapshot of the policy network that lags behind (for stability)
    target_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Instantiate the optimizer
    optimizer = None
    if len(list(policy_net.parameters())) > 0:
        optimizer = initializeOptimizer(policy_net.parameters())

    # Instantiate the replay memory
    replay_memory = ReplayMemory(replay_capacity)

    steps_done = 0            # How many steps have been run
    best_eval = float('inf')  # The best model evaluation to date

    # Do training until episodes complete
    print("training...")
    i_episode = 0  # The episode number

    # Stop when we reach max episodes
    while i_episode < num_episodes:
        print("episode:", i_episode, "epsilon:", epsilon)
        max_reward = 0     # The best reward we've seen this episode
        done = False       # Has the game ended (timed out or got the coin)?
        episode_steps = 0  # Number of steps performed in this episode

        # Initialize the environment and state
        env.reset()

        # Current screen. There is no last screen because we get velocity on the screen itself.
        state = get_screen(env)

        # Do forever until the loop breaks
        while not done:
            # Select and perform an action
            action, epsilon = select_action(state, policy_net, env.NUM_ACTIONS,
                                            epsilon, steps_done,
                                            bootstrap_threshold)
            steps_done = steps_done + 1
            episode_steps = episode_steps + 1  # for debugging

            if RENDER_SCREEN and not IN_PYNB:
                env.render()

            # Run the action in the environment
            if action is not None:
                _, reward, done, _ = env.step(np.array([action.item()]))

                # Record if this was the best reward we've seen so far
                max_reward = max(reward, max_reward)

                # Turn the reward into a tensor
                reward = torch.tensor([reward], device=DEVICE)

                # Observe the new state
                current_screen = get_screen(env)

                # Did the game end?
                if not done:
                    next_state = current_screen
                else:
                    next_state = None

                # Store the transition in memory
                replay_memory.push(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                # If we are past bootstrapping, perform one step of the optimization
                if steps_done > bootstrap_threshold:
                    optimize_model(
                        policy_net,
                        target_net if target_update > 0 else policy_net,
                        replay_memory, optimizer, batch_size, gamma)
            else:
                # Do nothing if select_action() is not implemented and returns None
                env.step(np.array([0]))

        # If we are done, print some statistics
        if done:
            print("duration:", episode_steps)
            print("max reward:", max_reward)
            status, _ = episode_status(episode_steps, max_reward)
            print("result:", status)
            print("total steps:", steps_done, '\n')

        # Should we update the target network?
        if target_update > 0 and i_episode % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Should we evaluate?
        if steps_done > bootstrap_threshold and i_episode > 0 and i_episode % eval_interval == 0:
            test_average_duration = 0    # Track the average eval duration
            test_average_max_reward = 0  # Track the average max reward

            # Copy all the weights into the evaluation network
            eval_net.load_state_dict(policy_net.state_dict())

            # Evaluate EVAL_COUNT times
            for _ in range(EVAL_COUNT):
                # Call the evaluation function
                test_duration, test_max_reward = evaluate(eval_net, eval_epsilon, env)
                status, score = episode_status(test_duration, test_max_reward)
                # Set test_duration to score to factor in the death penalty
                test_duration = score
                test_average_duration = test_average_duration + test_duration
                test_average_max_reward = test_average_max_reward + test_max_reward
            test_average_duration = test_average_duration / EVAL_COUNT
            test_average_max_reward = test_average_max_reward / EVAL_COUNT
            print("Average duration:", test_average_duration)
            print("Average max reward:", test_average_max_reward)

            # If this is the best evaluation we've seen, save the model
            if test_average_duration < best_eval:
                best_eval = test_average_duration
                if save_filename is not None:
                    save_model(policy_net, save_filename, i_episode)
            print(' ')

        # Only increment the episode number if we are done with bootstrapping
        if steps_done > bootstrap_threshold:
            i_episode = i_episode + 1

    print('Training complete')
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    env.close()
    return policy_net
def evaluate(policy_net, epsilon=EVAL_EPSILON, env=None, test_seed=SEED):
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=NUM_LEVELS,
                               set_seed=test_seed)
    # Make an environment if we don't already have one
    if env is None:
        env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get the screen size so that we can initialize layers correctly based on
    # the shape returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape

    # Get the network ready for evaluation (turns off some things like dropout, if used)
    policy_net.eval()

    # Current screen. There is no last screen.
    state = get_screen(env)

    steps_done = 0  # Number of steps executed
    max_reward = 0  # Max reward seen
    done = False    # Is the game over?

    print("Evaluating...")
    while not done:
        # Select and perform an action
        action, _ = select_action(state, policy_net, env.NUM_ACTIONS, epsilon,
                                  steps_done=0, bootstrap_threshold=0)
        steps_done = steps_done + 1

        if RENDER_SCREEN and not IN_PYNB:
            env.render()

        # Execute the action
        if action is not None:
            _, reward, done, _ = env.step(np.array([action.item()]))

            # Is this the best reward we've seen?
            max_reward = max(reward, max_reward)

            # Observe the new state
            state = get_screen(env)
        else:
            # Do nothing if select_action() is not implemented and returns None
            env.step(np.array([0]))

    print("duration:", steps_done)
    print("max reward:", max_reward)
    status, _ = episode_status(steps_done, max_reward)
    print("result:", status, '\n')
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    return steps_done, max_reward
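A minimal end-to-end sketch tying the two functions above together (the filename is hypothetical): train a policy, then run one evaluation episode with a small exploration rate.

policy_net = train(num_episodes=500, save_filename='coinrun_dqn.pt')
duration, best_reward = evaluate(policy_net, epsilon=0.1)
print("evaluation duration:", duration, "max reward:", best_reward)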
def train(num_episodes=NUM_EPISODES,
          load_filename=None,
          save_filename=None,
          eval_interval=EVAL_INTERVAL,
          replay_capacity=REPLAY_CAPACITY,
          bootstrap_threshold=BOOTSTRAP,
          epsilon=EPSILON,
          eval_epsilon=EVAL_EPSILON,
          gamma=GAMMA,
          batch_size=BATCH_SIZE,
          num_levels=NUM_LEVELS,
          seed=SEED):
    # Set up the environment
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=seed)
    env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get the screen size so that we can initialize layers correctly based on
    # the shape returned from AI gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    print("screen size: ", screen_height, screen_width)

    # Are we resuming from an existing model?
    policy_net = None
    if load_filename is not None and os.path.isfile(load_filename):
        print("Loading model...")
        policy_net = torch.load(load_filename)
        policy_net = policy_net.to(DEVICE)
        print("Done loading.")
    else:
        print("Making new model.")
        policy_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)

    # Make a copy of the policy network for evaluation purposes
    eval_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    eval_net.load_state_dict(policy_net.state_dict())
    eval_net.eval()

    # Instantiate the optimizer
    optimizer = None
    if len(list(policy_net.parameters())) > 0:
        optimizer = initializeOptimizer(policy_net.parameters())

    # Instantiate the replay memory
    replay_memory = ReplayMemory(replay_capacity)

    steps_done = 0              # How many steps have been run
    eval_window = []            # Keep the last 5 evaluation averages
    best_window = float('inf')  # The best average window duration to date

    ### Do training until episodes complete or until ^C is pressed
    try:
        print("training...")
        i_episode = 0  # The episode number

        # Stop when we reach max episodes
        while i_episode < num_episodes:
            print("episode:", i_episode, "epsilon:", epsilon)
            max_reward = 0     # The best reward we've seen this episode
            done = False       # Has the game ended (timed out or got the coin)?
            episode_steps = 0  # Number of steps performed in this episode

            # Initialize the environment and state
            env.reset()

            # Current screen. There is no last screen because we get velocity on the screen itself.
            state = get_screen(env)

            # Do forever until the loop breaks
            while not done:
                # Select and perform an action
                action, epsilon = select_action(state, policy_net,
                                                env.NUM_ACTIONS, epsilon,
                                                steps_done, bootstrap_threshold)
                steps_done = steps_done + 1
                episode_steps = episode_steps + 1  # for debugging

                if RENDER_SCREEN and not IN_PYNB:
                    env.render()

                # Run the action in the environment
                if action is not None:
                    _, reward, done, _ = env.step(np.array([action.item()]))

                    # Record if this was the best reward we've seen so far
                    max_reward = max(reward, max_reward)

                    # Turn the reward into a tensor
                    reward = torch.tensor([reward], device=DEVICE)

                    # Observe the new state
                    current_screen = get_screen(env)

                    # Did the game end?
                    if not done:
                        next_state = current_screen
                    else:
                        next_state = None

                    # Store the transition in memory
                    replay_memory.push(state, action, next_state, reward)

                    # Move to the next state
                    state = next_state

                    # If we are past bootstrapping, perform one step of the optimization
                    if steps_done > bootstrap_threshold:
                        optimize_model(policy_net, replay_memory, optimizer,
                                       batch_size, gamma)
                else:
                    # Do nothing if select_action() is not implemented and returns None
                    env.step(np.array([0]))

            # If we are done, print some statistics
            if done:
                print("duration:", episode_steps)
                print("max reward:", max_reward)
                print("total steps:", steps_done)

            # Should we evaluate?
            if steps_done > bootstrap_threshold and i_episode > 0 and i_episode % eval_interval == 0:
                test_average_duration = 0    # Track the average eval duration
                test_average_max_reward = 0  # Track the average max reward

                # Copy all the weights into the evaluation network
                eval_net.load_state_dict(policy_net.state_dict())

                # Evaluate 10 times
                for _ in range(10):
                    # Call the evaluation function
                    test_duration, test_max_reward = evaluate(eval_net, eval_epsilon, env)
                    test_average_duration = test_average_duration + test_duration
                    test_average_max_reward = test_average_max_reward + test_max_reward
                test_average_duration = test_average_duration / 10
                test_average_max_reward = test_average_max_reward / 10
                print("Average duration:", test_average_duration)
                print("Average max reward:", test_average_max_reward)

                # Append to the evaluation window
                if len(eval_window) < 5:
                    eval_window.append(test_average_duration)
                else:
                    eval_window = eval_window[1:] + [test_average_duration]

                # Compute the window average
                window_average = sum(eval_window) / len(eval_window)
                print("evaluation window:", eval_window,
                      "window average:", window_average)

                # If this is the best window average we've seen, save the model
                if len(eval_window) >= 5 and window_average < best_window:
                    best_window = window_average
                    if save_filename is not None:
                        print("Saving model...")
                        torch.save(policy_net, save_filename)
                        print("Done saving.")

            # Only increment the episode number if we are done with bootstrapping
            if steps_done > bootstrap_threshold:
                i_episode = i_episode + 1
        print('Training complete')
    except KeyboardInterrupt:
        print("Training interrupted")

    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    env.close()
    return policy_net