def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, device,
                  allow_early_resets, num_frame_stack=None, coin_run_level=0,
                  coin_run_seed=-1, difficulty=False):
    # CoinRun environments need to be treated differently.
    coinrun_envs = {
        'CoinRun': 'standard',
        'CoinRun-Platforms': 'platform',
        'Random-Mazes': 'maze'
    }

    if env_name in coinrun_envs:
        coin_run_args = setup_utils.setup_and_load(use_cmd_line_args=False)
        Coinrun_Config.GAME_TYPE = coinrun_envs[env_name]
        Coinrun_Config.NUM_LEVELS = coin_run_level
        # If SET_SEED = -1, this seed is not used and level seeds are drawn from the
        # range [0, NUM_LEVELS). Use SET_SEED = -1 and NUM_LEVELS = 500 to train on
        # the same levels as in the paper.
        Coinrun_Config.SET_SEED = coin_run_seed
        Coinrun_Config.NUM_ENVS = num_processes
        Coinrun_Config.HIGH_DIFFICULTY = difficulty

        envs = coinrun_utils.make_general_env(num_processes)
        envs.spec = Coinrun_Config.GAME_TYPE
        envs = CoinRunVecPyTorch(envs, device)
        envs = add_final_pytorch_wrappers(envs)
    else:
        envs = [
            make_env(env_name, seed, i, log_dir, allow_early_resets)
            for i in range(num_processes)
        ]

        if len(envs) > 1:
            envs = ShmemVecEnv(envs, context='fork')
        else:
            envs = DummyVecEnv(envs)

        if len(envs.observation_space.shape) == 1:
            if gamma is None:
                envs = VecNormalize(envs, ret=False)
            else:
                envs = VecNormalize(envs, gamma=gamma)

        envs = VecPyTorch(envs, device)

        if num_frame_stack is not None:
            envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
        elif len(envs.observation_space.shape) == 3:
            envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
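# Usage sketch (illustrative, not from the original repo): build a batch of CoinRun
# envs on the available device. The 500-level / SET_SEED = -1 values mirror the
# comment above; the seed, process count, and log directory are placeholders.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
envs = make_vec_envs('CoinRun', seed=0, num_processes=8, gamma=0.99,
                     log_dir='/tmp/coinrun_log', device=device,
                     allow_early_resets=False,
                     coin_run_level=500, coin_run_seed=-1)
obs = envs.reset()  # expected: a torch tensor batched over the 8 processes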
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # Log the run configuration to wandb so the result curves can be visualized.
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run in parallel on a CPU;
    # the VecEnv class handles the parallel rollout.
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        # policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load(num_levels=250,
                                      starting_level=0,
                                      paint_vel_info=1,
                                      run_id='start0numlev250_256mts_dann_low',
                                      num_envs=32)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    # config = tf.ConfigProto()
    frac_gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    frac_gpu_config = tf.ConfigProto(gpu_options=frac_gpu_options)
    nogpu_config = tf.ConfigProto(device_count={'GPU': 0})
    # config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    print("Num envs: " + str(Config.NUM_ENVS))
    total_timesteps = int(256e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=frac_gpu_config):
    # with tf.Session(config=nogpu_config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # Log the run configuration to wandb so the result curves can be visualized.
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
def create_saliency(model_idx, sess):
    graph = tf.get_default_graph()

    env = utils.make_general_env(1)
    env = wrappers.add_final_wrappers(env)

    agent = create_act_model(sess, env, 1)

    # Saliency is taken w.r.t. the logit of the action selected through this placeholder.
    action_selector = tf.placeholder(tf.int32)
    gradient_saliency = saliency.GradientSaliency(graph, sess,
                                                  agent.pd.logits[0][action_selector],
                                                  agent.X)

    sess.run(tf.compat.v1.global_variables_initializer())

    # setup_utils.restore_file(models[model_idx])
    try:
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')
    except AssertionError as e:
        models[model_idx] = None
        return [None] * 3

    return agent, gradient_saliency, action_selector
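# Usage sketch (illustrative): compute a vanilla-gradient saliency map from the
# returned handles. The GetMask / VisualizeImageGrayscale calls follow the
# PAIR-code `saliency` package's TF1 API as assumed here; `sess` is an active
# tf.Session and `obs` is a (1, H, W, C) observation batch obtained elsewhere.
agent, gradient_saliency, action_selector = create_saliency(0, sess)
action, _, _, _ = agent.step(obs, agent.initial_state, np.zeros(1))
mask = gradient_saliency.GetMask(obs[0], feed_dict={action_selector: action[0]})
grayscale = saliency.VisualizeImageGrayscale(mask)  # 2D map for plotting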
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))

    print("load path:")
    print("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))

    act = deepq.learn(
        env,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        total_timesteps=0,
        load_path="{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID)
        # load_path="{}/ckpts/{}/model".format(Config.SAVE_PATH, Config.RUN_ID)
    )

    num_episodes = 500
    # while True:
    episode_rew_ls = []
    for i in range(num_episodes):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if Config.RENDER:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        episode_rew_ls.append(episode_rew)
        print("Episode reward", episode_rew)

    print("Avg episode reward", np.mean(episode_rew_ls))
    print("Std episode reward", np.std(episode_rew_ls))
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(160e6)
    if Config.LONG_TRAINING:
        total_timesteps = int(200e6)
    elif Config.SHORT_TRAINING:
        total_timesteps = int(120e6)
    save_interval = args.save_interval

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies.get_policy()

        ppo2.learn(policy=policy,
                   env=env,
                   save_interval=save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=0.95,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   log_interval=1,
                   ent_coef=Config.ENTROPY_COEFF,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * 0.2,
                   total_timesteps=total_timesteps)
def enjoy_env_sess(sess):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP

    if should_eval:
        env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    agent = create_act_model(sess, env, nenvs)

    sess.run(tf.global_variables_initializer())
    loaded_params = utils.load_params_for_scope(sess, 'model')
    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    while should_continue():
        action, values, state, _ = agent.step(obs, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        if t_step % 100 == 0:
            mpi_print('t', t_step, values[0], done[0], rew[0], curr_rews[0], np.shape(obs))

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)
            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)

        mpi_print('scores', scores / rep_count)
        print('mean_score', mean_score)
        mpi_print('max idx', max_idx)

        mpi_mean_score = utils.mpi_average([mean_score])
        mpi_print('mpi_mean', mpi_mean_score)

        result = mean_score

    return result
def main(): print("line 1...") args = setup_utils.setup_and_load() print("passed line 1...") comm = MPI.COMM_WORLD print("passed line 2...") rank = comm.Get_rank() #the rank of the process in a communicator print("passed line 3...") seed = int(time.time()) % 10000 set_global_seeds(seed * 100 + rank) print("passed line 4,5...") # utils.setup_mpi_gpus() print("passed line 6...") config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 # nenvs = Config.NUM_ENVS nenvs = 1 #set to 1 temporarily # frame_stack_size = Config.FRAME_STACK_SIZE frame_stack_size = 1 total_timesteps = int(5e6) save_interval = args.save_interval env_id = "MsPacman-v0" #copy from https://github.com/openai/baselines/blob/52255beda5f5c8760b0ae1f676aa656bb1a61f80/baselines/run.py#L33 _game_envs = defaultdict(set) for env in gym.envs.registry.all(): # TODO: solve this with regexes env_type = env._entry_point.split(':')[0].split('.')[-1] _game_envs[env_type].add(env.id) # env = make_vec_env(env_id, env_type, nenvs, seed, gamestate=args.gamestate, reward_scale=args.reward_scale) """ save_interval = args.save_interval env = utils.make_general_env(nenvs, seed=rank) with tf.Session(config=config): env = wrappers.add_final_wrappers(env) policy = policies.get_policy() """ env = utils.make_general_env(env_id, env_type, nenvs, seed) # env = make_vec_env(env_id, env_type, nenvs, seed) # env = VecFrameStack(env, frame_stack_size) # env = utils.make_general_env(nenvs, seed=rank) with tf.Session(config=config): # env = wrappers.add_final_wrappers(env) #don't use wrappers anymore env = wrappers.add_final_wrappers(env) policy = policies.get_policy() # ppo2.learn(policy=policy, # env=env, # save_interval=save_interval, # nsteps=Config.NUM_STEPS, # nminibatches=Config.NUM_MINIBATCHES, # lam=0.95, # gamma=Config.GAMMA, # noptepochs=Config.PPO_EPOCHS, # log_interval=1, # ent_coef=Config.ENTROPY_COEFF, # lr=lambda f : f * Config.LEARNING_RATE, # cliprange=lambda f : f * 0.2, # total_timesteps=total_timesteps) ppo2.learn(policy=policy, env=env, save_interval=save_interval, nsteps=int(1000), nminibatches=100, lam=0.95, gamma=0.9, noptepochs=16, log_interval=1, ent_coef=0.1, lr=lambda f: f * 3e-4, cliprange=lambda f: f * 0.2, total_timesteps=total_timesteps)
def main():
    # set seed
    args = setup_utils.setup_and_load()
    print("args are")
    print(args)
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = 0
    set_global_seeds(seed * 100 + rank)

    # Initialize env
    nenvs = 1
    env_init_size = nenvs
    env = utils.make_general_env(nenvs, seed=rank)

    # check and use GPU if available, otherwise use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device is {}".format(device))

    # wrap env (not needed with CoinRun options)
    # env = dqn_utils.wrap_deepmind(env, clip_rewards=False, frame_stack=True, scale=False)

    action_size = env.action_space.n

    # env.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # hyperparameters
    timesteps = 1000000      # 2000000; run env for this many time steps
    hidden_size = 512        # size of the hidden layer of the FFNN that connects the CNN to the outputs
    learning_rate = 0.0001   # learning rate of optimizer
    batch_size = 32          # size of batch trained on
    # start_training_after = 10000  # start training NN after this many timesteps
    discount = 0.99          # discount future states by
    is_dueling = True
    is_impala_net = False
    frame_skip = 4           # hold each action for this many frames

    # create DQN agent
    dqn_agent = dqn_utils.DQNAgent(action_size, hidden_size, learning_rate,
                                   is_dueling, is_impala_net)

    train_time_id = 1587836568
    test_time_id = int(time.time())

    load_array = []
    for i in range(20000, 250000, 20000):
        load_array.append(i)
    load_array.append('FINAL')

    # load each saved agent checkpoint and evaluate it
    for i in load_array:
        model_number = i
        PATH = "saved_models/dqn_model_{}_{}.pt".format(train_time_id, model_number)
        dqn_agent.train_net.load_state_dict(torch.load(PATH))
        dqn_agent.train_net.eval()

        # evaluation loop
        stats_rewards_list = []  # store stats for plotting in this
        stats_every = 1          # print stats every this many episodes
        total_reward = 0.
        episode = 1
        episode_length = 0
        stats_loss = 0.
        epsilon = 0.02

        # can't call env.reset() in CoinRun, so start each episode with a no-op action
        state_list, _, _, _ = env.step(np.array([0], dtype=np.int32))

        for ts in range(timesteps):
            # select an action from the agent's policy
            action = dqn_agent.select_action(state_list[0].squeeze(axis=-1),
                                             epsilon, env, batch_size)

            # enter action into the env
            for _ in range(frame_skip):
                next_state_list, reward_list, done_list, _ = env.step(action)
                total_reward += reward_list[0]
                if done_list[0]:
                    break
            done = done_list[0]
            episode_length += 1

            if done:
                state_list = env.reset()
                stats_rewards_list.append((episode, total_reward, episode_length))
                episode += 1
                if episode >= 201:
                    break
                if episode % stats_every == 0:
                    print('Episode: {}'.format(episode),
                          'Timestep: {}'.format(ts),
                          'Episode reward {}'.format(total_reward),
                          'Episode len {}'.format(episode_length),
                          'Mean reward: {:.1f}'.format(np.mean(stats_rewards_list, axis=0)[1]),
                          'Mean length: {:.1f}'.format(np.mean(stats_rewards_list, axis=0)[2]))
                    stats_loss = 0.
                total_reward = 0
                episode_length = 0
            else:
                state_list = next_state_list

        # save final stats
        stats_save_string = "saved_models/test_env_stats_{}_{}_{}.pickle".format(
            train_time_id, model_number, test_time_id)
        with open(stats_save_string, 'wb') as handle:
            pickle.dump(stats_rewards_list, handle)
def enjoy_env_sess(sess, DIR_NAME):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP

    file_name = '%s/%s.txt' % (DIR_NAME, Config.RESTORE_ID)
    f_io = open(file_name, 'a')

    if should_eval:
        if Config.TEST_NUM_EVAL > -1:
            env = utils.make_general_env(Config.TEST_NUM_EVAL)
        else:
            env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    vae = ConvVAE(z_size=Config.VAE_Z_SIZE,
                  batch_size=nenvs,
                  is_training=False,
                  reuse=False,
                  gpu_mode=True,
                  use_coord_conv=True)

    agent = create_act_model(sess, env, nenvs, Config.VAE_Z_SIZE)

    num_actions = env.action_space.n

    init_rand = tf.variables_initializer(
        [v for v in tf.global_variables() if 'randcnn' in v.name])
    sess.run(tf.compat.v1.global_variables_initializer())

    soft_numpy = tf.placeholder(tf.float32, [nenvs, num_actions], name='soft_numpy')
    dist = tfp.distributions.Categorical(probs=soft_numpy)
    sampled_action = dist.sample()

    loaded_params = utils.load_params_for_scope(sess, 'model')
    vae.load_json_full(Config.VAE_PATH)
    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    # warm up with one random step per env
    actions = [env.action_space.sample() for _ in range(nenvs)]
    actions = np.array(actions)
    obs, _, _, _ = env.step(actions)

    sess.run(init_rand)

    while should_continue():
        # scipy.misc.imsave('raw_inputs.png', obs[0])
        # encode the raw observation into the VAE latent before the policy step
        encoder_in = obs.astype(np.float32) / 255.0
        batch_z = vae.encode(encoder_in)
        # reconstruct = vae.decode(batch_z)
        # scipy.misc.imsave('recon.png', reconstruct[0])
        action, values, state, _ = agent.step(batch_z, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)
            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)
        result = mean_score

    f_io.write("{}\n".format(result))
    f_io.close()

    return result
def main(sess):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    if Config.EXTRACT_SEED != -1:
        seed = Config.EXTRACT_SEED
    if Config.EXTRACT_RANK != -1:
        rank = Config.EXTRACT_RANK

    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    use_policy = (Config.RESTORE_ID != '')

    nenvs = Config.NUM_ENVS
    total_timesteps = int(502e6)
    env = utils.make_general_env(nenvs, seed=rank)

    if use_policy:
        agent = create_act_model(sess, env, nenvs)
        sess.run(tf.compat.v1.global_variables_initializer())
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')

    # make directory
    DIR_NAME = './VAE/records/'
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME, exist_ok=True)

    # set file name
    filename = DIR_NAME + "/" + Config.get_save_file() + "_" + str(seed * 100 + rank) + ".npz"

    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        obs = np.zeros((nenv,) + env.observation_space.shape,
                       dtype=env.observation_space.dtype.name)
        obs[:] = env.reset()
        dones = [False for _ in range(nenv)]

        # remove noisy inputs by taking one random step first
        actions = [env.action_space.sample() for _ in range(nenv)]
        actions = np.array(actions)
        obs[:], rewards, dones, _ = env.step(actions)

        state = agent.initial_state if use_policy else None

        mb_obs, mb_rewards, mb_actions, mb_next_obs, mb_dones = [], [], [], [], []

        # For n in range number of steps
        for _ in range(400):
            # Given observations, get action value and neglogpacs.
            # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init.
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = [env.action_space.sample() for _ in range(nenv)]
                actions = np.array(actions)

            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)

            # Take actions in env and look at the results.
            # Infos contains a ton of useful information.
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)

        # np.savez_compressed(filename, obs=mb_obs, action=mb_actions, next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)

    return filename
def make_env():
    # `nenvs` and `rank` are taken from the enclosing scope.
    env = utils.make_general_env(nenvs, seed=rank)
    env = wrappers.add_final_wrappers(env)
    return env
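# Usage sketch (illustrative): make_env relies on `nenvs` and `rank` existing in
# its enclosing scope; the values and step count below are placeholders.
nenvs, rank = 1, 0
env = make_env()
obs = env.reset()
for _ in range(10):
    actions = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    obs, rew, done, info = env.step(actions)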
def enjoy_env_sess(sess, checkpoint, overlap):
    # base_name = str(8 * checkpoint) + 'M'
    # load_file = setup_utils.restore_file(Config.RESTORE_ID, base_name=base_name)
    should_eval = True
    mpi_print('test levels seed', Config.SET_SEED)
    mpi_print('test levels ', Config.NUM_LEVELS)
    rep_count = 50

    env = utils.make_general_env(20)
    env = wrappers.add_final_wrappers(env)
    nenvs = env.num_envs

    sess.run(tf.global_variables_initializer())
    args_now = Config.get_args_dict()
    # args_run = utils.load_args()
    agent = create_act_model(sess, env, nenvs)

    # The file to load is specified by Config.RESTORE_ID; restore_file returns True/False.
    if checkpoint == 0:
        mean_score = 0.0
        succ_rate = 0.0
        steps_elapsed = 0
        wandb.log({
            'Rew_mean': mean_score,
            'Succ_rate': succ_rate,
            'Step_elapsed': steps_elapsed
        })
        return mean_score, succ_rate
    elif checkpoint != 32:
        base_name = str(8 * checkpoint) + 'M'
    else:
        base_name = None

    sess.run(tf.global_variables_initializer())
    # env init here
    load_file = setup_utils.restore_file(Config.RESTORE_ID,
                                         overlap_config=overlap,
                                         base_name=base_name)

    is_loaded = utils.load_params_for_scope(sess, 'model')
    if not is_loaded:
        mpi_print('NO SAVED PARAMS LOADED')
        return 0.0, 0.0

    obs = env.reset()
    t_step = 0

    scores = np.zeros((nenvs, rep_count))
    eplens = np.zeros((nenvs, rep_count))
    # scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    # curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    def rollout(obs, state, done):
        """Roll out rep_count * nenvs episodes and return the scores."""
        t = 0
        count = 0
        rews = np.zeros((nenvs, rep_count))
        while should_continue():
            action, values, state, _ = agent.step(obs, state, done)
            obs, rew, done, info = env.step(action)
            rews[:, count] += rew
            t += 1

            for i, d in enumerate(done):
                if d:
                    eplens[i][count] = t
                    if score_counts[i] < rep_count:
                        score_counts[i] += 1
                        count = score_counts[i] - 1
                        # aux score
                        if 'episode' in info[i]:
                            scores[i][count] = info[i].get('episode')['r']

        return scores, rews, eplens

    if is_loaded:
        mpi_print(load_file)
        scores, rews, eplens = rollout(obs, state, done)

    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    if size == 1:
        if rank == 0:
            testset_size = rep_count * nenvs
            utils.save_pickle(scores, Config.LOGDIR + 'scores')
            mean_score = np.sum(scores) / testset_size
            succ_rate = np.sum(scores == 10.0) / testset_size
            mpi_print('cpus ', size)
            # NUM_LEVELS = 0 means an unbounded level set, so the test-set size is
            # rep_count * nenvs and each episode gets a new (possibly repeated) seed.
            mpi_print('testset size', testset_size)
            # mpi_print('score detail', scores.flatten())
            mpi_print('succ_rate', succ_rate)
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            mpi_print('mean score', mean_score)
            wandb.log({
                'Rew_mean': mean_score,
                'Succ_rate': succ_rate,
                'Step_elapsed': steps_elapsed
            })
            # mpi_print('mean score of each env', [np.mean(s) for s in scores])
    else:
        testset_size = rep_count * nenvs
        succ = np.sum(scores == 10.0) / testset_size
        succ_rate = utils.mpi_average([succ])
        mean_score_tmp = np.sum(scores) / testset_size
        mean_score = utils.mpi_average([mean_score_tmp])
        if rank == 0:
            mpi_print('testset size', rep_count * nenvs * size)
            mpi_print('load file name', load_file)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means an unbounded level set, so the test-set size is
            # rep_count * nenvs and each episode gets a new (possibly repeated) seed.
            # mpi_print('score detail', scores.flatten())
            mpi_print('succ_rate', succ_rate)
            mpi_print('mean score', mean_score)
            wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate})

    return mean_score, succ_rate
def test(sess, load_path, env, should_render=False, rep_count=Config.REP):
    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    if should_eval:
        # env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    model = load_model(sess, load_path)
    agent = create_act_model(sess, env, nenvs)

    sess.run(tf.global_variables_initializer())
    loaded_params = utils.load_params_for_scope(sess, 'model')
    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    while should_continue():
        action, values, state, _ = agent.step(obs, state, done)
        obs, rew, done, info = env.step(action)

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        if t_step % 100 == 0:
            mpi_print('t', t_step, values[0], done[0], rew[0], curr_rews[0], np.shape(obs))

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)
            curr_rews[:] = 0

    # steps_elapsed is assumed to be set elsewhere (e.g. by the caller or a module-level global).
    result = {
        'steps_elapsed': steps_elapsed,
    }

    if should_eval:
        testset_size = rep_count * nenvs
        mean_score = np.sum(scores) / testset_size
        succ_rate = np.sum(scores == 10.0) / testset_size
        max_idx = np.argmax(scores)
        mpi_print('max idx', max_idx)
        mpi_print('steps_elapsed', steps_elapsed)

        if size > 1:
            mean_score = utils.mpi_average([mean_score])
        mpi_print('mpi_mean', mean_score)

        wandb.log({'Test_Rew_mean': mean_score, 'Test_Succ_rate': succ_rate})

        result['scores'] = scores
        result['testset_size'] = testset_size
        result['test_rew_mean'] = mean_score
        result['test_succ_rate'] = succ_rate

    return result
def main():
    # check and use GPU if available, otherwise use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device is {}".format(device))

    # arguments
    args = setup_utils.setup_and_load()
    print("Arguments are")
    print(args)

    # set seed
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    time_int = int(time.time())
    seed = time_int % 10000
    set_global_seeds(seed * 100 + rank)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    if Config.NUM_ENVS > 1:
        print("To do: add multi env support")
    nenvs = 1  # Config.NUM_ENVS
    env = utils.make_general_env(nenvs, seed=rank)

    # wrap env (not needed with CoinRun options)
    # env = dqn_utils.wrap_deepmind(env, clip_rewards=False, frame_stack=True, scale=False)

    action_size = env.action_space.n

    # set up PySyft workers
    num_workers = 2
    hook = sy.TorchHook(torch)
    worker_1 = sy.VirtualWorker(hook, id='worker_1')
    worker_2 = sy.VirtualWorker(hook, id='worker_2')
    secure_worker = sy.VirtualWorker(hook, id='secure_worker')
    worker_list = []
    worker_list.append(worker_1)
    worker_list.append(worker_2)

    # Training hyperparameters
    timesteps = 250000            # 2000000; run env for this many time steps
    hidden_size = 512             # size of the hidden layer of the FFNN that connects the CNN to the outputs
    is_dueling = True
    is_impala_net = False
    learning_rate = 0.0001        # learning rate of optimizer
    batch_size = 32               # size of batch trained on
    start_training_after = 10000  # start training NN after this many timesteps
    discount = 0.99               # discount future states by
    epsilon_start = 1.0           # epsilon greedy start value
    epsilon_min = 0.02            # epsilon greedy end value
    epsilon_decay_steps = timesteps * .5  # decay epsilon over this many timesteps
    epsilon_step = (epsilon_start - epsilon_min) / epsilon_decay_steps  # decrement epsilon by this amount every timestep
    update_target_every = 1       # update target network every this many steps
    tau = 0.001                   # soft target updating amount
    frame_skip = 4                # hold each action for this many frames
    save_every = 10000            # timesteps to save model after
    train_every = 1               # number of times to train

    # create a replay buffer per worker
    replay_size = 50000           # size of replay buffer
    replay_buffer_list = []
    for i in range(num_workers):
        replay_buffer = dqn_utils.ReplayBuffer(max_size=replay_size)
        replay_buffer_list.append(replay_buffer)

    # create DQN agent
    dqn_agent = dqn_utils.DQNAgent(action_size, hidden_size, learning_rate,
                                   is_dueling, is_impala_net)

    # create stats for every worker
    stats_every = 10  # print stats every this many episodes
    stats_list = []   # store stats for each worker here
    for i in range(num_workers):
        temp_dict = {}
        temp_dict['episode'] = 0
        temp_dict['mean_reward_total'] = 0.
        temp_dict['mean_ep_length_total'] = 0.
        temp_dict['mean_reward_recent'] = 0.
        temp_dict['mean_ep_length_recent'] = 0.
        temp_dict['episode_loss'] = 0.
        temp_dict['episode_reward'] = 0.
        temp_dict['episode_length'] = 0.
        stats_list.append(temp_dict)

    # training loop
    epsilon = epsilon_start

    # Take no_action on the first step to get a state and use it to tell which level we are on.
    # env.reset() does not produce an observation in CoinRun until an action is taken.
    no_action = np.zeros((nenvs, ), dtype=np.int32)
    state_list, _, _, _ = env.step(no_action)

    # Assign each level to a worker.
    # CoinRun doesn't have a way to tell the current level, so take the mean of the first screen
    # of the level and use a dictionary to assign levels.
    # worker_level tells which replay buffer to put data into (i.e. which worker is training).
    level_worker_dict = {}
    levels_assigned = 0

    def get_worker_level(state, lw_dict, la, nw):
        temp_key = int(1000 * np.mean(state))
        if temp_key not in lw_dict:
            la += 1
            lw_dict[temp_key] = la % nw
            print("Adding new key to level_worker_dict. current size is: {}".format(len(lw_dict)))
            print(lw_dict)
        return lw_dict[temp_key], lw_dict, la

    worker_level, level_worker_dict, levels_assigned = get_worker_level(
        state_list[0], level_worker_dict, levels_assigned, num_workers)

    for ts in range(timesteps):
        # decay epsilon
        epsilon -= epsilon_step
        if epsilon < epsilon_min:
            epsilon = epsilon_min

        # select an action from the agent's policy
        action = dqn_agent.select_action(state_list[0].squeeze(axis=-1),
                                         epsilon, env, batch_size)

        # enter action into the env
        reward_frame_skip = 0.
        for _ in range(frame_skip):
            next_state_list, reward_list, done_list, _ = env.step(action)
            stats_list[worker_level]['episode_reward'] += reward_list[0]
            reward_frame_skip += reward_list[0]
            if done_list[0]:
                break
        done = done_list[0]
        stats_list[worker_level]['episode_length'] += 1

        # add experience to the current worker's replay buffer
        replay_buffer_list[worker_level].add(
            (state_list[0].squeeze(axis=-1),
             next_state_list[0].squeeze(axis=-1),
             action, reward_frame_skip, float(done)))

        if done:
            # env.reset() doesn't reset the CoinRun env but does produce an image of the
            # first frame, which we can use to get the worker_level.
            state_list = env.reset()
            worker_level, level_worker_dict, levels_assigned = get_worker_level(
                state_list[0], level_worker_dict, levels_assigned, num_workers)

            # update stats
            stats_list[worker_level]['episode'] += 1

            # overall averages
            stats_list[worker_level]['mean_reward_total'] = \
                (stats_list[worker_level]['mean_reward_total'] * (stats_list[worker_level]['episode'] - 1)
                 + stats_list[worker_level]['episode_reward']) / stats_list[worker_level]['episode']
            stats_list[worker_level]['mean_ep_length_total'] = \
                (stats_list[worker_level]['mean_ep_length_total'] * (stats_list[worker_level]['episode'] - 1)
                 + stats_list[worker_level]['episode_length']) / stats_list[worker_level]['episode']

            # keep a running average of the last stats_every episodes
            if stats_list[worker_level]['episode'] >= stats_every:
                temp_episodes_num = stats_every
            else:
                temp_episodes_num = stats_list[worker_level]['episode']
            stats_list[worker_level]['mean_reward_recent'] = (
                stats_list[worker_level]['mean_reward_recent'] * (temp_episodes_num - 1)
                + stats_list[worker_level]['episode_reward']) / temp_episodes_num
            stats_list[worker_level]['mean_ep_length_recent'] = (
                stats_list[worker_level]['mean_ep_length_recent'] * (temp_episodes_num - 1)
                + stats_list[worker_level]['episode_length']) / temp_episodes_num

            # reset episode stats
            stats_list[worker_level]['episode_reward'] = 0.
            stats_list[worker_level]['episode_length'] = 0

            # print stats
            if stats_list[worker_level]['episode'] % stats_every == 0:
                print('w: {}'.format(worker_level),
                      'epi: {}'.format(stats_list[worker_level]['episode']),
                      't: {}'.format(ts),
                      'r: {:.1f}'.format(stats_list[worker_level]['mean_reward_total']),
                      'l: {:.1f}'.format(stats_list[worker_level]['mean_ep_length_total']),
                      'r r: {:.1f}'.format(stats_list[worker_level]['mean_reward_recent']),
                      'r l: {:.1f}'.format(stats_list[worker_level]['mean_ep_length_recent']),
                      'eps: {:.2f}'.format(epsilon),
                      'loss: {:.1f}'.format(stats_list[worker_level]['episode_loss']))
                stats_list[worker_level]['episode_loss'] = 0.
        else:
            state_list = next_state_list

        if ts > start_training_after:
            # Train the agent.
            # A typical DQN gathers experience and trains once every iteration;
            # train_every modifies that to train 'train_every' times every 'train_every'-th iteration
            # (e.g. if train_every = 10, train 10 times every 10th iteration).
            if ts % train_every == 0:
                # PySyft federated learning training:
                # copy the model to each worker; each worker trains on data from its own replay buffer;
                # the updated models are sent to a secure worker which updates the new model.
                worker_dqn_list = []
                worker_dqn_target_list = []
                worker_opt_list = []
                for i in range(num_workers):
                    worker_dqn_list.append(dqn_agent.train_net.copy().send(worker_list[i]))
                    worker_dqn_target_list.append(dqn_agent.target_net.copy().send(worker_list[i]))
                    worker_opt_list.append(optim.Adam(params=worker_dqn_list[i].parameters(),
                                                      lr=learning_rate))

                for i in range(num_workers):
                    for _ in range(train_every):
                        # sample a batch from the replay buffer
                        x0, x1, a, r, d = replay_buffer_list[i].sample(batch_size)

                        # turn batches into tensors and attach to GPU if available
                        state_batch = torch.FloatTensor(x0).to(device)
                        state_batch = torch.unsqueeze(state_batch, dim=1)
                        next_state_batch = torch.FloatTensor(x1).to(device)
                        next_state_batch = torch.unsqueeze(next_state_batch, dim=1)
                        action_batch = torch.LongTensor(a).to(device)
                        reward_batch = torch.FloatTensor(r).to(device)
                        done_batch = torch.FloatTensor(1. - d).to(device)

                        # send data to worker
                        worker_state_batch = state_batch.send(worker_list[i])
                        worker_next_state_batch = next_state_batch.send(worker_list[i])
                        worker_action_batch = action_batch.send(worker_list[i])
                        worker_reward_batch = reward_batch.send(worker_list[i])
                        worker_done_batch = done_batch.send(worker_list[i])

                        train_q = worker_dqn_list[i](worker_state_batch).gather(1, worker_action_batch)

                        with torch.no_grad():
                            # Double DQN: get argmax values from the train network, use the argmax in the target network
                            train_argmax = worker_dqn_list[i](worker_next_state_batch).max(1)[1].view(batch_size, 1)
                            target_net_q = worker_reward_batch + worker_done_batch * discount * \
                                worker_dqn_target_list[i](worker_next_state_batch).gather(1, train_argmax)

                        # Get loss between train q values and target q values.
                        # DQN implementations typically use MSE loss or Huber loss (smooth_l1_loss is similar to Huber).
                        # loss_fn = nn.MSELoss()
                        # loss = loss_fn(train_q, target_net_q)
                        loss = F.smooth_l1_loss(train_q, target_net_q)

                        # optimize the parameters with the loss
                        worker_opt_list[i].zero_grad()
                        loss.backward()
                        for param in worker_dqn_list[i].parameters():
                            param.grad.data.clamp_(-1, 1)
                        worker_opt_list[i].step()

                        # get loss stats
                        # print("loss is {}".format(loss))
                        temp_loss = loss.get()
                        # print("loss get is {}".format(temp_loss))
                        stats_list[i]['episode_loss'] += temp_loss.detach().cpu().numpy()

                    # move the worker-trained model to the secure worker for updating the centralized DQN
                    worker_dqn_list[i].move(secure_worker)
                    with torch.no_grad():
                        # The first worker replaces the centralized DQN parameters, then keep a
                        # running average as each new worker's params arrive.
                        if i == 0:
                            dqn_agent.train_net.load_state_dict(worker_dqn_list[i].get().state_dict())
                        else:
                            # note: this reuses (and overwrites) the outer `tau`
                            tau = 1. / (1 + i)
                            temp_net = worker_dqn_list[i].get()
                            for dqn_var, temp_var in zip(dqn_agent.train_net.parameters(),
                                                         temp_net.parameters()):
                                dqn_var.data.copy_((1. - tau) * dqn_var.data + tau * temp_var.data)

            # save the network
            if ts % save_every == 0:
                save_string = "saved_models/dqn_model_{}_{}.pt".format(time_int, ts)
                torch.save(dqn_agent.train_net.state_dict(), save_string)
                stats_save_string = "saved_models/stats_{}_{}.pickle".format(time_int, ts)
                with open(stats_save_string, 'wb') as handle:
                    pickle.dump(stats_list, handle)

            # update the target network
            dqn_agent.update_target_network_soft(ts, update_target_every, tau)

    print("save final model")
    save_string = "saved_models/dqn_model_{}_FINAL.pt".format(time_int)
    torch.save(dqn_agent.train_net.state_dict(), save_string)
    stats_save_string = "saved_models/stats_{}_FINAL.pickle".format(time_int)
    with open(stats_save_string, 'wb') as handle:
        pickle.dump(stats_list, handle)
def train():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))

    print("==================================")
    print("Learning rate: {}, batch size: {}".format(Config.LR, Config.BATCH_SIZE))

    act = deepq.learn(
        env,
        # network=Config.ARCHITECTURE,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        lr=Config.LR,
        batch_size=Config.BATCH_SIZE,
        gamma=0.99,
        total_timesteps=Config.TOTAL_TIMESTEPS,
        buffer_size=Config.BUFFER_SIZE,
        print_freq=10,
        checkpoint_freq=Config.CHECKPOINT_FREQ,
        checkpoint_path="{}/ckpts/{}".format(Config.SAVE_PATH, Config.RUN_ID),
        # render=Config.RENDER,
        callback=None,
        exploration_fraction=0.5,
        exploration_final_eps=0.1,
        prioritized_replay=True,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000)

    # act = deepq.learn(
    #     env,
    #     # network=Config.ARCHITECTURE,
    #     network="impala_cnn",
    #     lr=Config.LR,
    #     batch_size=Config.BATCH_SIZE,
    #     gamma=0.99,
    #     total_timesteps=Config.TOTAL_TIMESTEPS,
    #     buffer_size=Config.BUFFER_SIZE,
    #     print_freq=10,
    #     checkpoint_freq=Config.CHECKPOINT_FREQ,
    #     checkpoint_path="{}/ckpts/{}".format(Config.SAVE_PATH, Config.RUN_ID),
    #     # render=Config.RENDER,
    #     callback=None,
    #     exploration_fraction=0.6,
    #     exploration_final_eps=0.04,
    #     prioritized_replay=True,
    #     train_freq=4,
    #     learning_starts=10000,
    #     target_network_update_freq=1000
    # )

    print("Saving model to {}/saved_models".format(Config.SAVE_PATH))
    act.save("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))
def train(config):
    """Training process of DQN."""
    # Initialize the environment
    env = utils.Scalarize(coinrun_utils.make_general_env(1, seed=1))

    # Create log directory and save directory if they do not exist
    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)
    if not os.path.exists(config.save_dir):
        os.makedirs(config.save_dir)

    # Create summary writer
    start_time = time.time()
    st = datetime.datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')
    tr_writer = SummaryWriter(log_dir=os.path.join(
        config.log_dir,
        "DQN Training {} c_rew {} numlvl {} seed {}".format(
            st, config.CUSTOM_REWARD_SHAPING, conrun_config.NUM_LEVELS,
            conrun_config.SET_SEED)))

    # Prepare checkpoint file and model file to save and load from
    checkpoint_file = os.path.join(config.save_dir, "checkpoint.pth")

    # Initialize training
    dqn = QLEARNING(config, env)

    # Make sure that the model is set for training
    dqn.Q_function.train()
    # the target network never learns; it is periodically copied from Q_function
    dqn.target_Q_function.eval()

    # Check for existing training results. If they exist and the configuration
    # is set to resume (`config.resume == True`), resume from previous training.
    # If not, delete the existing checkpoint.
    if os.path.exists(checkpoint_file):
        if config.resume:
            print("Checkpoint found! Resuming")
            # Read checkpoint file.
            load_res = torch.load(checkpoint_file, map_location="cpu")
            dqn.Q_function.load_state_dict(load_res["model"])
            dqn.update_target_model()
            dqn.optimizer.load_state_dict(load_res["optimizer"])
        else:
            os.remove(checkpoint_file)

    max_avg_reward = 0.04

    # Training loop
    for i in range(config.num_episodes):
        state = env.reset()
        ep_reward = 0
        ep_length = 0
        terminated = False

        while True:
            if config.render_play:
                env.render()

            action = dqn.choose_action(state)
            next_state, reward, done, info = env.step(action)

            if config.CUSTOM_REWARD_SHAPING:
                reward, terminated = rewardShaping(action, terminated, done,
                                                   ep_length, reward, state,
                                                   next_state)

            ep_length += 1
            ep_reward += reward
            dqn.store_transition(state, action, reward, next_state, done)
            dqn.learn(tr_writer)

            if done:
                break
            state = copy.copy(next_state)

        print("finished ep: {} , ep_rew {} len {}. eps {}. time_passed {}, memory_size {} ".format(
            i, ep_reward, ep_length, dqn.epsilon,
            time.time() - start_time, dqn.memory.size()))

        tr_writer.add_scalar("ep_length", ep_length, global_step=i)
        ep_reward_norm = ep_reward
        if config.CUSTOM_REWARD_SHAPING:
            # normalize so it is in the same range as the default reward in the graph plotting
            ep_reward_norm = ep_reward / 100
        avg_reward_norm = ep_reward_norm / ep_length
        tr_writer.add_scalar("ep_reward", ep_reward_norm, global_step=i)
        tr_writer.add_scalar("Avg. reward per step", avg_reward_norm, global_step=i)

        if config.test_while_train and avg_reward_norm > max_avg_reward \
                and dqn.memory.size() > dqn.minimum_memory:
            if test(config, dqn.Q_function, 3):
                torch.save(
                    {
                        "current_ep": i,
                        "model": dqn.Q_function.state_dict(),
                        "optimizer": dqn.optimizer.state_dict(),
                    },
                    os.path.join(
                        config.save_dir,
                        "bestmodel_{}_cRew_{}_numlvl_{}_seed_{}.pth".format(
                            i, config.CUSTOM_REWARD_SHAPING,
                            conrun_config.NUM_LEVELS, conrun_config.SET_SEED)))
                max_avg_reward = avg_reward_norm
            dqn.Q_function.train()

        if (i % 50) == 0:  # hardcoded save checkpoint interval
            torch.save(
                {
                    "current_ep": i,
                    "model": dqn.Q_function.state_dict(),
                    "optimizer": dqn.optimizer.state_dict(),
                }, checkpoint_file)

    torch.save(
        {
            "current_ep": i,
            "model": dqn.Q_function.state_dict(),
            "optimizer": dqn.optimizer.state_dict(),
        },
        os.path.join(
            config.save_dir,
            "final_{}_rShaping_{}_numlvl_{}_seed_{}_time_{}.pth".format(
                i, config.CUSTOM_REWARD_SHAPING, conrun_config.NUM_LEVELS,
                conrun_config.SET_SEED, st)))

    env.close()
def enjoy_env_sess(sess, DIR_NAME):
    should_render = True
    should_eval = Config.TRAIN_EVAL or Config.TEST_EVAL
    rep_count = Config.REP
    mpi_print = utils.mpi_print

    file_name = '%s/%s.txt' % (DIR_NAME, Config.RESTORE_ID)
    f_io = open(file_name, 'a')

    if should_eval:
        if Config.TEST_NUM_EVAL > -1:
            env = utils.make_general_env(Config.TEST_NUM_EVAL)
        else:
            env = utils.make_general_env(Config.NUM_EVAL)
        should_render = False
    else:
        env = utils.make_general_env(1)

    env = wrappers.add_final_wrappers(env)

    if should_render:
        from gym.envs.classic_control import rendering

    nenvs = env.num_envs

    agent = create_act_model(sess, env, nenvs)
    num_actions = env.action_space.n

    init_rand = tf.variables_initializer(
        [v for v in tf.global_variables() if 'randcnn' in v.name])
    sess.run(tf.compat.v1.global_variables_initializer())

    soft_numpy = tf.placeholder(tf.float32, [nenvs, num_actions], name='soft_numpy')
    dist = tfp.distributions.Categorical(probs=soft_numpy)
    sampled_action = dist.sample()

    loaded_params = utils.load_params_for_scope(sess, 'model')
    if not loaded_params:
        print('NO SAVED PARAMS LOADED')

    obs = env.reset()
    t_step = 0

    if should_render:
        viewer = rendering.SimpleImageViewer()

    should_render_obs = not Config.IS_HIGH_RES

    def maybe_render(info=None):
        if should_render and not should_render_obs:
            env.render()

    maybe_render()

    scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)
    curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs
        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    sess.run(init_rand)

    while should_continue():
        if Config.USE_LSTM == 8425 or Config.USE_LSTM == 1081:
            q_actions, values, state, _ = agent.step(obs, state, done)
            # e-greedy: with probability 0.1 take a random action instead of the greedy one
            greedy_flag = np.random.rand(q_actions.shape[0])
            greedy_flag = (greedy_flag < 0.1).astype(np.int)
            random_actions = np.random.randint(0, num_actions, size=q_actions.shape[0])
            action = random_actions * greedy_flag + (1 - greedy_flag) * q_actions
        else:
            total_soft = agent.get_softmax(obs, state, done)
            action = sess.run([sampled_action], {soft_numpy: total_soft})
            action = action[0]
            # action, values, state, _ = agent.step(obs, state, done)

        obs, rew, done, info = env.step(action)
        # scipy.misc.imsave('raw_inputs.png', obs[0])

        if should_render and should_render_obs:
            if np.shape(obs)[-1] % 3 == 0:
                ob_frame = obs[0, :, :, -3:]
            else:
                ob_frame = obs[0, :, :, -1]
                ob_frame = np.stack([ob_frame] * 3, axis=2)
            viewer.imshow(ob_frame)

        curr_rews[:, 0] += rew

        for i, d in enumerate(done):
            if d:
                if score_counts[i] < rep_count:
                    score_counts[i] += 1

                    if 'episode' in info[i]:
                        scores[i] += info[i].get('episode')['r']

        maybe_render(info[0])

        t_step += 1

        if should_render:
            time.sleep(.02)

        if done[0]:
            if should_render:
                mpi_print('ep_rew', curr_rews)
            curr_rews[:] = 0

    result = 0

    if should_eval:
        mean_score = np.mean(scores) / rep_count
        max_idx = np.argmax(scores)
        result = mean_score

    f_io.write("{}\n".format(result))
    f_io.close()

    return result