import pickle

import numpy as np

# Imports follow the OpenAI-baselines layout used by this GVGAI fork.
from baselines.a2c.a2c import Model
from baselines.common.cmd_util import make_vec_env
from baselines.common.policies import build_policy
from baselines.common.vec_env.vec_frame_stack import VecFrameStack


def main():
    num_tests = 40
    env_args = {
        'episode_life': False,
        'clip_rewards': False,
        'crop': True,
        'rotate': True
    }
    env = VecFrameStack(
        make_vec_env("gvgai-zelda-lvl0-v0", num_tests, 43,
                     wrapper_kwargs=env_args), 4)
    policy = build_policy(env, "cnn")
    model = Model(policy=policy, env=env, nsteps=5)
    model.load('logs/test_4*5_r1_right/checkpoints/260000')

    result = dict()
    for j in range(201, 601):
        done = np.array([False] * num_tests)
        env.venv.set_level(
            "GVGAI_GYM/gym_gvgai/envs/games/zelda_v0/zelda_lvl{}.txt".format(j))
        obs = env.reset()
        infos = [False] * num_tests
        while not all(done):
            actions, values, state, _ = model.step(obs)
            obs, rewards, dones, info = env.step(actions)
            done[dones] = True
            # Record each env's terminal info the first time it finishes.
            for i in np.where(dones)[0].tolist():
                if not infos[i]:
                    del info[i]["grid"]
                    del info[i]["ascii"]
                    infos[i] = info[i]
        win = [1 if (i['winner'] == 'PLAYER_WINS') else 0 for i in infos]
        # score = [i['episode']['r'] for i in infos]
        # steps = [i['episode']['l'] for i in infos]
        # time = [i['episode']['t'] for i in infos]
        print("level {}".format(j), win)
        result[j] = infos
    env.close()
    with open("result_4*5_r1_right_200~600", "wb") as f:
        pickle.dump(result, f)
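# --- A small follow-up sketch (hypothetical, not part of the original): load
# the pickled results written by main() above and print a per-level win rate.
# It assumes each stored info dict carries the 'winner' field used in main().
import pickle

with open("result_4*5_r1_right_200~600", "rb") as f:
    result = pickle.load(f)
for level, infos in sorted(result.items()):
    wins = sum(1 for info in infos if info['winner'] == 'PLAYER_WINS')
    print("level {}: {}/{} wins".format(level, wins, len(infos)))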
import numpy as np

from baselines.common.vec_env.vec_frame_stack import VecFrameStack

# _make_atari_env is this module's Atari env factory (defined elsewhere).


class A2CTester:
    """Tests the network used in an A2C agent to see how it performs in the
    environment."""

    def __init__(self, model, env_id, num_env: int = 4, seed: int = 1,
                 wrapper_kwargs=None, start_index=0, stack_frames: int = 4):
        if wrapper_kwargs is None:
            wrapper_kwargs = {}
        wrapper_kwargs["episode_life"] = False
        self.env = VecFrameStack(
            _make_atari_env(env_id, num_env, seed, wrapper_kwargs, start_index),
            stack_frames)
        self.model = model
        self.end_ix = np.zeros(num_env, dtype=bool)
        self.states = model.initial_state
        self.obs = None
        self.dones = None

    def end_condition(self):
        """True once every sub-environment has finished at least one episode."""
        self.end_ix = np.logical_or(self.end_ix, self.dones)
        return self.end_ix.all()

    def play_game(self):
        self.obs = self.env.reset()
        self.end_ix = np.zeros(len(self.obs), dtype=bool)
        self.dones = np.zeros(len(self.obs), dtype=bool)
        total_rewards = []
        total_len = []
        current_rewards = np.zeros(len(self.obs))
        game_len = 1
        while not self.end_condition():
            try:
                actions, values, states, _ = self.model.step(
                    self.obs, self.states, self.dones)
                obs, rewards, dones, infos = self.env.step(actions)
                game_len += 1
                current_rewards += rewards
                self.states = states
                self.dones = dones
                for n, done in enumerate(dones):
                    if done:
                        self.obs[n] = self.obs[n] * 0
                        total_rewards.append(float(current_rewards[n]))
                        total_len.append(game_len)
                self.obs = obs
            except RuntimeError:
                break
        return total_rewards, total_len
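# --- Usage sketch (hypothetical): a duck-typed random "model" lets the
# episode accounting in A2CTester be smoke-tested without a trained
# checkpoint. RandomModel and the Pong env id are illustrative assumptions,
# and the sketch assumes _make_atari_env is importable alongside A2CTester.
import numpy as np


class RandomModel:
    """Stand-in matching the (obs, states, dones) -> (actions, ...) interface."""
    initial_state = None

    def __init__(self, n_actions):
        self.n_actions = n_actions

    def step(self, obs, states, dones):
        # Return (actions, values, states, neglogpacs) like a real model.step.
        actions = np.random.randint(0, self.n_actions, size=len(obs))
        return actions, None, states, None


tester = A2CTester(RandomModel(6), "PongNoFrameskip-v4", num_env=2, seed=0)
episode_rewards, episode_lengths = tester.play_game()
print("rewards:", episode_rewards, "lengths:", episode_lengths)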
import tensorflow as tf

from baselines.common.vec_env.vec_frame_stack import VecFrameStack


def main():
    env = VecFrameStack(make_sf2_env(), 1)
    obs = env.reset()
    n_steps = 128  # 5 * FPS
    options = {
        'network': 'mlp',  # 'impala_cnn'
        'env': env,
        'total_timesteps': 40000000,
        'nsteps': n_steps,  # TODO: Do we still need to pass nsteps here?
        'q_coef': 1.0,
        'ent_coef': 0.001,
        'max_grad_norm': 10,
        'lr': 7e-4,
        'lrschedule': 'linear',
        'rprop_epsilon': 1e-5,
        'rprop_alpha': 0.99,
        'gamma': 0.99,
        'log_interval': 1000,
        'buffer_size': 50000,
        'replay_ratio': 4,
        'replay_start': 10000,
        'c': 10.0,
        'trust_region': True,
        'delta': 1,
        'alpha': 0.99,
        # 'load_path': MODEL_PATH,
        'save_interval': 1000,
        # neural network parameters
        'activation': tf.nn.relu,
        'num_layers': 2,  # 4, 2
        'num_hidden': 48,  # 64, 64
        'layer_norm': False,
    }
    models = (
        Acer(**options),
        Acer(**options)
    )
    runner = Runner(env, models, n_steps)
    try:
        while True:
            runner.run()
            # obs, rew, done, info = env.step((
            #     env.action_space.sample(),
            #     env.action_space.sample()
            # ))
            # env.render()
            # if done:
            #     obs = env.reset()
    finally:
        env.close()
# Imports assume the baselines ppo2 layout; make_neyboy_env is this project's
# own env factory (defined elsewhere).
from baselines.common.policies import build_policy
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.ppo2.model import Model


def train(env_id, seed, policy, load_path, num_episodes, frame_skip, no_render):
    env = make_neyboy_env(env_id, 1, seed, allow_early_resets=True,
                          frame_skip=frame_skip, save_video=True)
    env = VecFrameStack(env, 4)
    policy = build_policy(env, policy)
    ob_space = env.observation_space
    ac_space = env.action_space
    ent_coef = .01
    vf_coef = 0.5
    max_grad_norm = 0.5
    # nbatch_train and nsteps are irrelevant at inference time, so pass 0.
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                  nbatch_act=env.num_envs, nbatch_train=0, nsteps=0,
                  ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)
    model.load(load_path)

    for _ in range(num_episodes):
        if not no_render:
            env.render()
        observation, done = env.reset(), False
        if not no_render:
            env.render()
        episode_rew = 0
        score = 0
        while not done:
            if not no_render:
                env.render()
            action, _, _, _ = model.step(observation)
            observation, reward, done, info = env.step(action)
            episode_rew += reward
            score = info[0]
        print('Episode reward={}, info={}'.format(episode_rew, score))
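# --- Hypothetical CLI wrapper for train() above; the flag names mirror its
# parameters, but the flags themselves and the default env id are assumptions,
# not the project's actual entry point.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env-id', default='neyboy-v0')  # illustrative default
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy', default='cnn')
    parser.add_argument('--load-path', required=True)
    parser.add_argument('--num-episodes', type=int, default=10)
    parser.add_argument('--frame-skip', type=int, default=4)
    parser.add_argument('--no-render', action='store_true')
    args = parser.parse_args()
    train(args.env_id, args.seed, args.policy, args.load_path,
          args.num_episodes, args.frame_skip, args.no_render)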
import numpy as np
from gym import Env
from gym.spaces import Box, Discrete
from vizdoom import DoomGame, GameVariable, Mode, ScreenResolution

from baselines.common.vec_env.vec_frame_stack import VecFrameStack

# get_observation, check_enemy_enter, check_enemy_leave, VecEnvAdapter,
# Model and CnnPolicy are this project's own helpers (defined elsewhere).


class ShootEnv(Env):
    def __init__(self):
        self.game = DoomGame()
        self.game.load_config('O:\\Doom\\scenarios\\cig_flat2.cfg')
        self.game.add_game_args(
            "-host 1 -deathmatch +timelimit 1.0 "
            "+sv_forcerespawn 1 +sv_noautoaim 1 +sv_respawnprotect 1 "
            "+sv_spawnfarthest 1 +sv_nocrouch 1 "
            "+viz_respawn_delay 0")
        self.game.set_mode(Mode.PLAYER)
        self.game.set_labels_buffer_enabled(True)
        self.game.set_depth_buffer_enabled(True)
        self.game.set_screen_resolution(ScreenResolution.RES_320X240)
        self.game.add_available_game_variable(GameVariable.FRAGCOUNT)

        # define the navigation sub-environment
        class NavigatorSubEnv(Env):
            def __init__(self, game):
                self.action_space = Discrete(3)
                self.observation_space = Box(low=0, high=255,
                                             shape=(84, 84, 3), dtype=np.uint8)
                self._game = game

            def step(self, action):
                # -1 means the action does not actually control the game
                if action > -1:
                    one_hot_action = [[1, 0, 0, 0, 0, 0],
                                      [0, 1, 0, 0, 0, 0],
                                      [0, 0, 0, 0, 1, 0]]
                    self._game.make_action(one_hot_action[action], 4)
                if self._game.is_episode_finished():
                    self._game.new_episode()
                if self._game.is_player_dead():
                    self._game.respawn_player()
                obs = get_observation(self._game.get_state())
                # navigation ends once an enemy enters the view;
                # gym expects an info dict, so return {} instead of None
                return (get_observation(self._game.get_state(),
                                        real_frame=True),
                        0, check_enemy_enter(obs), {})

            def seed(self, seed=None):
                pass

            def reset(self):
                return get_observation(self._game.get_state(), real_frame=True)

            def render(self, mode='human'):
                pass

        self.navigator = VecFrameStack(
            VecEnvAdapter([NavigatorSubEnv(self.game)]), 4)
        # define the navigation network
        self.navigation_policy = Model(CnnPolicy,
                                       self.navigator.observation_space,
                                       self.navigator.action_space,
                                       nenvs=1, nsteps=1)
        self.navigation_policy.load(
            'O:\\Doom\\baselinemodel\\navigate_real2.dat')
        self.action_space = Discrete(3)  # turn left, turn right, fire
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 3),
                                     dtype=np.uint8)
        self.available_actions = [[1, 0, 0, 0, 0, 0],
                                  [0, 1, 0, 0, 0, 0],
                                  [0, 0, 0, 0, 0, 1]]

    def seed(self, seed=None):
        self.game.set_seed(seed)
        self.game.init()
        self.game.send_game_command("removebots")
        for i in range(8):
            self.game.send_game_command("addbot")

    def reset(self):
        # let the pretrained navigator walk until an enemy is in sight
        obs_for_navigator = self.navigator.reset()
        while True:
            actions, _, _, _ = self.navigation_policy.step(obs_for_navigator)
            obs_for_navigator, _, navi_done, _ = self.navigator.step(actions)
            if navi_done:
                break
        obs = get_observation(self.game.get_state())
        assert check_enemy_enter(obs)
        return get_observation(self.game.get_state(), real_frame=True)

    def step(self, action):
        old_fragcount = self.game.get_game_variable(GameVariable.FRAGCOUNT)
        self.game.make_action(self.available_actions[action], 4)
        new_fragcount = self.game.get_game_variable(GameVariable.FRAGCOUNT)
        rew = new_fragcount - old_fragcount
        done = False
        if self.game.is_episode_finished():
            done = True
            self.game.new_episode()
            self.game.send_game_command("removebots")
            for i in range(8):
                self.game.send_game_command("addbot")
        if self.game.is_player_dead():
            self.game.respawn_player()
            done = True
        if action == 2:  # fire
            rew -= 0.05
        state = self.game.get_state()
        obs = get_observation(state)
        if check_enemy_enter(obs):
            rew += 0.01
        if check_enemy_leave(obs):
            done = True
        return get_observation(state, real_frame=True), rew, done, {}
import numpy as np
import torch

from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from models import CNN_Net

# get_args comes from this project's arguments module (defined elsewhere).


# convert stacked observations from NHWC to NCHW float tensors for the network
def get_tensors(obs):
    return torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32)


if __name__ == '__main__':
    args = get_args()
    # create the environment
    env = VecFrameStack(make_atari_env(args.env_name, 1, args.seed), 4)
    # load the trained model
    model_path = args.save_dir + args.env_name + '/model.pt'
    network = CNN_Net(env.action_space.n)
    network.load_state_dict(
        torch.load(model_path, map_location=lambda storage, loc: storage))
    # run the test loop with a greedy (argmax) policy
    obs = env.reset()
    for _ in range(10000):
        env.render()
        obs_tensor = get_tensors(obs)
        with torch.no_grad():
            _, pi = network(obs_tensor)
        action = torch.argmax(pi, dim=1).item()
        obs, reward, done, _ = env.step([action])
    env.close()
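# --- Optional sketch: act stochastically at test time instead of greedily.
# This assumes `pi` holds unnormalised action logits of shape (1, n_actions),
# as returned by CNN_Net above; sample_action is a hypothetical helper.
import torch


def sample_action(pi):
    """Sample one action index from policy logits of shape (1, n_actions)."""
    probs = torch.softmax(pi, dim=1)
    return torch.multinomial(probs, num_samples=1).item()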
import numpy as np
import tensorflow as tf

from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

# sf01 is the usual ppo2 helper that swaps and flattens axes 0 and 1.


class Runner(object):
    def __init__(self, *, env, model, nsteps, gamma, lam, writer=None):
        self.env = env
        self.model = model
        nenv = env.num_envs
        self.obs = np.zeros((nenv,) + env.observation_space.shape,
                            dtype=model.train_model.X.dtype.name)
        self.obs[:] = env.reset()
        self.gamma = gamma
        self.lam = lam
        self.nsteps = nsteps
        self.states = model.initial_state
        self.dones = [False for _ in range(nenv)]
        # For summaries
        self.writer = writer
        self.summary = tf.Summary()
        self.validation_num = 0
        self.val_env = VecFrameStack(
            make_atari_env(env.env_id, env.num_envs, 100), 4,
            env_id=env.env_id)

    # NEEDS MORE WORK
    def validate(self, val_num):
        obs = self.val_env.reset()
        dones = np.zeros(self.val_env.num_envs, dtype=bool)
        total_r = np.zeros(self.val_env.num_envs, dtype=float)
        total_steps = np.zeros(self.val_env.num_envs, dtype=float)
        static_dones = np.zeros(self.val_env.num_envs, dtype=bool)
        # Copy policy
        # val_policy = copy.deepcopy(self.model.act_model)
        val_policy = self.model.act_model
        # Init policy
        # states = val_policy.states
        while not np.all(static_dones):
            actions, values, states, neglogpacs = val_policy.step(obs, dones)
            obs, rewards, dones, info = self.val_env.step(actions)
            total_r += np.multiply(np.logical_not(static_dones), rewards)
            static_dones = np.logical_or(dones, static_dones)
            total_steps += np.multiply(np.logical_not(static_dones), 1)
        # TB Reporting
        self.summary.value.add(tag='Mean_Validation_Reward',
                               simple_value=total_r.mean())
        self.summary.value.add(tag='Mean_Validation_Episode_length',
                               simple_value=total_steps.mean())
        self.writer.add_summary(self.summary, val_num)

    def run(self, update_num):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = \
            [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        for _ in range(self.nsteps):
            actions, values, self.states, neglogpacs = self.model.step(
                self.obs, self.states, self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)
        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        # TB Reporting
        self.summary.value.add(tag='Avg_Batch_Reward',
                               simple_value=mb_rewards.mean())
        self.writer.add_summary(self.summary, update_num)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        last_values = self.model.value(self.obs, self.states, self.dones)
        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = (mb_rewards[t] + self.gamma * nextvalues * nextnonterminal
                     - mb_values[t])
            mb_advs[t] = lastgaelam = (delta + self.gamma * self.lam *
                                       nextnonterminal * lastgaelam)
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions,
                            mb_values, mb_neglogpacs)), mb_states, epinfos)
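# --- A minimal, self-contained sketch of the GAE recursion used in run()
# above, for a single environment with scalar rewards. One simplification to
# note: here dones[t] marks termination *on* step t, whereas the Runner stores
# the dones observed *before* each step and therefore indexes them one later.
import numpy as np


def gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    """Return (advantages, returns) computed backwards through a rollout."""
    nsteps = len(rewards)
    advs = np.zeros(nsteps, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        nextvalue = last_value if t == nsteps - 1 else values[t + 1]
        nextnonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t]
        lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
        advs[t] = lastgaelam
    return advs, advs + np.asarray(values, dtype=np.float32)


# e.g. gae(rewards=[1, 0, 0], values=[0.5, 0.4, 0.3], dones=[0, 0, 1],
#          last_value=0.2)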
learning_returns_extrapolate = []
pred_returns_extrapolate = []
for checkpoint in checkpoints_extrapolate:
    model_path = model_dir + "/models/" + env_name + "_25/" + checkpoint
    demonstrator.load(model_path)
    for i in range(episode_count):
        done = False
        traj = []
        r = 0
        ob = env.reset()
        steps = 0
        acc_reward = 0
        while True:
            action = demonstrator.act(ob, r, done)
            ob, r, done, _ = env.step(action)
            ob_processed = preprocess(ob, env_name)
            # get rid of the spurious first dimension: ob.shape == (1, 84, 84, 4)
            ob_processed = ob_processed[0]
            traj.append(ob_processed)
            steps += 1
            acc_reward += r[0]
            if done:
                print("checkpoint: {}, steps: {}, return: {}".format(
                    checkpoint, steps, acc_reward))
                break
import time
from multiprocessing import Process

import numpy as np
import tensorflow as tf

from baselines.common import set_global_seeds
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

# Model is this project's own policy wrapper (defined elsewhere).


class Runner(Process):
    def __init__(self, env_id, seed, ob_space, ac_space, output_queue,
                 task_index, cluster):
        Process.__init__(self)
        self.env_id = env_id
        self.seed = seed
        self.output_queue = output_queue
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.task_index = task_index
        self.cluster = cluster
        set_global_seeds(seed)

    def run(self):
        server = tf.train.Server(self.cluster, job_name='actor',
                                 task_index=self.task_index)
        with tf.Session(server.target) as sess:
            # shared_job_device = '/gpu:0'
            shared_job_device = '/job:ps/task:0/'
            actor_device = "/job:actor/task:{}/cpu:0".format(self.task_index)
            with tf.device(shared_job_device):
                global_model = Model(sess, self.ob_space, self.ac_space)
            with tf.device(actor_device):
                self.model = Model(sess, self.ob_space, self.ac_space,
                                   scope='sub-' + str(self.task_index))
            self.env = VecFrameStack(
                make_atari_env(self.env_id, 1, self.seed), 4)
            obs = self.env.reset()
            sess.run(tf.variables_initializer([
                v for v in tf.global_variables()
                if v.name.startswith('sub-' + str(self.task_index))
            ]))
            sess.run(tf.global_variables_initializer())
            time.sleep(5)
            while True:
                obs_list = []
                acs_list = []
                ret_list = []
                total_ret = 0
                total_t = 0
                tmp_t = 0
                obs_buf = []
                acs_buf = []
                # pull the latest parameters from the parent/global model
                self.model.copy('parent')
                while True:
                    acs = self.model.step(obs)
                    new_obs, rewards, dones, _ = self.env.step(acs)
                    total_ret += rewards[0]
                    total_t += 1
                    tmp_t += 1
                    obs_buf.append(obs[0])
                    acs_buf.append(acs[0])
                    obs = new_obs
                    if rewards[0] != 0:
                        # keep only transition segments that ended in a
                        # positive reward; discard the rest
                        if rewards[0] > 0:
                            obs_list.extend(obs_buf)
                            acs_list.extend(acs_buf)
                            ret_list.extend([rewards[0] for _ in range(tmp_t)])
                        obs_buf = []
                        acs_buf = []
                        tmp_t = 0
                    if dones[0]:
                        obs_arr = np.array(obs_list)
                        acs_arr = np.array(acs_list)
                        ret_arr = np.array(ret_list)
                        # ret_arr = ret_arr + 0.01 * total_ret
                        while self.output_queue.qsize() >= 3:
                            time.sleep(0.01)
                        self.output_queue.put(
                            (obs_arr, acs_arr, ret_arr, total_ret, total_t))
                        break