def test(flags, num_episodes: int = 10):
    if flags.xpid is None:
        checkpointpath = "./latest/model.tar"
    else:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
        )
    gym_env = create_env(flags)
    env = environment.Environment(gym_env)
    model = Net(gym_env.observation_space.shape, gym_env.action_space.n, flags.use_lstm)
    model.eval()
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()
    returns = []
    while len(returns) < num_episodes:
        if flags.mode == "test_render":
            env.gym_env.render()
        agent_outputs = model(observation)
        policy_outputs, _ = agent_outputs
        observation = env.step(policy_outputs["action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
    env.close()
    logging.info(
        "Average returns over %i episodes: %.1f", num_episodes, sum(returns) / len(returns)
    )

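# A minimal sketch of the monobeast-style entry point that dispatches to the `test`
# function above. The `train` function and the flag wiring are assumed to be defined
# elsewhere; only the mode-based dispatch is shown.
def main(flags):
    if flags.mode == "train":
        train(flags)  # training entry point, defined elsewhere (assumption)
    else:
        # covers both "test" and "test_render"
        test(flags)
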
def test_1(flags, model, num_episodes: int = 10):
    gym_env = create_env(flags)
    env = environment.Environment(gym_env)

    observation = env.initial()
    returns = []
    hidden_state = model.initial_state(batch_size=1)
    while len(returns) < num_episodes:
        if flags.mode == "test_render":
            env.gym_env.render()
        agent_outputs, new_hidden_state = model(observation, hidden_state)
        hidden_state = new_hidden_state
        policy_outputs = agent_outputs
        observation = env.step(policy_outputs["action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
            # Reset the recurrent state at episode boundaries.
            hidden_state = model.initial_state(batch_size=1)
    env.close()
    return np.mean(returns), np.std(returns)

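# Since `test_1` takes a live model and returns (mean, std), it can be called
# periodically from a training loop. A minimal sketch, assuming an `eval_interval`
# flag and that the caller owns the train/eval mode switching:
def evaluate_periodically(flags, model, step: int):
    if step % flags.eval_interval != 0:
        return None
    model.eval()
    mean_return, std_return = test_1(flags, model, num_episodes=10)
    model.train()
    logging.info("Step %i: eval return %.2f ± %.2f", step, mean_return, std_return)
    return mean_return, std_return
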
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
    level_name,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env = create_env(flags, level_name, seed)
        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                timings.time("model")

                env_output = env.step(agent_output["action"])
                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e

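# A sketch of how actors like `act` above are typically launched in monobeast-style
# code: one process per actor, all sharing the model, the rollout buffers, and one
# pair of queues. `flags.num_buffers` and the "fork" start method are assumptions
# taken from the reference implementation.
def start_actors(flags, model, buffers, initial_agent_state_buffers, level_name):
    ctx = mp.get_context("fork")
    free_queue = ctx.SimpleQueue()
    full_queue = ctx.SimpleQueue()
    for index in range(flags.num_buffers):
        free_queue.put(index)  # every buffer starts out free
    actor_processes = []
    for i in range(flags.num_actors):
        actor = ctx.Process(
            target=act,
            args=(flags, i, free_queue, full_queue, model,
                  buffers, initial_agent_state_buffers, level_name),
        )
        actor.start()
        actor_processes.append(actor)
    return actor_processes, free_queue, full_queue
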
def test(flags, num_episodes: int = 10):
    if flags.xpid is None:
        checkpointpath = "./latest/model.tar"
    else:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
        )
    # Output paths for the saved frame arrays. Placing them next to the checkpoint
    # is an assumption; adjust as needed.
    videopath = os.path.join(os.path.dirname(checkpointpath), "video_frames.npy")
    attentionpath = os.path.join(os.path.dirname(checkpointpath), "attention_frames.npy")

    gym_env = create_env(flags)
    env = environment.Environment(gym_env)
    model = Net(num_actions=gym_env.action_space.n)
    model.eval()
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()
    returns = []
    video_frames = []
    attention_frames = []
    hidden_state = model.initial_state(batch_size=1)
    while len(returns) < num_episodes:
        if flags.mode == "test_render":
            env.gym_env.render()
        agent_outputs, new_hidden_state = model(observation, hidden_state)
        hidden_state = new_hidden_state
        policy_outputs, _ = agent_outputs
        observation = env.step(policy_outputs["action"])
        # In "write_videos" mode, rendered frames and attention maps would be
        # appended to video_frames/attention_frames here (collection code not shown).
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
            hidden_state = model.initial_state(batch_size=1)

    if flags.mode == "write_videos":
        # Save numpy arrays, so we can make videos somewhere else.
        video_frames = np.asarray(video_frames)
        with open(videopath, "wb") as f:
            np.save(f, video_frames)
        attention_frames = np.asarray(attention_frames)
        with open(attentionpath, "wb") as f:
            np.save(f, attention_frames)

    env.close()
    logging.info(
        "Average returns over %i episodes: %.1f", num_episodes, sum(returns) / len(returns)
    )

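# The arrays saved above in "write_videos" mode can be turned into a video offline.
# A minimal sketch using imageio (an assumption; any frame-to-video tool works):
def write_video(npy_path: str, out_path: str = "episode.mp4", fps: int = 30):
    import imageio  # assumed dependency (with imageio-ffmpeg for mp4 output)
    frames = np.load(npy_path).astype(np.uint8)  # (T, H, W) or (T, H, W, C) images
    imageio.mimwrite(out_path, list(frames), fps=fps)
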
def test(flags, game_params, num_episodes: int = 10):
    if flags.xpid is None:
        raise Exception(
            "Specify an experiment id with --xpid. The `latest` option is not supported."
        )
    else:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
        )
    replay_dict = dict(
        save_replay_episodes=1,
        replay_dir="Replays/",
        replay_prefix=flags.map_name,
    )
    sc_env = init_game(game_params["env"], flags.map_name, **replay_dict)
    # Use the CPU as the default device for testing.
    model = IMPALA_AC(env=sc_env, device="cpu", **game_params["HPs"])
    obs_processer = IMPALA_ObsProcesser(
        action_table=model.action_table, **game_params["obs_processer"]
    )
    env = environment.Environment(sc_env, obs_processer)
    model.eval()  # disable dropout
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()  # env.reset
    returns = []
    while len(returns) < num_episodes:
        with torch.no_grad():
            agent_outputs = model.actor_step(observation)
        observation = env.step(agent_outputs["sc_env_action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
    env.close()
    returns = np.array(returns)
    logging.info(
        "Average returns over %i episodes: %.2f (std %.2f)",
        num_episodes,
        returns.mean(),
        returns.std(),
    )
    print("Saving to file")
    np.save("%s/%s/test_results" % (flags.savedir, flags.xpid), returns)

def act(
    flags,
    game_params,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        sc_env = init_game(game_params["env"], flags.map_name, random_seed=seed)
        obs_processer = IMPALA_ObsProcesser(
            action_table=model.action_table, **game_params["obs_processer"]
        )
        env = environment.Environment(sc_env, obs_processer, seed)

        # The initial rollout starts here.
        env_output = env.initial()
        with torch.no_grad():
            agent_output = model.actor_step(env_output)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                if key != "sc_env_action":  # no need to save this key in the buffers
                    buffers[key][index][0, ...] = agent_output[key]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                env_output = env.step(agent_output["sc_env_action"])
                timings.time("step")

                with torch.no_grad():
                    agent_output = model.actor_step(env_output)
                timings.time("model")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    if key != "sc_env_action":  # no need to save this key in the buffers
                        buffers[key][index][t + 1, ...] = agent_output[key]

                # env_output contains
                #   s_0, ..., s_T
                #   act_mask_0, ..., act_mask_T
                #   discount_0, ..., discount_T
                #   r_{-1}, ..., r_{T-1}
                # agent_output contains
                #   a_0, ..., a_T with a_t ~ pi(.|s_t)
                #   log_pi(a_0|s_0), ..., log_pi(a_T|s_T)
                # so the learner can use (s_i, act_mask_i) to predict log_pi_i.
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e

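# A sketch of how the `buffers` consumed by `act` could be allocated: for every key,
# a list of `flags.num_buffers` shared-memory tensors holding unroll_length + 1 steps,
# matching the s_0..s_T layout described above. The exact keys, shapes, and dtypes are
# assumptions; the real spec depends on the observation processer and the model.
def create_buffers(flags, obs_shape) -> Buffers:
    T = flags.unroll_length
    specs = dict(
        frame=dict(size=(T + 1, *obs_shape), dtype=torch.float32),
        act_mask=dict(size=(T + 1, flags.num_actions), dtype=torch.bool),
        reward=dict(size=(T + 1,), dtype=torch.float32),
        done=dict(size=(T + 1,), dtype=torch.bool),
        action=dict(size=(T + 1,), dtype=torch.int64),
        log_prob=dict(size=(T + 1,), dtype=torch.float32),
    )
    buffers: Buffers = {key: [] for key in specs}
    for _ in range(flags.num_buffers):
        for key in buffers:
            buffers[key].append(torch.empty(**specs[key]).share_memory_())
    return buffers
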
def test(flags):
    if flags.xpid is None:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" % (flags.savedir, "latest", "model.tar"))
        )
    elif ".tar" in flags.xpid:
        checkpointpath = os.path.expandvars(os.path.expanduser(flags.xpid))
    else:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
        )
    if len(flags.env.split(",")) != 1:
        raise Exception("Only one environment allowed for testing")

    # Load the original arguments of the trained network.
    flags_orig = file_writer.read_metadata(
        re.sub(r"model.*tar", "meta.json", checkpointpath).replace("/intermediate", "")
    )
    args_orig = flags_orig["args"]
    agent_type = args_orig.get("agent_type", "resnet")
    num_actions = args_orig.get("num_actions", 6)
    num_tasks = args_orig.get("num_tasks", 1)
    use_lstm = args_orig.get("use_lstm", False)
    use_popart = args_orig.get("use_popart", False)
    reward_clipping = args_orig.get("reward_clipping", "abs_one")
    frame_width = args_orig.get("frame_width", 84)
    frame_height = args_orig.get("frame_height", 84)
    aaa_input_format = args_orig.get("aaa_input_format", "gray_stack")

    # Select the right agent class.
    if agent_type.lower() in ["aaa", "attention_augmented", "attention_augmented_agent"]:
        Net = AttentionAugmentedAgent
        logging.info("Using the Attention-Augmented Agent architecture.")
        agent_type = "aaa"
    elif agent_type.lower() in ["rn", "res", "resnet", "res_net"]:
        Net = ResNet
        logging.info("Using the ResNet architecture (monobeast version).")
        agent_type = "resnet"
    else:
        Net = AtariNet
        logging.warning("No valid agent type specified. Using the default agent.")
        agent_type = "default"

    # Check whether the full action space should be used.
    full_action_space = flags.num_actions == 18

    # Create the environment.
    gym_env = create_env(
        flags.env,
        frame_height=frame_height,
        frame_width=frame_width,
        gray_scale=(agent_type != "aaa" or aaa_input_format == "gray_stack"),
        full_action_space=full_action_space,
    )
    env = environment.Environment(gym_env)

    # Create the model and load its parameters.
    model = Net(
        observation_shape=gym_env.observation_space.shape,
        num_actions=num_actions,
        num_tasks=num_tasks,
        use_lstm=use_lstm,
        use_popart=use_popart,
        reward_clipping=reward_clipping,
        rgb_last=(agent_type == "aaa" and aaa_input_format == "rgb_last"),
    )
    model.eval()
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    if "baseline.mu" not in checkpoint["model_state_dict"]:
        checkpoint["model_state_dict"]["baseline.mu"] = torch.zeros(1)
        checkpoint["model_state_dict"]["baseline.sigma"] = torch.ones(1)
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()
    returns = []
    while len(returns) < flags.num_episodes:
        if flags.mode == "test_render":
            time.sleep(0.05)
            env.gym_env.render()
        agent_outputs = model(observation)
        policy_outputs, _ = agent_outputs
        observation = env.step(policy_outputs["action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
    env.close()
    logging.info(
        "Average returns over %i episodes: %.1f",
        flags.num_episodes,
        sum(returns) / len(returns),
    )

def act(
    flags,
    env: str,
    task: int,
    full_action_space: bool,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        # Create the environment from command line parameters
        # => could also create a special one which operates on a list of games (which we need).
        gym_env = create_env(
            env,
            frame_height=flags.frame_height,
            frame_width=flags.frame_width,
            gray_scale=(flags.aaa_input_format == "gray_stack"),
            full_action_space=full_action_space,
            task=task,
        )

        # Generate a seed for the environment (no human starts here); we could just
        # use this for all games wrapped by the environment for our application.
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)

        # Wrap the environment. This is probably also the point where we could use
        # multiple games, because the underlying environment is still one from Gym.
        env = environment.Environment(gym_env)

        # Get the initial frame, reward, done, return, step, last_action.
        env_output = env.initial()

        # Perform the first step.
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)

        while True:
            # Get a buffer index from the queue of free buffers.
            index = free_queue.get()
            # A None index is the termination signal for breaking out of this loop.
            if index is None:
                break

            # Write old rollout end.
            # The keys here are (frame, reward, done, episode_return, episode_step, last_action).
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            # Here the keys are (policy_logits, baseline, action).
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            # agent_state is the RNN/LSTM state at the start of the rollout. It has
            # to be stored because the learner re-runs the model over the whole
            # unroll and needs the correct recurrent state to start from.
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                # Forward pass without keeping track of gradients to get the agent action.
                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                timings.time("model")

                # The agent acts in the environment.
                env_output = env.step(agent_output["action"])
                timings.time("step")

                # Write the outputs of the current step (see above for the list of keys).
                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time("write")

            # After finishing a rollout, put the index into the full queue
            # so that the learner can consume the data.
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e

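# The learner-side counterpart to `act`, sketched along the lines of the reference
# implementation (locking and device transfer omitted): dequeue `batch_size` filled
# buffer indices, stack them along a new batch dimension, and hand the indices back
# to the free queue for reuse.
def get_batch(flags, free_queue, full_queue, buffers, initial_agent_state_buffers):
    indices = [full_queue.get() for _ in range(flags.batch_size)]
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1)
        for key in buffers
    }
    initial_agent_state = tuple(
        torch.cat(ts, dim=1)
        for ts in zip(*[initial_agent_state_buffers[m] for m in indices])
    )
    for m in indices:
        free_queue.put(m)  # the actor can now overwrite this buffer
    return batch, initial_agent_state
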
def test(flags):
    num_episodes = flags.episodes
    if flags.xpid is None:
        checkpointpath = "./latest/model.tar"
    else:
        log_path = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, flags.env))
        )
        if not os.path.exists(log_path):
            os.mkdir(log_path)
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
        )
    gym_env = create_gymenv(flags)
    if flags.agent in ["CNN", "SNLM", "MHA"]:
        env = environment.Environment(gym_env, "image")
    elif flags.agent in ["NLM", "KBMLP", "GCN"]:
        env = environment.Environment(gym_env, "absVKB")
    model = create_model(flags, gym_env)
    model.eval()
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()
    returns = []
    total_steps = 0
    obs_index = 0
    if flags.store_stats:
        stats = dict(episode=[], total_steps=[], reward=[], action=[], obs_index=[])
        evals = [[] for _ in range(3)]
        obs = []
    while len(returns) < num_episodes:
        if flags.mode == "test_render":
            env.gym_env.render()
        agent_outputs = model(observation)
        policy_outputs = agent_outputs
        observation = env.step(policy_outputs["action"])
        if flags.store_stats:
            frame = observation["frame"].numpy()[0, 0]
            # Only store a frame (and its evaluations) if it differs from the last one.
            if not obs or np.any(obs[-1] != frame):
                if "evaluation" in policy_outputs:
                    evaluation = policy_outputs["evaluation"]
                    for i, ev in enumerate(evals):
                        ev.append(evaluation[i].detach().numpy()[0])
                obs.append(frame)
            else:
                obs_index -= 1
            stats["episode"].append(len(returns))
            stats["total_steps"].append(total_steps)
            stats["obs_index"].append(obs_index)
            stats["reward"].append(observation["reward"].numpy()[0, 0])
            stats["action"].append(policy_outputs["action"].numpy()[0, 0])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.2f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
        total_steps += 1
        obs_index += 1
    env.close()

    if flags.store_stats:
        if "evaluation" in policy_outputs:
            for i, ev in enumerate(evals):
                np.save(log_path + f"/eval-{i}-arity.npy", np.stack(ev))
        np.save(log_path + "/obs.npy", np.stack(obs))
        pd.DataFrame(stats).to_csv(log_path + "/stats.csv")

    mean = sum(returns) / len(returns)
    std = np.std(returns)
    logging.info("Average returns over %i episodes: %.2f ± %.2f", num_episodes, mean, std)
    if flags.env in ["rtfm", "rtfm-onehop"]:
        # In RTFM, an episode counts as a win if its return exceeds -1.
        wins = np.array(returns) > -1.0
        win_rate = np.mean(wins) * 100
        win_std = np.std(wins) * 100
        print(f"{mean:.2f} ± {std:.2f}, {win_rate:.2f} ± {win_std:.2f}")
    else:
        print(f"{mean:.2f} ± {std:.2f}")
    return mean, std

def act(
    flags,
    gym_env,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    actor_buffers: Buffers,
    actor_model_queues: List[mp.SimpleQueue],
    actor_env_queues: List[mp.SimpleQueue],
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        #seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        #gym_env.seed(seed)
        if flags.agent in ["CNN"]:
            env = environment.Environment(gym_env, "image")
        elif flags.agent in ["NLM", "KBMLP", "GCN"]:
            if flags.state in ["relative", "integer", "block"]:
                env = environment.Environment(gym_env, "VKB")
            elif flags.state == "absolute":
                env = environment.Environment(gym_env, "absVKB")
        env_output = env.initial()
        for key in env_output:
            actor_buffers[key][actor_index][0] = env_output[key]
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in actor_buffers:
                buffers[key][index][0] = actor_buffers[key][actor_index][0]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                # Ask the central inference process to run the model on this
                # actor's current observation, then wait for it to finish.
                actor_model_queues[actor_index].put(actor_index)
                env_info = actor_env_queues[actor_index].get()
                if env_info == "exit":
                    return
                timings.time("model")

                env_output = env.step(actor_buffers["action"][actor_index][0])
                timings.time("step")

                for key in actor_buffers:
                    buffers[key][index][t + 1] = actor_buffers[key][actor_index][0]
                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in env_output:
                    actor_buffers[key][actor_index][0] = env_output[key]
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e

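# The `act` variant above does not run the model itself: it posts its index to
# actor_model_queues and waits on actor_env_queues while a separate inference
# process fills actor_buffers["action"]. A deliberately simplified sketch of that
# process (serving one actor at a time; a real implementation would batch requests
# and handle actors that are idle between rollouts). The observation keys and the
# model call signature are assumptions.
def model_server(flags, model, actor_buffers, actor_model_queues, actor_env_queues):
    while True:
        for actor_index in range(flags.num_actors):
            actor_model_queues[actor_index].get()  # wait for this actor's request
            observation = {
                key: actor_buffers[key][actor_index]
                for key in ("frame", "reward", "done")  # assumed observation keys
            }
            with torch.no_grad():
                agent_output = model(observation)
            for key in agent_output:
                actor_buffers[key][actor_index][0] = agent_output[key]
            actor_env_queues[actor_index].put("step")  # anything but "exit" resumes the actor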