def load_trained_agent(dirpath, env_cls, env_params=None, **kwargs):
    """Load a pre-trained agent with its weights.

    Parameters
    ----------
    dirpath: str
        The path to the directory in which the model parameters and agent
        config file are stored.
    env_cls: class or str
        The environment to run the agent on. If a string is provided, it
        must be the name of a gym env.
    env_params: dict, default=None
        Key-value pairings to pass to env_cls if a class is provided.

    Returns
    -------
    agent: spyro.agents.*
        An agent object with loaded / pre-trained weights.
    """
    config_path = os.path.join(dirpath, "agent_config.json")
    agent = init_agent_from_config(config_path, **kwargs)
    agent.load_weights(os.path.join(dirpath, "model.ckpt"), env_cls=env_cls,
                       env_params=env_params)
    progress("Agent's weights loaded.")
    return agent
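
# Illustrative usage sketch for load_trained_agent. The directory layout
# ("agent_config.json" plus "model.ckpt") matches what the function expects;
# the "runs/dqn_run_1" path and "CartPole-v1" env name are assumptions made
# purely for illustration. force_no_log is forwarded via **kwargs to
# init_agent_from_config below.
def _example_load_trained_agent():
    agent = load_trained_agent("runs/dqn_run_1", "CartPole-v1",
                               force_no_log=True)
    return agent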
def __init__(self, env_cls, result_queue, task_queue=None, stop_indicator=None,
             state_processor=None, max_values=None, strategy='random',
             env_params=None, timeout=5, verbose=False):
    super().__init__()
    self.env_cls = env_cls
    self.env_params = env_params
    self.task_queue = task_queue
    self.result_queue = result_queue
    self.state_processor = state_processor
    self.stop_indicator = stop_indicator
    self.strategy = strategy
    self.max_values = max_values
    self.timeout = timeout
    self.verbose = verbose

    if self.strategy == 'tasks':
        assert task_queue is not None, "Must provide a task_queue if strategy='tasks'"
    if self.strategy != 'tasks':
        assert stop_indicator is not None, "Must provide a stop_indicator if strategy!='tasks'"
    if self.strategy == 'uniform':
        assert max_values is not None, "max_values must be provided when strategy='uniform'"

    progress("Worker initialized.", verbose=self.verbose)
def init_agent_from_config(config_path, force_no_log=False):
    """Initialize an agent based on a config file from a previous run.

    Parameters
    ----------
    config_path: str
        The path to the config JSON file.
    force_no_log: bool, default=False
        If True, sets log=False in the agent's init. Useful to prevent the
        new agent from logging in a subdirectory of the original logdir.
    """
    # load config
    with open(config_path, 'r') as f:
        config = json.load(f)

    # determine agent class
    agent_cls = AGENT_MAP[config["name"]]

    # set logging to False if specified
    if force_no_log:
        config["log"] = False
        del config["logdir"]

    # retrieve policy
    try:
        policy_config = config.pop("policy")
        has_policy = True
        policy_name = policy_config.pop("name")
        if policy_name == "EpsilonGreedyPolicy":
            del policy_config["epsilon"]
        policy = POLICY_MAP[policy_name](**policy_config)
    except KeyError:
        has_policy = False

    # retrieve memory
    try:
        memory_config = config.pop("memory")
        has_memory = True
        _ = memory_config.pop("name")
        memory = ReplayBuffer(**memory_config)
    except KeyError:
        has_memory = False

    # init agent
    if has_policy and has_memory:
        agent = agent_cls(policy, memory, **config)
    elif has_policy:
        agent = agent_cls(policy, **config)
    elif has_memory:
        agent = agent_cls(memory, **config)
    else:
        agent = agent_cls(**config)

    progress("Agent reconstructed from config.")
    return agent
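
# Minimal sketch of reconstructing an agent from a previous run without
# inheriting its logging directory. The config path is an assumption for
# illustration; force_no_log=True sets log=False and removes logdir, as
# documented above.
def _example_init_agent_from_config():
    agent = init_agent_from_config("runs/dqn_run_1/agent_config.json",
                                   force_no_log=True)
    return agent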
def evaluate_saved_agent(dirpath, env_cls, n_episodes=100000, tmax=1000,
                         policy=None, env_params=None, save=True,
                         evaluator=None):
    """Load a trained and saved agent from disk and evaluate it on a test
    environment.

    Parameters
    ----------
    dirpath: str
        The path to the directory in which the model parameters and agent
        config file are stored.
    env_cls: class or str
        The environment to evaluate on. If a string is provided, it must be
        the name of a gym env.
    n_episodes: int, default=100000
        The number of episodes to use for evaluation.
    tmax: int, default=1000
        The maximum number of steps per episode.
    policy: spyro.policies instance, default=None
        The policy to use during evaluation instead of the agent's own policy.
    env_params: dict, default=None
        Key-value pairings to pass to env_cls if a class is provided.
    save: bool, default=True
        Whether to save the results and the test log to dirpath.
    evaluator: object, default=None
        If provided, must expose an evaluate(test_log) method that is used to
        extract summary metrics from the test log.

    Returns
    -------
    results: any
        Output of agent.evaluate. Usually, this is a dictionary with
        'mean_episode_reward', 'total_episode_reward', and 'episode_length'
        as keys and numpy arrays as values.
    test_log: pd.DataFrame
        The simulation log of all tested episodes.
    summary: any
        Output of evaluator.evaluate(test_log) if an evaluator was provided,
        otherwise None.
    """
    agent = load_trained_agent(dirpath, env_cls, env_params=env_params,
                               force_no_log=True)

    progress("Start test run on {} episodes.".format(n_episodes))
    results = agent.evaluate(env_cls, n_episodes=n_episodes, tmax=tmax,
                             policy=policy, env_params=env_params)
    test_log = agent.env.get_test_log()

    if save:
        progress("Saving results to {}.".format(dirpath))
        pickle.dump(results,
                    open(os.path.join(dirpath, "test_results_dict.pkl"), "wb"))
        test_log.to_csv(os.path.join(dirpath, "test_log.csv"), index=False)

    if evaluator is not None:
        progress("Extracting metrics using the evaluator.")
        summary = evaluator.evaluate(test_log)
    else:
        summary = None

    progress("Evaluation completed.")
    return results, test_log, summary
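
# End-to-end sketch: load a saved agent, test it, and inspect the results.
# The run directory and env name are assumptions for illustration; the result
# keys follow the docstring above.
def _example_evaluate_saved_agent():
    results, test_log, summary = evaluate_saved_agent(
        "runs/dqn_run_1", "CartPole-v1", n_episodes=1000, tmax=1000)
    print("mean episode reward:", results["mean_episode_reward"].mean())
    return test_log, summary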
def _run_tasks(self):
    """Start interacting with the environment to obtain specifically
    requested experiences (tasks) and send the results to the global queue.
    """
    progress("Start performing tasks.", verbose=self.verbose)
    self._make_env()
    while True:
        try:
            task = self.task_queue.get(timeout=1)
            self.perform_task(task)
        except queue.Empty:
            progress("Empty task queue found at worker. Shutting down worker.",
                     verbose=self.verbose)
            break
def _run_randomly(self):
    """Start interacting with the environment without manipulating the state
    in-between steps and send the result of each step to the global results
    queue.
    """
    progress("Start obtaining experiences.", verbose=self.verbose)
    self._make_env()

    while self.stop_indicator.value != 1:
        # start episode by resetting env
        state = self.state_processor(self.env.reset())
        done = False

        # gather experiences until episode end
        while not done:
            response, target = self.env._simulate()

            if (response is not None) and (response != np.inf):
                try:
                    self.result_queue.put(
                        {"state": state, "response": response, "target": target},
                        block=True, timeout=self.timeout)
                except queue.Full:
                    progress("Queue has been full for {} seconds. Breaking."
                             .format(self.timeout), verbose=self.verbose)
                    break

            raw_state, done = self.env._extract_state(
                self.env._get_available_vehicles())
            state = self.state_processor(raw_state)
def merge_tables(*tables, to_quantiles=True, key="responses", num_quantiles=51,
                 save_path=None):
    """Merge multiple tables into one big one.

    Parameters
    ----------
    *tables: dicts
        The tables to merge. Should all have the same set of keys / states.
    to_quantiles: bool, default=True
        Whether to calculate quantiles over the obtained values rather than
        keep the raw ones.
    key: str, default="responses"
        If to_quantiles=True, the key in the inner dictionary that points to
        the array over which to compute quantiles. If to_quantiles=False, key
        is the (list of) keys for which arrays of different tables should be
        appended.
    num_quantiles: int, default=51
        The number of quantiles to compute when to_quantiles=True.
    save_path: str, default=None
        The path to save the resulting table. If None, does not save.

    Returns
    -------
    merged_table: dict
        The merged table.
    """
    assert len(tables) > 1, "Must provide more than one table"
    assert all(set(t.keys()) == set(tables[0].keys()) for t in tables[1:]), \
        "Keys are not the same for all tables"

    merged = tables[0]
    for i, state in enumerate(tables[0].keys()):
        progress("Merging results for state {} / {}.".format(i + 1, len(merged)),
                 same_line=True, newline_end=(i + 1 == len(merged)))
        merged[state] = append_arrays_in_dicts(
            *[t[state] for t in tables],
            keys=key if isinstance(key, list) else [key])

    if to_quantiles:
        progress("Obtaining quantiles for '{}'.".format(key))
        merged = get_table_quantiles(merged, num_quantiles=num_quantiles,
                                     inner_key=key)

    if save_path is not None:
        pickle.dump(merged, open(save_path, "wb"))
        progress("Merged table saved at {}.".format(save_path))

    return merged
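
# Sketch of merging two pickled experience tables into quantile summaries.
# The file names are assumptions for illustration; the keys and defaults
# follow the docstring above.
def _example_merge_tables():
    with open("table_run1.pkl", "rb") as f1, open("table_run2.pkl", "rb") as f2:
        t1, t2 = pickle.load(f1), pickle.load(f2)
    return merge_tables(t1, t2, to_quantiles=True, key="responses",
                        num_quantiles=51, save_path="table_merged.pkl")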
def gather_random_experiences(self, env_cls, total_steps=50000000, start_step=0,
                              env_params=None, strategy='random', timeout=3):
    """Collect random experiences from parallel workers.

    Parameters
    ----------
    env_cls: Python class
        The environment to train on.
    total_steps: int, default=50000000
        The total number of experiences to gather.
    start_step: int, default=0
        The global step count to start from, e.g., when resuming a previous
        run.
    env_params: dict, default=None
        Parameters passed to env_cls upon initialization.
    strategy: str, default='random'
        The experience-gathering strategy passed to the workers.
    timeout: int, default=3
        The maximum time to wait for an item in the results queue if it is
        empty.
    """
    self.stop_indicator = mp.Value("i", 0)
    self.global_counter = start_step
    total_steps = total_steps + start_step
    self.result_queue = mp.Queue(self.max_queue_size)

    # initialize workers
    workers = [
        ExperienceGatheringProcess(
            env_cls, self.result_queue, stop_indicator=self.stop_indicator,
            env_params=env_params, state_processor=self.state_processor,
            max_values=self.max_values, strategy=strategy)
        for _ in range(self.num_workers)
    ]

    for worker in workers:
        worker.start()

    # wait for workers to start delivering
    time.sleep(5)

    try:
        while True:
            try:
                experience = self.result_queue.get(block=True, timeout=timeout)
                self.process_random_experience(experience)
                self.global_counter += 1
                progress("Processed {} / {} experiences".format(
                             self.global_counter, total_steps),
                         same_line=True, newline_end=False,
                         verbose=self.global_counter % 1000 == 0)
            except queue.Empty:
                progress("\nQueue is empty. Breaking loop.",
                         verbose=self.verbose)
                break

            if self.global_counter >= total_steps:
                if self.stop_indicator.value == 0:
                    with self.stop_indicator.get_lock():
                        self.stop_indicator.value = 1
                    progress("\nSent stop signal to workers. "
                             "Processing last results in queue.",
                             verbose=self.verbose)
    except KeyboardInterrupt:
        progress("KeyboardInterrupt: sending stop signal and waiting for workers.",
                 verbose=self.verbose)
        with self.stop_indicator.get_lock():
            self.stop_indicator.value = 1

    for worker in workers:
        if worker.is_alive():
            worker.join()

    progress("Workers stopped gracefully.", verbose=self.verbose)
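
# Sketch of a random-gathering run. The `estimator` argument stands in for an
# instance of the class that owns this method (its name is not shown in this
# excerpt), and FireCommanderV2 is the environment that the perform_tasks
# docstring below mentions.
def _example_gather_random_experiences(estimator, env_cls):
    estimator.gather_random_experiences(env_cls, total_steps=100000,
                                        start_step=0, strategy='random',
                                        timeout=3)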
def perform_tasks(self, env_cls, reps=100, env_params=None, timeout=10,
                  debug_subset=None):
    """Gather experiences for a predefined set of tasks.

    Parameters
    ----------
    env_cls: Python class
        The environment to gather experiences from. This class was designed
        for FireCommanderV2, but similar environments might work as well.
    reps: int, default=100
        The number of repetitions/experiences to gather for each state.
    env_params: dict, default=None
        Key-value pairs passed to env_cls.
    timeout: int, default=10
        The maximum time to wait for workers to produce results. After
        timeout seconds, the main process stops getting results from the
        queue and wraps up the other processes.
    debug_subset: any, default=None
        Passed through to self.define_tasks, e.g., to restrict the set of
        tasks while debugging.
    """
    # define tasks and put them in a global queue
    tasks = self.define_tasks(reps=reps, debug_subset=debug_subset)
    self.global_counter = 0
    self.num_tasks = len(tasks)
    self.task_queue = mp.Queue()
    self.result_queue = mp.Queue()
    _ = list(map(self.task_queue.put, tasks))
    progress("Put {} tasks in Queue (queue length: {})".format(
        self.num_tasks, self.task_queue.qsize()), verbose=self.verbose)

    # initialize workers
    workers = [
        ExperienceGatheringProcess(
            env_cls, self.result_queue, task_queue=self.task_queue,
            env_params=env_params, state_processor=self.state_processor,
            strategy='tasks')
        for _ in range(self.num_workers)
    ]

    for worker in workers:
        worker.start()

    try:
        while True:
            try:
                performed_task = self.result_queue.get(block=True,
                                                       timeout=timeout)
                self.process_performed_task(performed_task)
                self.global_counter += 1
                progress("Performed {} / {} tasks".format(
                             self.global_counter, self.num_tasks),
                         same_line=True, newline_end=False,
                         verbose=self.verbose)
            except queue.Empty:
                progress("\nQueue is empty. Breaking loop.",
                         verbose=self.verbose)
                break
    except KeyboardInterrupt:
        pass

    for worker in workers:
        if worker.is_alive():
            worker.join()
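
# Companion sketch for the task-based strategy: gather a fixed number of
# repetitions per state (pass debug_subset to restrict the tasks while
# debugging). As above, `estimator` stands in for an instance of the owning
# class.
def _example_perform_tasks(estimator, env_cls):
    estimator.perform_tasks(env_cls, reps=100, timeout=10, debug_subset=None)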
def evaluate(self, env_cls, n_episodes=10000, tmax=None, policy=None,
             env_params=None, init=False):
    """Evaluate the agent on an environment without training."""
    if policy is not None:
        self.eval_policy = policy
    else:
        self.eval_policy = self.policy

    if tmax is None:
        self.tmax = 1000000
    else:
        self.tmax = tmax

    self.env = make_env(env_cls, env_params)
    self.action_shape, self.n_actions = get_space_shape(self.env.action_space)
    self.obs_shape, _ = get_space_shape(self.env.observation_space)
    print("Environment initialized.")

    if init:
        tf.reset_default_graph()
        self._init_graph()
        print("Graph created.")

    self.episode_counter = 0
    self.step_counter = 0
    self.done = True

    self.eval_results = {
        "total_episode_reward": np.zeros(n_episodes),
        "mean_episode_reward": np.zeros(n_episodes),
        "episode_length": np.zeros(n_episodes),
    }

    for ep in range(n_episodes):
        self.state = np.asarray(self.env.reset(), dtype=np.float64)
        self.episode_step_counter = 0
        self.episode_reward = 0

        for i in range(self.tmax):
            # predict action probabilities pi(a|s)
            action_probabilities = self.session.run(
                self.action_probs,
                feed_dict={self.state_ph: np.reshape(self.state, (1, -1))}
            )

            # select and perform action
            self.action = self.eval_policy.select_action(
                action_probabilities.reshape(-1))
            new_state, self.reward, self.done, _ = self.env.step(self.action)

            # bookkeeping
            self.step_counter += 1
            self.episode_reward += self.reward
            self.episode_step_counter += 1
            self.state = np.asarray(copy.copy(new_state), dtype=np.float64)

            # end of episode
            if self.done:
                break

        self.eval_results["total_episode_reward"][ep] = self.episode_reward
        self.eval_results["mean_episode_reward"][ep] = \
            self.episode_reward / self.episode_step_counter
        self.eval_results["episode_length"][ep] = self.episode_step_counter
        progress("Completed episode {}/{}".format(ep + 1, n_episodes),
                 same_line=(ep > 0), newline_end=(ep + 1 == n_episodes))

    return self.eval_results
def evaluate(self, env_cls, n_episodes=10000, tmax=None, policy=None,
             env_params=None, init=False):
    """Evaluate the agent on an environment without training.

    Parameters
    ----------
    env_cls: uninitialized Python class or str
        The environment to evaluate on. If a class is provided, it must be
        uninitialized. Parameters can be passed to the environment using
        env_params. If a string is provided, this string is fed to
        `gym.make()` to create the environment.
    n_episodes: int, optional, default=10000
        The number of episodes to run.
    tmax: int, optional, default=None
        The maximum number of steps to run in each episode. If None, set to
        10,000, which effectively imposes no limit in most environments.
    policy: spyro.policies instance, default=None
        The policy to use during evaluation if it is not the same as during
        training.
    env_params: dict, optional, default=None
        Dictionary of parameter values to pass to `env_cls` upon
        initialization.
    init: bool, default=False
        Whether to (re-)initialize the network (True) or to keep the current
        neural network parameters (False).
    """
    if policy is not None:
        self.eval_policy = policy
    else:
        self.eval_policy = self.policy

    if tmax is None:
        self.tmax = 10000
    else:
        self.tmax = tmax

    self.env = make_env(env_cls, env_params)
    self.action_shape, self.n_actions, self.obs_shape, _ = \
        obtain_env_information(env_cls, env_params)

    if init:
        tf.reset_default_graph()
        self._init_graph()

    self.episode_counter = 0
    self.step_counter = 0
    self.done = True

    self.eval_results = {
        "total_episode_reward": np.zeros(n_episodes),
        "mean_episode_reward": np.zeros(n_episodes),
        "episode_length": np.zeros(n_episodes),
    }

    for ep in range(n_episodes):
        self.state = np.asarray(self.env.reset(), dtype=np.float64)
        self.episode_step_counter = 0
        self.episode_reward = 0

        for i in range(self.tmax):
            # predict Q-values Q(s, a)
            qvalues = self.session.run(
                self.online_qvalues,
                feed_dict={self.states_ph: np.reshape(self.state, (1, -1))}
            )

            # select and perform action
            self.action = self.eval_policy.select_action(qvalues.reshape(-1))
            new_state, self.reward, self.done, _ = self.env.step(self.action)

            # bookkeeping
            self.step_counter += 1
            self.episode_reward += self.reward
            self.episode_step_counter += 1
            self.state = np.asarray(copy.copy(new_state), dtype=np.float64)

            # end of episode
            if self.done:
                break

        self.eval_results["total_episode_reward"][ep] = self.episode_reward
        self.eval_results["mean_episode_reward"][ep] = \
            self.episode_reward / self.episode_step_counter
        self.eval_results["episode_length"][ep] = self.episode_step_counter
        progress("Completed episode {}/{}".format(ep + 1, n_episodes),
                 same_line=(ep > 0), newline_end=(ep + 1 == n_episodes))

    return self.eval_results
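
# Sketch of an evaluation call on the DQN-style agent above. "CartPole-v1"
# is an assumption for illustration; the result keys match eval_results.
def _example_evaluate_agent(agent):
    results = agent.evaluate("CartPole-v1", n_episodes=100, tmax=500)
    return results["mean_episode_reward"].mean()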
def evaluate(self, env_cls, n_episodes=10000, tmax=None, env_params=None):
    """Evaluate the agent on an environment without training.

    Parameters
    ----------
    env_cls: uninitialized Python class or str
        The environment to evaluate on. If a class is provided, it must be
        uninitialized. Parameters can be passed to the environment using
        env_params. If a string is provided, this string is fed to
        `gym.make()` to create the environment.
    n_episodes: int, optional, default=10000
        The number of episodes to run.
    tmax: int, optional, default=None
        The maximum number of steps to run in each episode. If None, set to
        10,000, which effectively imposes no limit in most environments.
    env_params: dict, optional, default=None
        Dictionary of parameter values to pass to `env_cls` upon
        initialization.
    """
    if tmax is None:
        self.tmax = 10000
    else:
        self.tmax = tmax

    self.env = make_env(env_cls, env_params)
    self.action_shape, self.n_actions, self.obs_shape, _ = \
        obtain_env_information(env_cls, env_params)

    self.episode_counter = 0
    self.step_counter = 0
    self.done = True

    self.eval_results = {
        "total_episode_reward": np.zeros(n_episodes),
        "mean_episode_reward": np.zeros(n_episodes),
        "episode_length": np.zeros(n_episodes),
    }

    seen_states = {}
    for ep in range(n_episodes):
        self.state = np.asarray(self.env.reset(), dtype=np.int16)
        self.episode_step_counter = 0
        self.episode_reward = 0

        for i in range(self.tmax):
            # get relocations from dictionary if problem was solved before,
            # otherwise solve it and save the results for next time
            state_key = tuple(extract_vehicles_from_state(self.state))
            try:
                relocations = seen_states[state_key]
            except KeyError:
                relocations = self.get_relocations(self.state)
                seen_states[state_key] = relocations

            # get origin if current destination is in the relocations
            to_from = {d['to']: d['from'] for d in relocations.values()}
            destination_area = extract_current_destination_area(self.state)
            origin_area = to_from.get(destination_area)

            # select and perform action
            self.action = area_to_action(origin_area)
            new_state, self.reward, self.done, _ = self.env.step(self.action)

            # bookkeeping
            self.step_counter += 1
            self.episode_reward += self.reward
            self.episode_step_counter += 1
            self.state = np.asarray(copy.copy(new_state), dtype=np.int16)

            # end of episode
            if self.done:
                break

        self.eval_results["total_episode_reward"][ep] = self.episode_reward
        self.eval_results["mean_episode_reward"][ep] = \
            self.episode_reward / self.episode_step_counter
        self.eval_results["episode_length"][ep] = self.episode_step_counter
        progress("Completed episode {}/{}".format(ep + 1, n_episodes),
                 same_line=(ep > 0), newline_end=(ep + 1 == n_episodes))

    return self.eval_results
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from spyro.utils import progress
from spyro.value_estimation import STATION_NAMES

try:
    from fdsim.helpers import lonlat_to_xy
except ImportError:
    progress("fdsim not installed, some functions might not work.")

try:
    import geopandas as gpd
except ImportError:
    progress("geopandas not installed, some functions might not work.")


def set_sns_params(font_scale=1.2, **kwargs):
    sns.set(font_scale=font_scale, **kwargs)


def quantile_range(num_quantiles=50):
    """Generate evenly spaced values in (0, 1) that can be used as
    quantile-positions.

    Parameters
    ----------
    num_quantiles: int, default=50
        The number of quantile-positions to generate.
    """