def load_weights(self, path, env_cls=None, env_params=None):
    """Load previously fitted weights."""
    if env_cls is not None:
        self.action_shape, self.n_actions, self.obs_shape, _ = \
            obtain_env_information(env_cls, env_params)

    tf.reset_default_graph()
    self._init_graph()
    self.saver = tf.train.Saver()
    self.saver.restore(self.session, path)
    print("Model restored from {}.".format(path))
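# A minimal usage sketch for restoring a model (illustrative: the agent
# construction, checkpoint path, and gym id are assumptions, not part of
# this module):
#
#     agent = A3CAgent(policy)  # `policy`: a spyro.policies instance
#     agent.load_weights("./models/a3c.ckpt", env_cls="CartPole-v0")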
def run(self):
    """Start training on the environment and send updates to the global
    agent."""
    self.action_shape, self.n_actions, self.obs_shape, _ = \
        obtain_env_information(self.env_cls, self.env_params)
    self._init_graph()
    self.env = make_env(self.env_cls, self.env_params)
    self.done = True  # force reset of the environment
    self.local_episode_counter = 0

    while self.global_T.value < self.total_steps:
        self.run_session()
        self.local_episode_counter += 1
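# The worker reads the global weights from shared memory before interacting
# with the environment. A self-contained sketch of that mechanism (assumes
# float64 weights, matching the RawArray("d", ...) buffers created in `fit`):
#
#     import multiprocessing as mp
#     import numpy as np
#
#     w = np.random.rand(4, 3)                  # a "global" weight matrix
#     shared = mp.RawArray("d", w.reshape(-1))  # flat shared buffer
#     view = np.frombuffer(shared, dtype=np.float64).reshape(w.shape)
#     np.copyto(view, w + 1.0)                  # global agent writes updates
#     local = np.array(view)                    # worker copies them out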
def fit(self, env_cls, env_params=None, tmax=32, total_steps=50000,
        n_workers=-1, restart=True, verbose=True):
    """Train the A3C Agent on the environment.

    Parameters
    ----------
    env_cls: uninitialized Python class or str
        The environment to train on. If a class is provided, it must be
        uninitialized, so that it can be initialized in each worker agent.
        This is because not all environments are picklable and can
        therefore not be sent to different Python processes. If a string
        is provided, this string is fed to `gym.make()` to create the
        environment.
    env_params: dict, optional, default=None
        Dictionary of parameter values to pass to `env_cls` upon
        initialization.
    tmax: int, optional, default=32
        The maximum number of steps for a worker to run before calculating
        gradients and updating the global model if the episode is not over
        earlier.
    total_steps: int, optional, default=50,000
        The total number of training steps of all workers together.
    n_workers: int, optional, default=-1
        The number of worker processes to use. If set to -1, uses all but
        one of the available cores.
    restart: boolean, default=True
        Whether to (re-)initialize the network (True) or to keep the
        current neural network parameters (False).
    verbose: boolean, default=True
        Whether to print progress information during training.
    """
    if env_params is None:
        env_params = {}
    elif not isinstance(env_params, dict):
        raise ValueError("env_params must be either a dict or None, got {}"
                         .format(type(env_params)))

    self.tmax = tmax

    # obtain action and observation space information
    self.action_shape, self.n_actions, self.obs_shape, _ = \
        obtain_env_information(env_cls, env_params)

    # init graph, continue training, or use loaded graph
    if restart:
        self.total_steps = total_steps
        self._init_graph()
    else:
        try:
            self.total_steps += total_steps
        except AttributeError:  # model was loaded, steps start from zero
            self.total_steps = total_steps

    # determine number of workers
    if n_workers == -1:
        n_workers = mp.cpu_count() - 1
        print("CPU count: {}".format(mp.cpu_count()))

    # create multiprocessing communication objects
    global_queue = mp.Queue(self.max_queue_size)
    global_T = mp.Value("i", 0)

    # expose the global weights to the workers through shared memory
    initial_weights = self.get_weights()
    weight_shapes = [w.shape for w in initial_weights]
    self.weight_shapes = weight_shapes
    print("Shapes of trainable weights: {}".format(weight_shapes))
    shared_weights = [mp.RawArray("d", w.reshape(-1))
                      for w in initial_weights]
    numpy_weights = [np.frombuffer(w, dtype=np.float64).reshape(weight_shapes[i])
                     for i, w in enumerate(shared_weights)]

    # create the worker agents
    agents = [
        A3CWorker(env_cls, global_T, global_queue, shared_weights,
                  weight_shapes, self.policy, tmax=self.tmax,
                  total_steps=total_steps, name="A3C_Worker_{}".format(i),
                  beta=self.beta, gamma=self.gamma, returns=self.return_func,
                  td_steps=self.td_steps, learning_rate=self.learning_rate,
                  logdir=self.logdir, env_params=env_params,
                  **self.model_params)
        for i in range(n_workers)
    ]

    # start training
    for agent in agents:
        agent.start()
    print("Workers started.")

    try:
        # receive and apply gradients while running
        while True:
            try:
                message = global_queue.get(block=True, timeout=3)
            except queue.Empty:
                if global_T.value >= total_steps:
                    break
            else:
                if isinstance(message, bytes):
                    # received gradients: apply them and write the updated
                    # weights to shared memory for the workers to read
                    gradients = pickle.loads(message)
                    self.apply_gradients(gradients)
                    new_weights = self.get_weights()
                    for i in range(len(new_weights)):
                        np.copyto(numpy_weights[i], new_weights[i])
                    print("\rGlobal steps: {}".format(global_T.value), end="")
                elif isinstance(message, str):
                    print(message)
                else:
                    print("Queue received unidentified message of type {}"
                          .format(type(message)))
    except KeyboardInterrupt:
        global_T.value = total_steps  # make workers think we are done

    for agent in agents:
        agent.join()
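# Training usage sketch (illustrative: agent construction and parameter
# values are assumptions; any gym id works where a string is accepted):
#
#     agent = A3CAgent(policy)  # `policy`: a spyro.policies instance
#     agent.fit("CartPole-v0", tmax=32, total_steps=100000, n_workers=4)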
def evaluate(self, env_cls, n_episodes=10000, tmax=None, policy=None,
             env_params=None, init=False):
    """Evaluate the agent on an environment without training.

    Parameters
    ----------
    env_cls: uninitialized Python class or str
        The environment to train on. If a class is provided, it must be
        uninitialized. Parameters can be passed to the environment using
        env_params. If a string is provided, this string is fed to
        `gym.make()` to create the environment.
    n_episodes: int, optional, default=10,000
        The number of episodes to run.
    tmax: int, optional, default=None
        The maximum number of steps to run in each episode. If None, set
        to 10,000 to not enforce a limit in most environments.
    policy: spyro.policies instance, default=None
        The policy to use during evaluation if it is not the same as
        during training.
    env_params: dict, optional, default=None
        Dictionary of parameter values to pass to `env_cls` upon
        initialization.
    init: boolean, default=False
        Whether to (re-)initialize the network (True) or to keep the
        current neural network parameters (False).
    """
    if policy is not None:
        self.eval_policy = policy
    else:
        self.eval_policy = self.policy

    if tmax is None:
        self.tmax = 10000
    else:
        self.tmax = tmax

    self.env = make_env(env_cls, env_params)
    self.action_shape, self.n_actions, self.obs_shape, _ = \
        obtain_env_information(env_cls, env_params)

    if init:
        tf.reset_default_graph()
        self._init_graph()

    self.episode_counter = 0
    self.step_counter = 0
    self.done = True

    self.eval_results = {
        "total_episode_reward": np.zeros(n_episodes),
        "mean_episode_reward": np.zeros(n_episodes),
        "episode_length": np.zeros(n_episodes),
    }

    for ep in range(n_episodes):
        self.state = np.asarray(self.env.reset(), dtype=np.float64)
        self.episode_step_counter = 0
        self.episode_reward = 0

        for i in range(self.tmax):
            # predict Q-values Q(s, a)
            qvalues = self.session.run(
                self.online_qvalues,
                feed_dict={self.states_ph: np.reshape(self.state, (1, -1))}
            )

            # select and perform action
            self.action = self.eval_policy.select_action(qvalues.reshape(-1))
            new_state, self.reward, self.done, _ = self.env.step(self.action)

            # bookkeeping
            self.step_counter += 1
            self.episode_reward += self.reward
            self.episode_step_counter += 1
            self.state = np.asarray(copy.copy(new_state), dtype=np.float64)

            # end of episode
            if self.done:
                break

        self.eval_results["total_episode_reward"][ep] = self.episode_reward
        self.eval_results["mean_episode_reward"][ep] = \
            self.episode_reward / self.episode_step_counter
        self.eval_results["episode_length"][ep] = self.episode_step_counter

        progress("Completed episode {}/{}".format(ep + 1, n_episodes),
                 same_line=(ep > 0), newline_end=(ep + 1 == n_episodes))

    return self.eval_results
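# Evaluation usage sketch (illustrative: assumes a fitted or restored agent):
#
#     results = agent.evaluate("CartPole-v0", n_episodes=100, tmax=500)
#     print(results["mean_episode_reward"].mean())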
def fit(self, env_cls, total_steps=4e7, warmup_steps=10000, tmax=None,
        env_params=None, restart=True):
    """Train the agent on a given environment.

    Parameters
    ----------
    env_cls: uninitialized Python class or str
        The environment to train on. If a class is provided, it must be
        uninitialized. Parameters can be passed to the environment using
        env_params. If a string is provided, this string is fed to
        `gym.make()` to create the environment.
    total_steps: int, optional, default=4e7
        The total number of training steps.
    warmup_steps: int, default=10000
        The number of steps to take in the environment before starting to
        train the network. This is used to build up a replay buffer with
        varying experiences to ensure uncorrelated samples from the start
        of training.
    tmax: int, optional, default=None
        The maximum number of steps to run in a single trial if the
        episode is not over earlier. After tmax steps, all end-of-episode
        tasks will be performed.
    env_params: dict, optional, default=None
        Dictionary of parameter values to pass to `env_cls` upon
        initialization.
    restart: boolean, default=True
        Whether to (re-)initialize the network (True) or to keep the
        current neural network parameters (False).
    """
    self.warmup_steps = warmup_steps
    self.tmax = tmax
    self.env = make_env(env_cls, env_params)
    self.action_shape, self.n_actions, self.obs_shape, _ = \
        obtain_env_information(env_cls, env_params)

    if restart:
        tf.reset_default_graph()
        self.total_steps = total_steps
        self._init_graph()
        if self.use_target_network:
            self.hard_update_target_network()
        self.episode_counter = 0
        self.step_counter = 0
        self.done = True
    else:
        try:
            self.total_steps += total_steps
        except AttributeError:
            # counters do not exist, graph must be restored from disk;
            # recreate the counters like on a normal restart
            self.total_steps = total_steps
            self.step_counter = 0
            self.done = True
            self.episode_counter = 0

    while self.step_counter < self.total_steps:
        self.run_session()
        self.episode_counter += 1
        print("\rSteps completed: {}/{}".format(
            self.step_counter, self.total_steps), end="")
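# Continuation sketch: with restart=False the current network parameters are
# kept and `total_steps` is added to the existing budget (values illustrative):
#
#     agent.fit("CartPole-v0", total_steps=200000)                 # fresh run
#     agent.fit("CartPole-v0", total_steps=100000, restart=False)  # continue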
def evaluate(self, env_cls, n_episodes=10000, tmax=None, env_params=None):
    """Evaluate the agent on an environment without training.

    Parameters
    ----------
    env_cls: uninitialized Python class or str
        The environment to train on. If a class is provided, it must be
        uninitialized. Parameters can be passed to the environment using
        env_params. If a string is provided, this string is fed to
        `gym.make()` to create the environment.
    n_episodes: int, optional, default=10,000
        The number of episodes to run.
    tmax: int, optional, default=None
        The maximum number of steps to run in each episode. If None, set
        to 10,000 to not enforce a limit in most environments.
    env_params: dict, optional, default=None
        Dictionary of parameter values to pass to `env_cls` upon
        initialization.
    """
    if tmax is None:
        self.tmax = 10000
    else:
        self.tmax = tmax

    self.env = make_env(env_cls, env_params)
    self.action_shape, self.n_actions, self.obs_shape, _ = \
        obtain_env_information(env_cls, env_params)

    self.episode_counter = 0
    self.step_counter = 0
    self.done = True

    self.eval_results = {
        "total_episode_reward": np.zeros(n_episodes),
        "mean_episode_reward": np.zeros(n_episodes),
        "episode_length": np.zeros(n_episodes),
    }

    seen_states = {}
    for ep in range(n_episodes):
        self.state = np.asarray(self.env.reset(), dtype=np.int16)
        self.episode_step_counter = 0
        self.episode_reward = 0

        for i in range(self.tmax):
            # get relocations from the cache if this state was solved
            # before, otherwise solve it and save the result for next time
            vehicles = tuple(extract_vehicles_from_state(self.state))
            try:
                relocations = seen_states[vehicles]
            except KeyError:
                relocations = self.get_relocations(self.state)
                seen_states[vehicles] = relocations

            # look up the origin if the current destination is in the
            # relocations; None if it is not
            to_from = {d["to"]: d["from"] for d in relocations.values()}
            destination_area = extract_current_destination_area(self.state)
            origin_area = to_from.get(destination_area)

            # select and perform action
            self.action = area_to_action(origin_area)
            new_state, self.reward, self.done, _ = self.env.step(self.action)

            # bookkeeping
            self.step_counter += 1
            self.episode_reward += self.reward
            self.episode_step_counter += 1
            self.state = np.asarray(copy.copy(new_state), dtype=np.int16)

            # end of episode
            if self.done:
                break

        self.eval_results["total_episode_reward"][ep] = self.episode_reward
        self.eval_results["mean_episode_reward"][ep] = \
            self.episode_reward / self.episode_step_counter
        self.eval_results["episode_length"][ep] = self.episode_step_counter

        progress("Completed episode {}/{}".format(ep + 1, n_episodes),
                 same_line=(ep > 0), newline_end=(ep + 1 == n_episodes))

    return self.eval_results
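# Sketch of the relocation lookup used above (the {'from': ..., 'to': ...}
# layout is taken from this method; area ids are illustrative):
#
#     relocations = {0: {"from": "A", "to": "B"}, 1: {"from": "C", "to": "D"}}
#     to_from = {d["to"]: d["from"] for d in relocations.values()}
#     to_from.get("B")   # -> "A"; None when the destination was not relocated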