Example #1
    def load_weights(self, path, env_cls=None, env_params=None):
        """Load previously fitted weights."""
        if env_cls is not None:
            self.action_shape, self.n_actions, self.obs_shape, _ = \
                obtain_env_information(env_cls, env_params)
        tf.reset_default_graph()
        self._init_graph()
        self.saver = tf.train.Saver()
        self.saver.restore(self.session, path)
        print("Model restored from {}.".format(path))
Example #2
    def run(self):
        """Start training on the environment and sending updates to the global agent."""
        self.action_shape, self.n_actions, self.obs_shape, _  = \
                obtain_env_information(self.env_cls, self.env_params)
        self._init_graph()
        self.env = make_env(self.env_cls, self.env_params)
        self.done = True  # force reset of the environment

        self.local_episode_counter = 0
        while self.global_T.value < self.total_steps:
            self.run_session()
            self.local_episode_counter += 1
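The loop above polls the shared step counter `global_T` (an `mp.Value` created once in the parent process, see Example #3) and stops when the combined step count of all workers reaches `total_steps`. A minimal sketch of that counter follows; how exactly `run_session()` increments it is an assumption.

# Sketch of the shared step counter the worker loop polls. global_T is created
# once in the parent process; the increment below is an assumption about what
# run_session() does for every environment step.
import multiprocessing as mp

global_T = mp.Value("i", 0)        # integer shared between all worker processes

def count_step(global_T):
    with global_T.get_lock():      # lock so concurrent workers do not lose updates
        global_T.value += 1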
Example #3
    def fit(self, env_cls, env_params=None, tmax=32, total_steps=50000, n_workers=-1,
            restart=True, verbose=True):
        """Train the A3C Agent on the environment.

        Parameters
        ----------
        env_cls: uninitialized Python class or str
            The environment to train on. If a class is provided, it must be uninitialized,
            so that it can be initialized in each worker agent. This is necessary because
            not all environments are picklable and can therefore not be sent to other
            Python processes. If a string is provided, it is fed to `gym.make()` to create
            the environment.
        env_params: dict, optional, default=None
            Dictionary of parameter values to pass to `env_cls` upon initialization.
        tmax: int, optional, default=32
            The maximum number of steps for a worker to run before computing gradients
            and updating the global model if the episode does not end earlier.
        total_steps: int, optional, default=50,000
            The total number of training steps, summed over all workers.
        n_workers: int, optional, default=-1
            The number of worker processes to use. If set to -1, uses all but one of
            the available cores.
        restart: boolean, default=True
            Whether to (re-)initialize the network (True) or to keep the current neural
            network parameters (False).
        """
        if env_params is None:
            env_params = {}
        elif not isinstance(env_params, dict):
            raise ValueError("env_params must be either a dict or None, got {}"
                             .format(type(env_params)))

        self.tmax = tmax

        # obtain action and observation space information
        self.action_shape, self.n_actions, self.obs_shape, _  = \
                obtain_env_information(env_cls, env_params)

        # init graph, continue training, or use loaded graph
        if restart:
            self.total_steps = total_steps
            self._init_graph()
        else:
            try:
                self.total_steps += total_steps
            except AttributeError:
                # model was loaded, steps start from zero
                self.total_steps = total_steps

        # determine number of workers
        if n_workers == -1:
            n_workers = mp.cpu_count() - 1
            print("CPU count: {}".format(mp.cpu_count()))

        # create multiprocessing communication objects
        global_queue = mp.Queue(self.max_queue_size)
        global_T = mp.Value("i", 0)
        initial_weights = self.get_weights()
        weight_shapes = [w.shape for w in initial_weights]
        self.weight_shapes = weight_shapes
        print("Shapes of trainable weights: {}".format(weight_shapes))
        shared_weights = [mp.RawArray("d", w.reshape(-1)) for w in initial_weights]
        numpy_weights = [np.frombuffer(w, dtype=np.float64).reshape(weight_shapes[i])
                         for i, w in enumerate(shared_weights)]

        # create the worker agents
        agents = [
            A3CWorker(env_cls, global_T, global_queue, shared_weights, weight_shapes,
                      self.policy, tmax=self.tmax, total_steps=total_steps,
                      name="A3C_Worker_{}".format(i), beta=self.beta,
                      gamma=self.gamma, returns=self.return_func, td_steps=self.td_steps,
                      learning_rate=self.learning_rate, logdir=self.logdir,
                      env_params=env_params, **self.model_params)
            for i in range(n_workers)
        ]

        # start training
        for agent in agents:
            agent.start()

        print("Workers started.")
        try:
            # receive and apply gradients while running
            while True:
                try:
                    message = global_queue.get(block=True, timeout=3)
                except queue.Empty:
                    if global_T.value >= total_steps:
                        break
                else:
                    if isinstance(message, bytes):
                        # received gradients, apply them
                        gradients = pickle.loads(message)
                        self.apply_gradients(gradients)
                        new_weights = self.get_weights()
                        for i in range(len(new_weights)):
                            np.copyto(numpy_weights[i], new_weights[i])
                        print("\rGlobal steps: {}".format(global_T.value), end="")
                    elif isinstance(message, str):
                        print(message)
                    else:
                        print("Queue received unidentified message of type {}"
                              .format(type(message)))
        except KeyboardInterrupt:
            global_T.value = total_steps  # make the workers think training is done

        for agent in agents:
            agent.join()
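The shared-weight mechanism above allocates an `mp.RawArray` per trainable tensor and wraps it with `np.frombuffer`, so that `np.copyto` writes the latest global weights straight into shared memory. A worker can read the same buffers back with the same pattern; in the sketch below, the `set_weights` call is an assumption about the worker API.

# Worker-side view of the shared weights written with np.copyto above.
# shared_weights (mp.RawArray buffers) and weight_shapes are passed to each
# A3CWorker in its constructor; set_weights() is an assumed worker method.
import numpy as np

def read_shared_weights(shared_weights, weight_shapes):
    # np.frombuffer creates zero-copy views on the shared buffers
    return [np.frombuffer(buf, dtype=np.float64).reshape(shape)
            for buf, shape in zip(shared_weights, weight_shapes)]

# inside a worker, before collecting new experience (hypothetical):
# self.set_weights([w.copy() for w in read_shared_weights(self.shared_weights,
#                                                         self.weight_shapes)])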
Example #4
    def evaluate(self,
                 env_cls,
                 n_episodes=10000,
                 tmax=None,
                 policy=None,
                 env_params=None,
                 init=False):
        """Evaluate the agent on an environemt without training.

        Parameters
        ----------
        env_cls: uninitialized Python class or str
            The environment to evaluate on. If a class is provided, it must be uninitialized.
            Parameters can be passed to the environment using env_params. If a string
            is provided, this string is fed to `gym.make()` to create the environment.
        n_episodes: int, optional, default=10,000
            The number of episodes to run.
        tmax: int, optional, default=None
            The maximum number of steps to run in each episode. If None, set to 10,000,
            which effectively imposes no limit in most environments.
        policy: spyro.policies instance, default=None
            The policy to use during evaluation if it is not the same as during training.
        env_params: dict, optional, default=None
            Dictionary of parameter values to pass to `env_cls` upon initialization.
        init: boolean, default=False
            Whether to (re-)initialize the network (True) or to keep the current neural
            network parameters (False).
        """
        if policy is not None:
            self.eval_policy = policy
        else:
            self.eval_policy = self.policy

        if tmax is None:
            self.tmax = 10000
        else:
            self.tmax = tmax

        self.env = make_env(env_cls, env_params)
        self.action_shape, self.n_actions, self.obs_shape, _ = \
                obtain_env_information(env_cls, env_params)

        if init:
            tf.reset_default_graph()
            self._init_graph()

        self.episode_counter = 0
        self.step_counter = 0
        self.done = True

        self.eval_results = {
            "total_episode_reward": np.zeros(n_episodes),
            "mean_episode_reward": np.zeros(n_episodes),
            "episode_length": np.zeros(n_episodes),
        }

        for ep in range(n_episodes):
            self.state = np.asarray(self.env.reset(), dtype=np.float64)
            self.episode_step_counter = 0
            self.episode_reward = 0

            for i in range(self.tmax):

                # predict Q-values Q(s,a)
                qvalues = self.session.run(self.online_qvalues,
                                           feed_dict={
                                               self.states_ph:
                                               np.reshape(self.state, (1, -1))
                                           })

                # select and perform action
                self.action = self.eval_policy.select_action(
                    qvalues.reshape(-1))
                new_state, self.reward, self.done, _ = self.env.step(
                    self.action)

                # bookkeeping
                self.step_counter += 1
                self.episode_reward += self.reward
                self.episode_step_counter += 1
                self.state = np.asarray(copy.copy(new_state), dtype=np.float64)

                # end of episode
                if self.done:
                    break

            self.eval_results["total_episode_reward"][ep] = self.episode_reward
            self.eval_results["mean_episode_reward"][
                ep] = self.episode_reward / self.episode_step_counter
            self.eval_results["episode_length"][ep] = self.episode_step_counter

            progress("Completed episode {}/{}".format(ep + 1, n_episodes),
                     same_line=(ep > 0),
                     newline_end=(ep + 1 == n_episodes))

        return self.eval_results
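A short usage sketch for `evaluate()`; the agent instance and environment id are assumptions. The returned dictionary holds the three arrays filled in the episode loop above.

# Hypothetical call; `agent` and the environment id are assumptions.
results = agent.evaluate("CartPole-v1", n_episodes=100, tmax=500)
print("mean total reward  :", results["total_episode_reward"].mean())
print("mean episode length:", results["episode_length"].mean())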
Example #5
    def fit(self,
            env_cls,
            total_steps=4e7,
            warmup_steps=10000,
            tmax=None,
            env_params=None,
            restart=True):
        """Train the agent on a given environment.

        Parameters
        ----------
        env_cls: uninitialized Python class or str
            The environment to train on. If a class is provided, it must be uninitialized.
            Parameters can be passed to the environment using env_params. If a string
            is provided, this string is fed to `gym.make()` to create the environment.
        total_steps: int, optional, default=4e7
            The total number of training steps.
        warmup_steps: int, default=10000
            The number of steps to take in the environment before starting to train the
            network. This is used to build up a replay buffer with varying experiences
            to ensure uncorrelated samples from the start of training.
        tmax: int, optional, default=None
            The maximum number of steps to run in a single trial if the episode is not
            over earlier. After tmax steps, all end-of-episode tasks will be performed.
        env_params: dict, optional, default=None
            Dictionary of parameter values to pass to `env_cls` upon initialization.
        restart: boolean, default=True
            Whether to (re-)initialize the network (True) or to keep the current neural
            network parameters (False).
        """
        self.warmup_steps = warmup_steps
        self.tmax = tmax
        self.env = make_env(env_cls, env_params)
        self.action_shape, self.n_actions, self.obs_shape, _ = \
                obtain_env_information(env_cls, env_params)

        if restart:
            tf.reset_default_graph()
            self.total_steps = total_steps
            self._init_graph()
            if self.use_target_network:
                self.hard_update_target_network()

            self.episode_counter = 0
            self.step_counter = 0
            self.done = True
        else:
            try:
                self.total_steps += total_steps
            except AttributeError:
                # counters do not exist, graph must be restored from disk
                # recreate the counters like on a normal restart
                self.total_steps = total_steps
                self.step_counter = 0
                self.done = True
                self.episode_counter = 0

        while self.step_counter < total_steps:
            self.run_session()
            self.episode_counter += 1
            print("\rSteps completed: {}/{}".format(self.step_counter,
                                                    self.total_steps),
                  end="")
Example #6
    def evaluate(self, env_cls, n_episodes=10000, tmax=None, env_params=None):
        """Evaluate the agent on an environemt without training.

        Parameters
        ----------
        env_cls: uninitialized Python class or str
            The environment to evaluate on. If a class is provided, it must be uninitialized.
            Parameters can be passed to the environment using env_params. If a string
            is provided, this string is fed to `gym.make()` to create the environment.
        n_episodes: int, optional, default=10,000
            The number of episodes to run.
        tmax: int, optional, default=None
            The maximum number of steps to run in each episode. If None, set to 10,000,
            which effectively imposes no limit in most environments.
        env_params: dict, optional, default=None
            Dictionary of parameter values to pass to `env_cls` upon initialization.
        """
        if tmax is None:
            self.tmax = 10000
        else:
            self.tmax = tmax

        self.env = make_env(env_cls, env_params)
        self.action_shape, self.n_actions, self.obs_shape, _ = \
                obtain_env_information(env_cls, env_params)

        self.episode_counter = 0
        self.step_counter = 0
        self.done = True

        self.eval_results = {
            "total_episode_reward": np.zeros(n_episodes),
            "mean_episode_reward": np.zeros(n_episodes),
            "episode_length": np.zeros(n_episodes),
        }

        seen_states = {}
        for ep in range(n_episodes):
            self.state = np.asarray(self.env.reset(), dtype=np.int16)
            self.episode_step_counter = 0
            self.episode_reward = 0

            for i in range(self.tmax):

                # get relocations from dictionary if problem was solved before
                # otherwise solve it and save the results for next time
                try:
                    relocations = seen_states[tuple(
                        extract_vehicles_from_state(self.state))]
                except KeyError:
                    relocations = self.get_relocations(self.state)
                    seen_states[tuple(extract_vehicles_from_state(
                        self.state))] = relocations

                # get origin if current destination is in the relocations
                to_from = {d['to']: d['from'] for d in relocations.values()}
                destination_area = extract_current_destination_area(self.state)
                origin_area = to_from.get(destination_area)

                # select and perform action
                self.action = area_to_action(origin_area)
                new_state, self.reward, self.done, _ = self.env.step(
                    self.action)

                # bookkeeping
                self.step_counter += 1
                self.episode_reward += self.reward
                self.episode_step_counter += 1
                self.state = np.asarray(copy.copy(new_state), dtype=np.int16)

                # end of episode
                if self.done:
                    break

            self.eval_results["total_episode_reward"][ep] = self.episode_reward
            self.eval_results["mean_episode_reward"][
                ep] = self.episode_reward / self.episode_step_counter
            self.eval_results["episode_length"][ep] = self.episode_step_counter

            progress("Completed episode {}/{}".format(ep + 1, n_episodes),
                     same_line=(ep > 0),
                     newline_end=(ep + 1 == n_episodes))

        return self.eval_results
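The `seen_states` dictionary above memoizes the relocation plan per vehicle configuration so each distinct state is solved only once. The same pattern in isolation; the helper names mirror the snippet and are otherwise assumptions.

# Stand-alone sketch of the caching pattern used above; solve() stands in for
# get_relocations(), and extract_vehicles_from_state() is assumed to return a
# sequence that becomes hashable once wrapped in tuple().
def cached_relocations(state, cache, solve):
    key = tuple(extract_vehicles_from_state(state))
    if key not in cache:
        cache[key] = solve(state)      # solve the relocation problem once
    return cache[key]                  # reuse the stored plan afterwards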