Example #1
0
 def _compute(self, tolerance, verbose=False):
     """Compute a state metric from the value functions given by `avf_values`.

     The distance between two states is the largest absolute difference
     between their one-step Q-values, maximised over every action and every
     value function. The result is stored in `self.metric` and timing
     information in `self.statistics`; nothing is returned.

     Args:
       tolerance: float, unused.
       verbose: bool, unused.
     """
     del tolerance, verbose  # Neither argument influences this computation.
     start_time = time.time()
     num_optimization_steps = 1000
     value_functions = avf_values(
         self.env,
         num_optimization_steps=num_optimization_steps,
         num_tasks=self.num_tasks)
     # Normalise to a 2-D array holding one value function per row.
     value_functions = np.atleast_2d(np.squeeze(value_functions))
     # A Q-value depends on a single state only, so the whole table is built
     # once here rather than being recomputed for every pair of states.
     # q_values[s, a, p] = r(s, a) + gamma * <P(. | s, a), V_p>
     expected_next_values = np.matmul(
         np.asarray(self.env.transition_probs), value_functions.T)
     q_values = (np.asarray(self.env.rewards)[:, :, None] +
                 self.env.gamma * expected_next_values)
     self.metric = np.zeros((self.num_states, self.num_states))
     for s in range(self.num_states):
         # We take advantage of symmetry: only states t > s are visited and
         # the result is mirrored. The diagonal stays at zero.
         gaps = np.abs(q_values[s] - q_values[s + 1:self.num_states])
         # `initial=0.0` keeps the distance at zero when there are no
         # actions or value functions to maximise over.
         row = np.max(gaps, axis=(1, 2), initial=0.0)
         self.metric[s, s + 1:] = row
         self.metric[s + 1:, s] = row
     # We don't really have a sampled version of this.
     total_time = time.time() - start_time
     self.statistics = metric.Statistics(0., total_time,
                                         num_optimization_steps, 0.)
 def _compute(self, tolerance, verbose=False):
     """Set the metric to the largest Q-value gap between each pair of states.

     For every pair of entries along the first axis of `self.env.q_values`,
     the distance is the maximum absolute difference along the last axis
     (presumably the action axis -- confirm against the environment).
     The result is stored in `self.metric`; nothing is returned.

     Args:
       tolerance: float, unused.
       verbose: bool, unused.
     """
     del tolerance
     assert self.env.q_values is not None, 'Q-Values have not been computed.'
     q_values = self.env.q_values
     # Broadcasting builds every pairwise difference in a single expression:
     # entry [s, t, ...] holds q_values[s] - q_values[t].
     pairwise_gaps = np.abs(q_values[:, None, :] - q_values[None, :, :])
     self.metric = np.max(pairwise_gaps, axis=-1)
     self.statistics = metric.Statistics(0., 0., 0, 0.)
Example #3
0
  def _compute(self, tolerance, verbose=False):
    """Compute exact/online lax-bisimulation metric up to specified tolerance.

    Starting from the all-zeros metric, each sweep builds a state-action
    distance (reward gap plus discounted earth mover's distance between
    next-state distributions under the current metric) and then takes the
    Hausdorff distance over actions to obtain the new state metric. The result
    is stored in `self.metric` and run statistics in `self.statistics`.

    Args:
      tolerance: float, maximum difference in metric estimate between successive
        iterations. Iteration stops once the largest change is no longer
        greater than this value. A tolerance <= 0 skips the loop entirely and
        leaves the metric at zero.
      verbose: bool, whether to print verbose messages.
    """
    # Initial metric is all zeros.
    curr_metric = np.zeros((self.num_states, self.num_states))
    metric_difference = tolerance * 2.
    # NOTE(review): the counter starts at 1 and is incremented after every
    # sweep, so the value reported in the statistics is one more than the
    # number of sweeps performed. Kept as-is for compatibility.
    i = 1
    exact_metric_differences = []
    start_time = time.time()
    while metric_difference > tolerance:
      new_metric = np.zeros((self.num_states, self.num_states))
      for s in range(self.num_states):
        # The metric is filled symmetrically and its diagonal stays at zero,
        # so state-action distances are only needed for pairs with t > s.
        for t in range(s + 1, self.num_states):
          # pair_metric[a, b] is the distance between (s, a) and (t, b).
          pair_metric = np.zeros((self.num_actions, self.num_actions))
          for a in range(self.num_actions):
            next_state_distrib_1 = self.env.transition_probs[s, a, :]
            rew1 = self.env.rewards[s, a]
            for b in range(self.num_actions):
              next_state_distrib_2 = self.env.transition_probs[t, b, :]
              rew2 = self.env.rewards[t, b]
              emd = ot.emd2(
                  next_state_distrib_1, next_state_distrib_2, curr_metric)
              pair_metric[a, b] = abs(rew1 - rew2) + self.gamma * emd
          # Hausdorff metric over actions (Definition 5 in the paper):
          # \sup_x\inf_y d(x, y) ...
          sup_inf_over_a = np.max(np.min(pair_metric, axis=1))
          # ... and \sup_y\inf_x d(x, y).
          sup_inf_over_b = np.max(np.min(pair_metric, axis=0))
          new_metric[s, t] = max(sup_inf_over_a, sup_inf_over_b)
          new_metric[t, s] = new_metric[s, t]
      metric_difference = np.max(abs(new_metric - curr_metric))
      exact_metric_differences.append(metric_difference)
      if verbose:
        logging.info('Iteration %d: %f', i, metric_difference)
      # new_metric is freshly allocated on every sweep, so no copy is needed.
      curr_metric = new_metric
      i += 1
    total_time = time.time() - start_time
    self.metric = curr_metric
    self.statistics = metric.Statistics(
        tolerance, total_time, i, exact_metric_differences)
    def _compute(self, tolerance, verbose=False):
        """Compute exact/online bisimulation metric up to the specified tolerance.

        Starting from the all-zeros metric, each sweep sets the distance
        between two states to the largest, over actions, of the reward gap
        plus the discounted earth mover's distance between the next-state
        distributions under the current metric. The result is stored in
        `self.metric` and run statistics in `self.statistics`.

        Args:
          tolerance: float, maximum difference in metric estimate between
            successive iterations. Iteration stops once the largest change is
            no longer greater than this value.
          verbose: bool, whether to print verbose messages.
        """
        # Initial metric is all zeros.
        previous = np.zeros((self.num_states, self.num_states))
        largest_change = tolerance * 2.
        i = 1
        change_history = []
        start_time = time.time()
        while largest_change > tolerance:
            updated = np.zeros((self.num_states, self.num_states))
            # Visit every ordered pair of states, row by row.
            for s, t in np.ndindex(self.num_states, self.num_states):
                for a in range(self.num_actions):
                    reward_gap = abs(
                        self.env.rewards[s, a] - self.env.rewards[t, a])
                    transport_cost = ot.emd2(
                        self.env.transition_probs[s, a, :],
                        self.env.transition_probs[t, a, :],
                        previous)
                    candidate = reward_gap + self.gamma * transport_cost
                    # Keep the running maximum over actions.
                    updated[s, t] = max(updated[s, t], candidate)
            largest_change = np.abs(updated - previous).max()
            change_history.append(largest_change)
            if verbose:
                logging.info('Iteration %d: %f', i, largest_change)
            previous = updated.copy()
            i += 1
        total_time = time.time() - start_time
        self.metric = previous
        self.statistics = metric.Statistics(tolerance, total_time, i,
                                            change_history)
  def _compute(self, tolerance, verbose=False):
    """Compute the bisimulation relation and convert it to a discrete metric.

    States are partitioned into equivalence classes by repeated refinement:
    every sweep moves each state that no longer matches its class (as judged
    by `self._state_matches_class`) into another matching class, or into a new
    singleton class. Sweeps repeat until no state moves. The metric is then 0
    for states in the same class and 1 otherwise.

    Nothing is returned: the result is stored in `self.metric` and the run
    statistics in `self.statistics`.

    Args:
      tolerance: float, unused.
      verbose: bool, whether to print verbose messages.
    """
    del tolerance
    equivalence_classes_changing = True
    iteration = 0
    start_time = time.time()
    # All states start in the same equivalence class.
    equivalence_classes = [list(range(self.num_states))]
    # state_to_class[s] is the index into equivalence_classes of s's class.
    state_to_class = [0] * self.num_states
    while equivalence_classes_changing:
      equivalence_classes_changing = False
      # NOTE(review): this flag is initialised once per sweep and never reset
      # between states, so after one class has been removed, later states in
      # the same sweep are also allowed to re-join the class they just left.
      # Confirm that this is intended.
      class_removed = False
      iteration += 1
      # States are tested against the partition from the previous sweep,
      # while all modifications go to these working copies.
      new_equivalence_classes = copy.deepcopy(equivalence_classes)
      new_state_to_class = copy.deepcopy(state_to_class)
      for s1 in range(self.num_states):
        # Presumably checks that s1 is bisimilar to the members of the given
        # class; the helper is defined outside this block.
        if self._state_matches_class(
            s1, equivalence_classes[state_to_class[s1]]):
          continue
        # We must find a new class for s1.
        equivalence_classes_changing = True
        previous_class = new_state_to_class[s1]
        # -1 marks s1 as currently unassigned.
        new_state_to_class[s1] = -1
        # Checking if there are still any elements in s1's old class.
        potential_new_class = [
            x for x in new_equivalence_classes[previous_class] if x != s1]
        if potential_new_class:
          new_equivalence_classes[previous_class] = potential_new_class
        else:
          # remove s1's old class from the list of new_equivalence_classes.
          new_equivalence_classes.pop(previous_class)
          class_removed = True
          # Re-index the classes: every class after the removed one has
          # shifted down by one position.
          for i, c in enumerate(new_state_to_class):
            if c > previous_class:
              new_state_to_class[i] = c - 1
        # Look for an existing class that s1 matches.
        for i, c in enumerate(new_equivalence_classes):
          # Skip the class s1 just left, unless a class has been removed (in
          # which case previous_class no longer identifies that class).
          if not class_removed and i == previous_class:
            continue
          if self._state_matches_class(s1, c):
            new_state_to_class[s1] = i
            new_equivalence_classes[i] += [s1]
            break
        if new_state_to_class[s1] < 0:
          # If we haven't found a matching equivalence class, we create a new
          # one.
          new_equivalence_classes.append([s1])
          new_state_to_class[s1] = len(new_equivalence_classes) - 1
      # Commit this sweep's partition for the next sweep to test against.
      equivalence_classes = copy.deepcopy(new_equivalence_classes)
      state_to_class = copy.deepcopy(new_state_to_class)
      if iteration % 1000 == 0 and verbose:
        tf.logging.info('Iteration {}'.format(iteration))
    # Now that we have the equivalence classes, we create the metric:
    # distance 1 by default, 0 between states that share a class.
    self.metric = np.ones((self.num_states, self.num_states))
    for c in equivalence_classes:
      for s1 in c:
        for s2 in c:
          self.metric[s1, s2] = 0.
    # The measured time includes building the metric above.
    total_time = time.time() - start_time
    # The first argument is -1. here, where the iterative metrics in this
    # file pass their tolerance -- presumably a "not applicable" marker.
    self.statistics = metric.Statistics(-1., total_time, iteration, 0.0)