Code example #1
    def plot_policy_and_value_function_approx(self, folder, sess):
        # Alternative (commented out): fetch the decomposition from the network graph:
        # feed_dict = {self.orig_net.matrix_sf: self.matrix_sf}
        # s, v = sess.run([self.orig_net.s, self.orig_net.v], feed_dict=feed_dict)
        u, s, v = np.linalg.svd(self.matrix_sf)
        # The singular values and right singular vectors of the successor-feature
        # matrix are what the code below treats as "eigenvalues" and "eigenvectors".
        eigenvalues = s
        eigenvectors = v

        epsilon = 0.0001
        options = []
        with sess.as_default(), sess.graph.as_default():
            self.env.define_network(self.local_network)
            self.env.define_session(sess)
            for k in ["poz", "neg"]:  # use each eigenvector in both directions
                for i in range(len(eigenvalues)):
                    polIter = PolicyIteration(0.9,
                                              self.env,
                                              augmentActionSet=True)
                    self.env.define_reward_function(
                        eigenvectors[i] if k == "poz" else -eigenvectors[i])
                    V, pi = polIter.solvePolicyIteration()

                    # States whose value falls below epsilon are remapped to the
                    # extra augmented-action index, i.e. the option terminates there.
                    for j in range(len(V)):
                        if V[j] < epsilon:
                            pi[j] = len(self.env.get_action_set())

                    self.plot_value_function(V[0:self.nb_states],
                                             str(i) + '_' + k + "_", folder)
                    self.plot_policy(pi[0:self.nb_states],
                                     str(i) + '_' + k + "_", folder)

                    options.append(pi[0:self.nb_states])
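
A minimal sketch of the decomposition step above, pulled out of the class (the 16x16 random matrix is a stand-in for the learned successor-feature matrix): np.linalg.svd returns the singular values in s and the right singular vectors as the rows of v, and each row is then tried with both signs as an intrinsic reward, mirroring the "poz"/"neg" loop.

    import numpy as np

    matrix_sf = np.random.rand(16, 16)  # stand-in for the successor-feature matrix
    u, s, v = np.linalg.svd(matrix_sf)  # s: singular values, v: right singular vectors (rows)

    for k, sign in [("poz", 1.0), ("neg", -1.0)]:
        for i in range(len(s)):
            intrinsic_reward = sign * v[i]  # one reward entry per state
            # the real code passes this to self.env.define_reward_function(...)
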
Code example #2
  def plot_policy_and_value_function(self, eigenvalues, eigenvectors, policy_folder, v_folder):
    epsilon = 0.0001
    with self.sess.as_default(), self.sess.graph.as_default():
      self.env.define_network(self.local_network)
      self.env.define_session(self.sess)
      for k in ["poz", "neg"]:
        for i in range(len(eigenvalues)):
          """Do policy iteration"""
          discount = 0.9
          polIter = PolicyIteration(discount, self.env, augmentActionSet=True)
          """Use the direction of the eigenvector as intrinsic reward for the policy iteration algorithm"""
          self.env.define_reward_function(eigenvectors[i] if k == "poz" else -eigenvectors[i])
          """Get the optimal value function and policy"""
          V, pi = polIter.solvePolicyIteration()

          for j in range(len(V)):
            if V[j] < epsilon:
              pi[j] = len(self.env.get_action_set())

          """Plot them"""
          self.plot_value_function(V[0:self.nb_states], str(i) + '_' + k + "_", v_folder)
          self.plot_policy(pi[0:self.nb_states], str(i) + '_' + k + "_", policy_folder)
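
One detail worth spelling out: because the action set is augmented (augmentActionSet=True), the index len(self.env.get_action_set()) refers to one action past the primitives, and the epsilon loop uses it as a "terminate" marker. A toy sketch with assumed numpy arrays:

    import numpy as np

    epsilon = 0.0001
    n_primitive_actions = 4                 # stand-in for len(env.get_action_set())
    V = np.array([0.5, 0.00001, 0.2, 0.0])  # toy option value function
    pi = np.array([0, 1, 2, 3])             # toy greedy policy

    # States where the option is worth less than epsilon get the extra
    # augmented-action index, read as "terminate the option here".
    pi[V < epsilon] = n_primitive_actions
    print(pi)  # [0 4 2 4]
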
Code example #3
    def plot_policy_and_value_function(self, eigenvalues, eigenvectors):
        epsilon = 0.0001
        options = []
        for k in ["poz", "neg"]:
            for i in range(len(eigenvalues)):
                polIter = PolicyIteration(0.9, self.env, augmentActionSet=True)
                self.env.define_reward_function(
                    eigenvectors[i] if k == "poz" else -eigenvectors[i])
                V, pi = polIter.solvePolicyIteration()

                # Eliminate actions that yield only a negligible improvement.
                # This is where the epsilon parameter matters: it is set to a
                # very small value, and any state whose value falls below it is
                # remapped to the extra "terminate" action below.
                for j in range(len(V)):
                    if V[j] < epsilon:
                        pi[j] = len(self.env.get_action_set())

                self.plot_value_function(V[0:self.nb_states],
                                         str(i) + '_' + k + "_")
                self.plot_policy(pi[0:self.nb_states], str(i) + '_' + k + "_")

                options.append(pi[0:self.nb_states])
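
All three examples delegate to PolicyIteration.solvePolicyIteration(), which is not shown. As a rough sketch of what such a routine computes, here is textbook tabular policy iteration over an assumed (A, S, S) transition tensor P and a per-state reward vector r; the function name and the two-state toy MDP are illustrative, not the original class:

    import numpy as np

    def solve_policy_iteration(P, r, discount=0.9):
        """P: (A, S, S) transition probabilities, r: (S,) state rewards."""
        n_actions, n_states, _ = P.shape
        pi = np.zeros(n_states, dtype=int)
        while True:
            # Policy evaluation: solve the linear system (I - discount * P_pi) V = r
            P_pi = P[pi, np.arange(n_states)]  # (S, S) transitions under pi
            V = np.linalg.solve(np.eye(n_states) - discount * P_pi, r)
            # Policy improvement: act greedily w.r.t. the one-step lookahead
            Q = r[None, :] + discount * (P @ V)  # (A, S) action values
            new_pi = Q.argmax(axis=0)
            if np.array_equal(new_pi, pi):  # a stable policy is optimal
                return V, pi
            pi = new_pi

    # Two-state toy chain: action 0 stays put, action 1 swaps states.
    P = np.zeros((2, 2, 2))
    P[0, 0, 0] = P[0, 1, 1] = 1.0
    P[1, 0, 1] = P[1, 1, 0] = 1.0
    V, pi = solve_policy_iteration(P, r=np.array([0.0, 1.0]))

The loop terminates after finitely many sweeps because each improvement step is monotone and there are only finitely many deterministic policies.
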