def plot_policy_and_value_function_approx(self, folder, sess):
    """Derive, plot, and return option policies from the SF matrix via SVD.

    Decomposes ``self.matrix_sf`` with SVD and, for each right-singular
    vector (and its negation), uses that direction as an intrinsic reward
    for policy iteration. Actions whose state value falls below a small
    epsilon are remapped to the augmented "terminate" action index, and
    the resulting value function and policy are plotted into ``folder``.

    Args:
        folder: Output directory passed to the plotting helpers.
        sess: TensorFlow session made default while the environment's
            network is defined and used.

    Returns:
        List of option policies, one per eigen-direction and sign,
        each truncated to the first ``self.nb_states`` entries.
    """
    # Only the singular values and right-singular vectors are needed;
    # the left factor U is intentionally discarded.
    _, eigenvalues, eigenvectors = np.linalg.svd(self.matrix_sf)
    epsilon = 0.0001
    options = []
    with sess.as_default(), sess.graph.as_default():
        self.env.define_network(self.local_network)
        self.env.define_session(sess)
        # "poz"/"neg": follow each eigen-direction and its opposite.
        for k in ["poz", "neg"]:
            for i in range(len(eigenvalues)):
                polIter = PolicyIteration(0.9, self.env, augmentActionSet=True)
                self.env.define_reward_function(
                    eigenvectors[i] if k == "poz" else -eigenvectors[i])
                V, pi = polIter.solvePolicyIteration()
                # Prune states with near-zero value: point their policy
                # entry at the extra (terminate) action index appended by
                # augmentActionSet.
                for j in range(len(V)):
                    if V[j] < epsilon:
                        pi[j] = len(self.env.get_action_set())
                self.plot_value_function(
                    V[0:self.nb_states], str(i) + '_' + k + "_", folder)
                self.plot_policy(
                    pi[0:self.nb_states], str(i) + '_' + k + "_", folder)
                options.append(pi[0:self.nb_states])
    return options
def plot_policy_and_value_function(self, eigenvalues, eigenvectors, policy_folder, v_folder):
    """Plot optimal policies and value functions for each eigen-direction.

    For every eigenvector (taken with both positive and negative sign),
    the direction is installed as the environment's intrinsic reward and
    policy iteration is solved. Low-value states are redirected to the
    augmented "terminate" action before plotting.

    Args:
        eigenvalues: Sequence whose length gives the number of directions.
        eigenvectors: Directions used as intrinsic reward functions.
        policy_folder: Destination directory for policy plots.
        v_folder: Destination directory for value-function plots.
    """
    epsilon = 0.0001
    with self.sess.as_default(), self.sess.graph.as_default():
        self.env.define_network(self.local_network)
        self.env.define_session(self.sess)
        for k in ["poz", "neg"]:
            for i, _ in enumerate(eigenvalues):
                # Solve policy iteration with the (signed) eigenvector as
                # the intrinsic reward.
                direction = eigenvectors[i] if k == "poz" else -eigenvectors[i]
                solver = PolicyIteration(0.9, self.env, augmentActionSet=True)
                self.env.define_reward_function(direction)
                V, pi = solver.solvePolicyIteration()
                # States whose value stays under epsilon get the extra
                # (terminate) action index instead of a primitive action.
                for j, value in enumerate(V):
                    if value < epsilon:
                        pi[j] = len(self.env.get_action_set())
                # Plot both artifacts under a shared name prefix.
                prefix = str(i) + '_' + k + "_"
                self.plot_value_function(V[0:self.nb_states], prefix, v_folder)
                self.plot_policy(pi[0:self.nb_states], prefix, policy_folder)
def plot_policy_and_value_function(self, eigenvalues, eigenvectors):
    """Plot and return option policies for each eigen-direction.

    Each eigenvector (and its negation) is used as the environment's
    intrinsic reward; policy iteration is solved, low-value actions are
    eliminated, and the resulting value function and policy are plotted.

    Args:
        eigenvalues: Sequence whose length gives the number of directions.
        eigenvectors: Directions used as intrinsic reward functions.

    Returns:
        List of option policies, one per eigen-direction and sign,
        each truncated to the first ``self.nb_states`` entries.
    """
    epsilon = 0.0001
    options = []
    for k in ["poz", "neg"]:
        for i in range(len(eigenvalues)):
            polIter = PolicyIteration(0.9, self.env, augmentActionSet=True)
            self.env.define_reward_function(
                eigenvectors[i] if k == "poz" else -eigenvectors[i])
            V, pi = polIter.solvePolicyIteration()
            # Eliminate actions that give only a tiny improvement: any
            # state whose value is below epsilon is mapped to the extra
            # (terminate) action index added by augmentActionSet.
            for j in range(len(V)):
                if V[j] < epsilon:
                    pi[j] = len(self.env.get_action_set())
            self.plot_value_function(V[0:self.nb_states], str(i) + '_' + k + "_")
            self.plot_policy(pi[0:self.nb_states], str(i) + '_' + k + "_")
            options.append(pi[0:self.nb_states])
    return options