def learn_polynomial_basis(self, degree=DEGREE, discount=DISCOUNT, explore=EXPLORE,
                           max_iterations=MAX_ITERATIONS, max_steps=NUM_SAMPLES,
                           initial_policy=None, run_simulation=False):
    """Run LSPI with a one-dimensional polynomial basis, optionally simulating the learned policy."""
    if initial_policy is None:
        initial_policy = policy.Policy(
            basis_functions.OneDimensionalPolynomialBasis(degree, 4), discount, explore)

    learned_policy, distances = lspi.learn(self.lspi_samples, initial_policy, self.solver,
                                           max_iterations=max_iterations)

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []

    if run_simulation:
        # Roll out the learned policy until an absorbing state is reached or max_steps is hit.
        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
            steps_to_goal += 1
            samples.append(sample)

    return steps_to_goal, learned_policy, samples, distances
def learn_proto_values_basis(self, num_basis=NUM_BASIS, explore=EXPLORE,
                             max_iterations=MAX_ITERATIONS, max_steps=NUM_SAMPLES,
                             initial_policy=None, rpi_epochs=1, run_simulation=False):
    """Run LSPI with a proto-value function basis built from the domain's state graph."""
    if initial_policy is None:
        initial_policy = policy.Policy(
            basis_functions.ProtoValueBasis(
                self.domain.learn_graph(self.samples), 4, num_basis),
            self.discount, explore)

    learned_policy, distances = lspi.learn(self.lspi_samples, initial_policy, self.solver,
                                           max_iterations=max_iterations)

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []

    if run_simulation:
        # Roll out the learned policy until an absorbing state is reached or max_steps is hit.
        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
            steps_to_goal += 1
            samples.append(sample)

    return steps_to_goal, learned_policy, samples, distances
def learn_gcn_basis(self, graph_edgelist, dimension, walk_length=30, num_walks=10,
                    time_pts_range=[0, 25], taus='auto', max_iterations=MAX_ITERATIONS,
                    max_steps=NUM_SAMPLES, nb_filters=1, initial_policy=None,
                    discount=DISCOUNT, explore=EXPLORE, run_simulation=False,
                    model_str='gcn_vae'):
    """Run LSPI with a graph convolutional network (GCN) basis built from graph_edgelist."""
    # graph = self.domain.learn_graph(sample_length=walk_length, num_samples=num_walks,
    #                                 sampling_policy=self.sampling_policy)
    # self.domain.write_edgelist(graph_edgelist, graph)
    if initial_policy is None:
        initial_policy = policy.Policy(
            basis_functions.GCNBasis(graph_edgelist, num_actions=4,
                                     dimension=dimension, model_str=model_str),
            discount, explore)

    learned_policy, distances = lspi.learn(self.lspi_samples, initial_policy, self.solver,
                                           max_iterations=max_iterations)

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []

    if run_simulation:
        # Roll out the learned policy until an absorbing state is reached or max_steps is hit.
        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
            steps_to_goal += 1
            samples.append(sample)

    return steps_to_goal, learned_policy, samples, distances
def learn_struc2vec_basis(self, dimension=30, walk_length=100, num_walks=50, window_size=10,
                          epochs=1, edgelist='node2vec/graph/tworooms.edgelist',
                          max_iterations=MAX_ITERATIONS, discount=DISCOUNT, explore=EXPLORE,
                          max_steps=NUM_SAMPLES, initial_policy=None, run_simulation=False):
    """Run LSPI with a struc2vec embedding basis learned from the given edgelist."""
    if initial_policy is None:
        initial_policy = policy.Policy(
            basis_functions.Struc2vecBasis(graph_edgelist=edgelist, num_actions=4,
                                           dimension=dimension, walk_length=walk_length,
                                           num_walks=num_walks, window_size=window_size,
                                           epochs=epochs),
            discount, explore)

    learned_policy, distances = lspi.learn(self.lspi_samples, initial_policy, self.solver,
                                           max_iterations=max_iterations)

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []

    if run_simulation:
        # Roll out the learned policy until an absorbing state is reached or max_steps is hit.
        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
            steps_to_goal += 1
            samples.append(sample)

    return steps_to_goal, learned_policy, samples, distances
def learn_discounted_node2vec_basis(self, dimension=NUM_BASIS, walk_length=30, num_walks=10,
                                    window_size=10, gamma=0.6, p=1, q=1, epochs=1,
                                    learning_rate=0.5, explore=EXPLORE,
                                    max_iterations=MAX_ITERATIONS, max_steps=NUM_SAMPLES,
                                    initial_policy=None,
                                    edgelist='node2vec/graph/NA.edgelist',
                                    run_simulation=False, lspi_epochs=1):
    """Run LSPI with a discounted node2vec basis, re-sampling with the updated policy each epoch."""
    if initial_policy is None:
        initial_policy = policy.Policy(
            basis_functions.DiscountedNode2vecBasis(
                edgelist, num_actions=4,
                transition_probabilities=self.domain.transition_probabilities,
                discount=self.discount, dimension=dimension, walks=self.walks,
                walk_length=walk_length, num_walks=num_walks, window_size=window_size,
                p=p, q=q, epochs=epochs, learning_rate=learning_rate),
            gamma, explore)

    self.sampling_policy = initial_policy
    for i in range(lspi_epochs):
        learned_policy, distances = lspi.learn(self.lspi_samples, self.sampling_policy,
                                               self.solver, max_iterations=max_iterations)
        # Use the newly learned policy, with decayed exploration, to collect fresh samples.
        self.sampling_policy = learned_policy
        self.sampling_policy.explore *= EPSILON_DECAY
        self.compute_samples(True)
    # self.sampling_policy.explore = 1.

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []

    if run_simulation:
        # Roll out the learned policy until an absorbing state is reached or max_steps is hit.
        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
            steps_to_goal += 1
            samples.append(sample)

    return steps_to_goal, learned_policy, samples, distances
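# The lspi_epochs loop in learn_discounted_node2vec_basis alternates LSPI with fresh sample
# collection while decaying exploration each epoch. A minimal sketch of that schedule in
# isolation (EPSILON_DECAY, EXPLORE and lspi.learn are from this module; `collect_samples`
# is a hypothetical stand-in for compute_samples, shown only for illustration):
#
#   explore_rate = EXPLORE
#   for epoch in range(lspi_epochs):
#       current_policy, _ = lspi.learn(samples, current_policy, solver)
#       explore_rate *= EPSILON_DECAY      # e.g. 1.0 -> 0.9 -> 0.81 ... for EPSILON_DECAY = 0.9
#       current_policy.explore = explore_rate
#       samples = collect_samples(current_policy)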
def run_lspi(self, basis, discount, max_iter, explore):
    """Run LSPI with an arbitrary basis and return the learned policy and convergence information."""
    basis_policy = policy.Policy(basis, discount, explore)
    learned_policy, distances, iterations = lspi.learn(self.lspi_samples, basis_policy,
                                                       self.solver, max_iterations=max_iter)
    return learned_policy, distances, iterations
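# Example usage (a minimal sketch, not part of the original class): it assumes an instance
# `runner` of this class has already been constructed with its domain, samples, lspi_samples,
# solver and discount set up; the name `runner` and the argument values are illustrative only.
#
#   steps, learned, rollout, distances = runner.learn_polynomial_basis(run_simulation=True)
#   _, pvf_policy, _, pvf_distances = runner.learn_proto_values_basis(num_basis=NUM_BASIS)
#   learned_policy, distances, iterations = runner.run_lspi(
#       basis_functions.OneDimensionalPolynomialBasis(DEGREE, 4),
#       DISCOUNT, MAX_ITERATIONS, EXPLORE)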