def learn_proto_values_basis(self, num_basis=NUM_BASIS, walk_length=30,
                             num_walks=10, discount=DISCOUNT, explore=EXPLORE,
                             max_iterations=MAX_ITERATIONS,
                             max_steps=NUM_SAMPLES, initial_policy=None,
                             rpi_epochs=1, run_simulation=False):
    """Learn a policy with LSPI over a proto-value function basis."""
    if initial_policy is None:
        # Build the proto-value basis from a graph learned via random walks
        # over the domain (4 actions, num_basis basis functions).
        initial_policy = policy.Policy(
            basis_functions.ProtoValueBasis(
                self.domain.learn_graph(walk_length, num_walks,
                                        self.sampling_policy),
                4, num_basis),
            discount, explore)
    learned_policy, distances = lspi.learn(self.samples, initial_policy,
                                           self.solver,
                                           max_iterations=max_iterations)

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []
    if run_simulation:
        # Follow the learned policy until an absorbing state is reached or
        # the step budget runs out.
        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            steps_to_goal += 1
            samples.append(sample)
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
    return steps_to_goal, learned_policy, samples, distances
def compute_samples(self, reset_samples=True, reset_policy=True,
                    biased_walk=False):
    """Collect random-walk samples from the domain with the sampling policy."""
    if reset_policy:
        # FakeBasis(4) with explore=1 yields a uniform-random sampling policy.
        self.sampling_policy = policy.Policy(basis_functions.FakeBasis(4),
                                             self.discount, 1)
    if reset_samples:
        self.samples = []
        self.lspi_samples = []
        self.walks = []
        self.actions = []
    for i in range(self.num_samples):
        if biased_walk:
            # generate_unique_samples does not return the walk's actions.
            sample, walk, terminated, lspi_sample = \
                self.domain.generate_unique_samples(self.length_sample,
                                                    self.sampling_policy)
            walk_actions = None  # keeps self.actions aligned with self.walks
        else:
            sample, walk, walk_actions, terminated, lspi_sample = \
                self.domain.generate_samples(self.length_sample,
                                             self.sampling_policy)
        self.samples.extend(sample)
        self.walks.append(walk)
        self.actions.append(walk_actions)
        # if terminated:  # and len(self.lspi_samples) <= NUM_SAMPLES:
        self.lspi_samples.extend(lspi_sample)
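# Usage sketch (hypothetical: `experiment` names an already-constructed
# instance of the enclosing class, with `domain`, `solver`, `discount`,
# `num_samples`, and `length_sample` set up elsewhere):
#
#     experiment.compute_samples()                     # fresh random-walk data
#     experiment.compute_samples(reset_samples=False)  # accumulate more walks
#     experiment.compute_samples(biased_walk=True)     # unique-state sampling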
def learn_polynomial_basis(self, degree=DEGREE, discount=DISCOUNT,
                           explore=EXPLORE, max_iterations=MAX_ITERATIONS,
                           max_steps=NUM_SAMPLES, initial_policy=None,
                           run_simulation=False):
    """Learn a policy with LSPI over a polynomial basis of the given degree."""
    if initial_policy is None:
        initial_policy = policy.Policy(
            basis_functions.OneDimensionalPolynomialBasis(degree, 4),
            discount, explore)
    learned_policy, distances = lspi.learn(self.samples, initial_policy,
                                           self.solver,
                                           max_iterations=max_iterations)

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []
    if run_simulation:
        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            steps_to_goal += 1
            samples.append(sample)
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
    return steps_to_goal, learned_policy, samples, distances
def learn_discounted_node2vec_basis(self, dimension=NUM_BASIS, walk_length=30,
                                    num_walks=10, window_size=10, gamma=0.6,
                                    p=1, q=1, epochs=1, learning_rate=0.5,
                                    explore=EXPLORE,
                                    max_iterations=MAX_ITERATIONS,
                                    max_steps=NUM_SAMPLES, initial_policy=None,
                                    edgelist='node2vec/graph/NA.edgelist',
                                    run_simulation=False, lspi_epochs=1):
    """Learn a policy with LSPI over a discounted node2vec embedding basis.

    Alternates LSPI policy improvement with fresh sample collection for
    lspi_epochs rounds, decaying the exploration rate each round.
    """
    if initial_policy is None:
        initial_policy = policy.Policy(
            basis_functions.DiscountedNode2vecBasis(
                edgelist, num_actions=4,
                transition_probabilities=self.domain.transition_probabilities,
                discount=self.discount, dimension=dimension,
                walks=self.walks, walk_length=walk_length,
                num_walks=num_walks, window_size=window_size, p=p, q=q,
                epochs=epochs, learning_rate=learning_rate),
            gamma, explore)
    self.sampling_policy = initial_policy

    for i in range(lspi_epochs):
        learned_policy, distances = lspi.learn(self.lspi_samples,
                                               self.sampling_policy,
                                               self.solver,
                                               max_iterations=max_iterations)
        # Resample under the improved policy with decayed exploration;
        # reset_policy=False keeps that policy instead of reverting to the
        # uniform-random FakeBasis policy.
        self.sampling_policy = learned_policy
        self.sampling_policy.explore *= EPSILON_DECAY
        self.compute_samples(True, reset_policy=False)
    # self.sampling_policy.explore = 1.

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []
    if run_simulation:
        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            steps_to_goal += 1
            samples.append(sample)
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
    return steps_to_goal, learned_policy, samples, distances
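# The lspi_epochs loop above follows a representation-policy-iteration style
# schedule: solve LSPI, resample under the improved policy, and decay the
# exploration rate geometrically. A minimal sketch of that schedule (assuming
# EPSILON_DECAY is a module-level constant below 1):
#
#     explore = EXPLORE
#     for _ in range(lspi_epochs):
#         explore *= EPSILON_DECAY  # e.g. 1.0 -> 0.9 -> 0.81 for decay 0.9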
def run_lspi(self, basis, discount, max_iter, explore):
    basis_policy = policy.Policy(basis, discount, explore)
    learned_policy, distances, iterations = lspi.learn(
        self.lspi_samples, basis_policy, self.solver,
        max_iterations=max_iter)
    return learned_policy, distances, iterations
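# End-to-end usage sketch (an illustration, not part of the original file:
# `experiment` stands for an already-constructed instance of the enclosing
# class; each unpacking mirrors the corresponding method's return signature,
# noting that lspi.learn is unpacked into two values in the learn_* methods
# but three in run_lspi):
#
#     experiment.compute_samples()
#     steps, learned, sim_samples, distances = \
#         experiment.learn_proto_values_basis(run_simulation=True)
#     learned, distances, iterations = experiment.run_lspi(
#         basis_functions.FakeBasis(4), DISCOUNT, MAX_ITERATIONS, EXPLORE)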