def test_solver_uses_policy_and_data(self):
    """Test that the solver is passed the data and policy."""
    data = [10]
    initial_policy = Policy(FakeBasis(1))
    solver_stub = SolverParamStub(data, initial_policy)

    lspi.learn(solver_stub.data, solver_stub.policy, solver_stub,
               max_iterations=1)
def test_epsilon_stopping_condition(self):
    """Test if learning stops when the weight distance is less than epsilon."""
    with self.assertRaises(ValueError):
        lspi.learn(None, None, None, epsilon=0)

    epsilon_solver = EpsilonSolverStub(10**-21)
    lspi.learn(None, Policy(FakeBasis(1)), epsilon_solver,
               epsilon=10**-20, max_iterations=1000)

    self.assertEqual(epsilon_solver.num_calls, 1)
def test_max_iterations_stopping_condition(self):
    """Test if learning stops when max_iterations is reached."""
    with self.assertRaises(ValueError):
        lspi.learn(None, None, None, max_iterations=0)

    max_iterations_solver = MaxIterationsSolverStub()
    lspi.learn(None, Policy(FakeBasis(1)), max_iterations_solver,
               epsilon=10**-200, max_iterations=10)

    self.assertEqual(max_iterations_solver.num_calls, 10)
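# The solver stubs used by the two stopping-condition tests are not shown in
# this excerpt. A minimal sketch of what they might look like, assuming a
# solver only needs a solve(data, policy) method that returns a new weight
# vector (the bodies below are illustrative, not the project's actual test
# doubles):
class MaxIterationsSolverStub(object):

    def __init__(self):
        self.num_calls = 0

    def solve(self, data, policy):
        # Move the weights far enough that the epsilon check never fires,
        # so learning only stops once max_iterations is reached.
        self.num_calls += 1
        return policy.weights + 100


class EpsilonSolverStub(object):

    def __init__(self, distance):
        self.distance = distance
        self.num_calls = 0

    def solve(self, data, policy):
        # Move the weights by less than epsilon so learning stops after a
        # single iteration.
        self.num_calls += 1
        return policy.weights + self.distance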
def learn_polynomial_basis(self, degree=DEGREE, discount=DISCOUNT,
                           explore=EXPLORE, max_iterations=MAX_ITERATIONS,
                           max_steps=NUM_SAMPLES, initial_policy=None):
    if initial_policy is None:
        initial_policy = lspi.Policy(
            lspi.basis_functions.OneDimensionalPolynomialBasis(degree, 4),
            discount, explore)

    learned_policy, distances = lspi.learn(self.samples, initial_policy,
                                           self.solver,
                                           max_iterations=max_iterations)

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []

    while (not absorb) and (steps_to_goal < max_steps):
        action = learned_policy.select_action(self.domain.current_state())
        sample = self.domain.apply_action(action)
        absorb = sample.absorb
        if absorb:
            print('Reached the goal in %d steps' % steps_to_goal)
        steps_to_goal += 1
        samples.append(sample)

    return steps_to_goal, learned_policy, samples, distances
def learn_node2vec_basis(self, dimension=NUM_BASIS, walk_length=30,
                         num_walks=10, window_size=10, p=1, q=1, epochs=1,
                         discount=DISCOUNT, explore=EXPLORE,
                         max_iterations=MAX_ITERATIONS, max_steps=NUM_SAMPLES,
                         initial_policy=None,
                         edgelist='node2vec/graph/grid6.edgelist'):
    if initial_policy is None:
        initial_policy = lspi.Policy(
            lspi.basis_functions.Node2vecBasis(
                edgelist, num_actions=4,
                transition_probabilities=self.domain.transition_probabilities,
                dimension=dimension, walk_length=walk_length,
                num_walks=num_walks, window_size=window_size,
                p=p, q=q, epochs=epochs),
            discount, explore)

    learned_policy, distances = lspi.learn(self.samples, initial_policy,
                                           self.solver,
                                           max_iterations=max_iterations)

    self.domain.reset()
    steps_to_goal = 0
    absorb = False
    samples = []

    while (not absorb) and (steps_to_goal < max_steps):
        action = learned_policy.select_action(self.domain.current_state())
        sample = self.domain.apply_action(action)
        absorb = sample.absorb
        if absorb:
            print('Reached the goal in %d steps' % steps_to_goal)
        steps_to_goal += 1
        samples.append(sample)

    return steps_to_goal, learned_policy, samples, distances
def chain_walk(n_samples):
    domain = ChainWalkDomain(
        num_states=4,
        reward_location=ChainWalkDomain.RewardLocation.Middle)

    # Collect samples by repeating the previous action at every step.
    samples = []
    init_action = np.random.randint(domain.num_actions)
    init_sample = domain.apply_action(init_action)
    samples.append(init_sample)
    for i in range(1, n_samples):
        a = samples[-1].action
        samples.append(domain.apply_action(a))

    # basis = FakeBasis(2)
    poly_basis = OneDimensionalPolynomialBasis(3, 2)
    # policy = Policy(basis)
    policy = Policy(poly_basis)
    print('initial policy weights:', policy.weights)

    solver = LSTDQSolver()
    learned_policy = learn(samples, policy, solver)
    print('final policy weights:', learned_policy.weights)
    return learned_policy
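# Example invocation of the helper above (illustrative; the state array format
# matches the one-dimensional states produced by ChainWalkDomain):
learned = chain_walk(1000)
print('action selected in state 2:', learned.select_action(np.array([2])))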
def test_chain_polynomial_basis(self):
    initial_policy = lspi.Policy(
        lspi.basis_functions.OneDimensionalPolynomialBasis(3, 2), .9, 0)

    learned_policy = lspi.learn(self.samples, initial_policy, self.solver)

    self.domain.reset()
    cumulative_reward = 0
    for i in range(1000):
        action = learned_policy.select_action(self.domain.current_state())
        sample = self.domain.apply_action(action)
        cumulative_reward += sample.reward

    self.assertGreater(cumulative_reward, self.random_policy_cum_rewards)
def test_returns_policy_with_new_weights(self):
    """Test if the weights in the new policy differ and are not the same underlying numpy vector."""
    initial_policy = Policy(FakeBasis(1))
    weight_solver = WeightSolverStub(initial_policy.weights)
    new_policy = lspi.learn(None, initial_policy, weight_solver,
                            max_iterations=1)

    self.assertEqual(weight_solver.num_calls, 1)
    self.assertFalse(
        np.may_share_memory(initial_policy.weights, new_policy.weights))
    self.assertNotEqual(id(initial_policy), id(new_policy))
    np.testing.assert_array_almost_equal(new_policy.weights,
                                         weight_solver.weights)
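# WeightSolverStub is not shown in this excerpt. A minimal sketch, assuming the
# same solve(data, policy) interface as the other stubs (illustrative only):
# it records the target weights and hands back a copy, so the learned policy
# ends up with equal values in a fresh numpy array.
class WeightSolverStub(object):

    def __init__(self, weights):
        self.weights = weights
        self.num_calls = 0

    def solve(self, data, policy):
        self.num_calls += 1
        return self.weights.copy()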
def test_chain_rbf_basis(self):
    initial_policy = lspi.Policy(
        lspi.basis_functions.RadialBasisFunction(
            np.array([[0], [2], [4], [6], [8]]), .5, 2), .9, 0)

    learned_policy = lspi.learn(self.samples, initial_policy, self.solver)

    self.domain.reset()
    cumulative_reward = 0
    for i in range(1000):
        action = learned_policy.select_action(self.domain.current_state())
        sample = self.domain.apply_action(action)
        cumulative_reward += sample.reward

    self.assertGreater(cumulative_reward, self.random_policy_cum_rewards)
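# The two chain tests above rely on fixtures (self.domain, self.samples,
# self.solver, self.random_policy_cum_rewards) that are not part of this
# excerpt. A hedged sketch of how such a setUp could look, modelled on the
# sampling loop used in the script further below (names and sample counts are
# assumptions, not the project's actual fixture):
def setUp(self):
    self.domain = ChainDomain()
    sampling_policy = Policy(FakeBasis(2), .9, 1)

    self.samples = []
    self.random_policy_cum_rewards = 0
    for i in range(1000):
        action = sampling_policy.select_action(self.domain.current_state())
        sample = self.domain.apply_action(action)
        self.random_policy_cum_rewards += sample.reward
        self.samples.append(sample)

    self.solver = LSTDQSolver()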
def mdps(domain, n_samples):
    # Collect samples by repeating the previous action at every step.
    samples = []
    init_action = np.random.randint(domain.num_actions)
    init_sample = domain.apply_action(init_action)
    samples.append(init_sample)
    for i in range(1, n_samples):
        a = samples[-1].action
        samples.append(domain.apply_action(a))

    # basis = FakeBasis(2)
    # basis = OneDimensionalPolynomialBasis(3, domain.num_actions)
    basis = RadialBasisFunction(np.array([np.array([i]) for i in range(4)]),
                                0.8, domain.num_actions)
    policy = Policy(basis)
    print('initial policy weights:', policy.weights)

    solver = LSTDQSolver()
    learned_policy = learn(samples, policy, solver)
    print('final policy weights:', learned_policy.weights)
    return learned_policy
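# Example call (illustrative): run the helper above on the same four-state
# chain used earlier, which matches the four RBF means built inside mdps.
learned = mdps(ChainWalkDomain(
    num_states=4,
    reward_location=ChainWalkDomain.RewardLocation.Middle), 1000)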
# Sample(np.array([0]), 0, 1, np.array([0])),
# Sample(np.array([1]), 0, -1, np.array([1]), True)
# ]
precondition_value = .3
initial_policy = Policy(OneDimensionalPolynomialBasis(3, 2), .9, 0,
                        tie_breaking_strategy=Policy.TieBreakingStrategy.FirstWins)
# initial_policy = Policy(lspi.basis_functions.RadialBasisFunction(
#     np.array([[0], [2], [4], [6], [8]]), .5, 2), .9, 0)
sampling_policy = Policy(FakeBasis(2), .9, 1)

solver = LSTDQSolver(precondition_value)
# weights = solver.solve(data[:-1], initial_policy)

domain = ChainDomain()

samples = []
for i in range(1000):
    action = sampling_policy.select_action(domain.current_state())
    samples.append(domain.apply_action(action))

learned_policy = lspi.learn(samples, initial_policy, solver)

domain.reset()
cumulative_reward = 0
for i in range(1000):
    action = learned_policy.best_action(domain.current_state())
    sample = domain.apply_action(action)
    print(action)
    cumulative_reward += sample.reward

print(cumulative_reward)
    return sample_data


if __name__ == "__main__":
    quad_domain = QuadcopterDomain()
    num_actions = quad_domain.num_actions()
    # print(num_actions)

    mean_bf = [np.random.uniform(0, 1, size=(6,))]
    # print(mean_bf, '************')
    basis_func = RadialBasisFunction(mean_bf, 0.5, num_actions)
    quad_policy = QuadcopterPolicy(basis_func)

    sample_data = collect_samples(quad_domain, quad_policy)
    print(sample_data[0])

    solver = LSTDQSolver()
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    new_policy = lspi.learn(sample_data, quad_policy, solver)
    print('Done!', (time.perf_counter() - start))

    with open('weights.pickle', 'wb') as weights_file:
        pickle.dump(new_policy.weights, weights_file)
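# Sketch of how the pickled weights could be restored later and attached to a
# policy built with the same basis (quad_policy here stands in for a freshly
# constructed QuadcopterPolicy; assigning to .weights assumes the policy
# exposes its weight vector as a plain attribute, as the code above suggests):
with open('weights.pickle', 'rb') as weights_file:
    quad_policy.weights = pickle.load(weights_file)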
episodes = 5
rewards = []
lengths = []
for i in range(episodes):
    obs = env.reset()
    # Extract normalizedHam and normalizedDelta
    obs = obs[4:6]  # Battiti state
    # obs = obs     # Boosted state

    samples = []  # collect observations of each episode ...[*]
    done = False
    c_reward = 0
    steps = 0
    while not done:
        act = init_pol.select_action(obs)
        nobs, r, done, info = env.step(act)
        nobs = nobs[4:6]  # Battiti state
        # nobs = nobs     # Boosted state
        c_reward += r
        samples.append(lspi.Sample(obs, act, r, nobs, done))
        obs = nobs
        steps += 1
        if steps >= max_steps_per_eps:
            break

    rewards.append(c_reward)
    lengths.append(steps)
    print('{:>6d}: {:>7.1f}'.format(i, c_reward))

    # [*]... to immediately learn from the trajectory of the episode
    init_pol = lspi.learn(samples, init_pol, solver)

env.close()
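# Optional run summary (illustrative; uses only the lists built above):
print('mean reward over {} episodes: {:.1f}'.format(
    episodes, sum(rewards) / len(rewards)))
print('mean episode length: {:.1f}'.format(sum(lengths) / len(lengths)))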