def test_tensor():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = PyTorchGaussianRBF.generate([3, 3], low, high)
    features = Features(tensor_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)
    for i, x_i in enumerate(x):
        assert np.allclose(features(x_i), y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.allclose(features(x_1, x_2), y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.allclose(features(x_i[0], x_i[1]), y[i])
def test_tiles():
    tilings = Tiles.generate(3, [3, 3],
                             np.array([0., -.5]),
                             np.array([1., .5]))
    features = Features(tilings=tilings)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)
    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])
def test_basis():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    # generate expects the low bounds before the high bounds
    rbf = GaussianRBF.generate([3, 3], low, high)
    features = Features(basis_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)
    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])
def test_sac():
    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    agent = StochasticAC(policy, mdp.info, alpha_theta, alpha_v,
                         lambda_par=.5, value_function_features=psi,
                         policy_features=phi)
    agent.fit(dataset)

    w_1 = .09370429
    w_2 = .28141735

    w = agent._V.get_weights()

    assert np.allclose(w_1, w[65])
    assert np.allclose(w_2, w[78])
def experiment():
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    rbfs = GaussianRBF.generate([10, 10], mdp.info.observation_space.low,
                                mdp.info.observation_space.high)
    features = Features(basis_list=rbfs)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = dict(n_iterations=1)  # placeholder; value not given in the source snippet
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = LSPI(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=20)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=20)

    return np.mean(compute_J(dataset, 1.))
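A minimal driver for this LSPI experiment might look like the following; the number of independent runs is an assumption, not part of the original snippet.

if __name__ == '__main__':
    n_experiment = 10  # assumed run count, for illustration only

    # Average the discounted return over several independent experiments.
    Js = [experiment() for _ in range(n_experiment)]
    print(np.mean(Js))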
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=False)

    dataset = collect_dataset.get()

    return np.mean(compute_J(dataset, 1.))
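Both MountainCar experiments are parameterized by the learning rate alpha; a sketch of a driver sweeping a few values follows. The alpha grid is an assumption for illustration, not taken from the original.

if __name__ == '__main__':
    alphas = [.1, .2, .3]  # hypothetical grid; the original values are not shown

    for alpha in alphas:
        J = experiment(alpha)
        print('alpha: {:.1f}, J: {:.4f}'.format(alpha, J))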
def test_sac_avg():
    alpha_r = Parameter(.0001)
    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    agent = StochasticAC_AVG(policy, mdp.info, alpha_theta, alpha_v, alpha_r,
                             lambda_par=.5, value_function_features=psi,
                             policy_features=phi)
    agent.fit(dataset)

    w_1 = .09645764
    w_2 = .28583057

    w = agent._V.get_weights()

    assert np.allclose(w_1, w[65])
    assert np.allclose(w_2, w[78])
import numpy as np

from mushroom.algorithms.policy_search import RWR
from mushroom.approximators import Regressor
from mushroom.approximators.parametric import LinearApproximator
from mushroom.core import Core
from mushroom.distributions import GaussianDiagonalDistribution
from mushroom.environments import ShipSteering
from mushroom.features import Features
from mushroom.features.tiles import Tiles
from mushroom.policy import DeterministicPolicy
from mushroom.utils.parameters import AdaptiveParameter

mdp = ShipSteering()

high = [150, 150, np.pi]
low = [0, 0, -np.pi]
n_tiles = [5, 5, 6]
low = np.array(low, dtype=float)
high = np.array(high, dtype=float)
n_tilings = 1

tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles,
                         low=low, high=high)

phi = Features(tilings=tilings)
input_shape = (phi.size,)

approximator = Regressor(LinearApproximator, input_shape=input_shape,
                         output_shape=mdp.info.action_space.shape)

policy = DeterministicPolicy(approximator)

mu = np.zeros(policy.weights_size)
sigma = 4e-1 * np.ones(policy.weights_size)
distribution_test = GaussianDiagonalDistribution(mu, sigma)
agent_test = RWR(distribution_test, policy, mdp.info, beta=1.)
core = Core(agent_test, mdp)

s = np.arange(10)
a = np.arange(10)
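From here, the standard mushroom loop would train and evaluate the black-box RWR agent; a minimal sketch with assumed episode budgets:

# RWR is episode-based: it fits once per batch of completed episodes.
core.learn(n_episodes=10, n_episodes_per_fit=10)
dataset_eval = core.evaluate(n_episodes=5)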
import numpy as np

from mushroom.algorithms.value import SARSALambdaContinuous
from mushroom.approximators.parametric import LinearApproximator
from mushroom.core import Core
from mushroom.environments import Gym
from mushroom.features import Features
from mushroom.features.tiles import Tiles
from mushroom.policy import EpsGreedy
from mushroom.utils.callbacks import CollectDataset
from mushroom.utils.parameters import Parameter

# MDP
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# Policy
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)

# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
algorithm_params = {'learning_rate': learning_rate,
                    'lambda': .9}
fit_params = dict()
agent_params = {'approximator_params': approximator_params,
                'algorithm_params': algorithm_params,
                'fit_params': fit_params}
agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info, agent_params,
                              features)
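The snippet stops at the agent construction; the training loop that presumably follows would mirror the MountainCar experiment above. The episode budget below is an assumption.

# Algorithm
collect_dataset = CollectDataset()
core = Core(agent, mdp, callbacks=[collect_dataset])

# Train; 20 episodes is an illustrative choice, not from the original.
core.learn(n_episodes=20, n_steps_per_fit=1)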
def learn(alg):
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 2
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [1, 1],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    if alg is StochasticAC:
        agent = alg(policy, mdp.info, alpha_theta, alpha_v, lambda_par=.5,
                    value_function_features=psi, policy_features=phi)
    elif alg is StochasticAC_AVG:
        agent = alg(policy, mdp.info, alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5, value_function_features=psi,
                    policy_features=phi)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
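For reference, a minimal way to exercise learn with both actor-critic variants; this driver is a sketch, not part of the original tests:

policy_sac = learn(StochasticAC)
policy_avg = learn(StochasticAC_AVG)

# Both policies expose their linear weights via the ParametricPolicy interface.
print(policy_sac.get_weights())
print(policy_avg.get_weights())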