def test_basis_and_tensors():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    basis_rbf = GaussianRBF.generate([3, 3], low, high)
    tensor_rbf = GaussianRBFTensor.generate([3, 3], low, high)

    features_1 = Features(tensor_list=tensor_rbf)
    features_2 = Features(basis_list=basis_rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y_1 = features_1(x)
    y_2 = features_2(x)

    assert np.allclose(y_1, y_2)

def test_tensor():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = GaussianRBFTensor.generate([3, 3], low, high)
    rbf += RandomFourierBasis.generate(0.1, 6, 2)

    features = Features(tensor_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    assert y.shape == (10, 15)

    for i, x_i in enumerate(x):
        assert np.allclose(features(x_i), y[i])

    assert np.all(y[:, -1] == 1)

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.allclose(features(x_1, x_2), y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.allclose(features(x_i[0], x_i[1]), y[i])

def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi, Parameter(.1), .9,
                                  features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.30427303, 0., -13.54157504, 0., -16.82373134, 0.,
                       -10.29613337, 0., -14.79470382, 0., -10.50654665, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)

def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.62627886, 0., -13.03033079, 0., -15.93237930, 0.,
                       -9.72299176, 0., -13.78884631, 0., -9.92157645, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)

def learn_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    return agent

def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)

def test_tiles_voronoi():
    tilings_list = [
        VoronoiTiles.generate(3, 10, low=np.array([0., -.5]),
                              high=np.array([1., .5])),
        VoronoiTiles.generate(3, 10, mu=np.array([.5, -.5]),
                              sigma=np.array([.2, .6]))
    ]

    for tilings in tilings_list:
        features = Features(tilings=tilings)

        x = np.random.rand(10, 2) + [0., -.5]

        y = features(x)

        for i, x_i in enumerate(x):
            assert np.all(features(x_i) == y[i])

        x_1 = x[:, 0].reshape(-1, 1)
        x_2 = x[:, 1].reshape(-1, 1)

        assert np.all(features(x_1, x_2) == y)

        for i, x_i in enumerate(zip(x_1, x_2)):
            assert np.all(features(x_i[0], x_i[1]) == y[i])

        assert features.size == y[i].size

def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0])

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)

def test_true_online_sarsa_lambda_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = TrueOnlineSARSALambda(mdp_continuous.info, pi, Parameter(.1),
                                       .9, features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)

def test_sarsa_lambda_continuous_nn_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0])

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = SARSALambdaContinuous(mdp_continuous.info, pi,
                                       TorchApproximator, Parameter(.1), .9,
                                       features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)

def test_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    w = agent.approximator.get_weights()
    w_test = np.array([-1.00749128, -1.13444655, -0.96620322])

    assert np.allclose(w, w_test)

def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi, Parameter(.1), .9,
                                  features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)

def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)

def test_random_fourier():
    np.random.seed(1)
    tensor_list = RandomFourierBasis.generate(nu=2.5, n_output=10,
                                              input_size=2)

    x = np.array([0.1, 1.4])

    features = Features(tensor_list=tensor_list)

    res = np.array([0.33279073, 0.84292346, 0.03078904, -0.98234737,
                    0.8367746, 0.476112, 0.4179958, 0.99205977, 0.5216869,
                    1.])

    assert np.allclose(features(x), res)
    assert features.size == res.size

def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))

def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)

def test_fourier():
    low = np.array([-1.0, 0.5])
    high = np.array([1.0, 2.5])
    basis_list = FourierBasis.generate(low, high, 5)
    features = Features(basis_list=basis_list)

    x = np.array([0.1, 1.4])

    res = np.array([1., -0.15643447, -0.95105652, 0.4539905, 0.80901699,
                    -0.70710678, 0.15643447, -1., 0.15643447, 0.95105652,
                    -0.4539905, -0.80901699, -0.95105652, -0.15643447, 1.,
                    -0.15643447, -0.95105652, 0.4539905, -0.4539905,
                    0.95105652, 0.15643447, -1., 0.15643447, 0.95105652,
                    0.80901699, 0.4539905, -0.95105652, -0.15643447, 1.,
                    -0.15643447, 0.70710678, -0.80901699, -0.4539905,
                    0.95105652, 0.15643447, -1.])

    assert np.allclose(features(x), res)

def test_tiles():
    tilings = Tiles.generate(3, [3, 3],
                             np.array([0., -.5]),
                             np.array([1., .5]))
    features = Features(tilings=tilings)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])

def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='Acrobot-v1', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10, 10, 10, 10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(mdp.info, pi,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_steps_per_fit=1, render=True)
    dataset = core.evaluate(n_episodes=1, render=False)
    print(episodes_length(dataset))

    return np.mean(compute_J(dataset, .96))

def test_tensor():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = PyTorchGaussianRBF.generate([3, 3], low, high)
    features = Features(tensor_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.allclose(features(x_i), y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.allclose(features(x_1, x_2), y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.allclose(features(x_i[0], x_i[1]), y[i])

def test_basis():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = GaussianRBF.generate([3, 3], low, high)
    features = Features(basis_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])

def test_sarsa_lambda_continuous_linear_save():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = SARSALambdaContinuous(mdp_continuous.info, pi,
                                       LinearApproximator, Parameter(.1), .9,
                                       features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_path = './agentdir{}/'.format(datetime.now().strftime("%H%M%S%f"))
    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    shutil.rmtree(agent_path)

    for att, method in agent_save.__dict__.items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)

def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = StochasticAC_AVG(mdp.info, policy,
                             alpha_theta, alpha_v, alpha_r,
                             lambda_par=.5,
                             value_function_features=psi,
                             policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)

class CMAC(LinearApproximator):
    """
    This class implements a Cerebellar Model Arithmetic Computer.

    """
    def __init__(self, tilings, weights=None, output_shape=(1,), **kwargs):
        """
        Constructor.

        Args:
            tilings (list): list of tilings to discretize the input space;
            weights (np.ndarray): array of weights to initialize the weights
                of the approximator;
            output_shape (np.ndarray, (1,)): the shape of the output of the
                model;
            **kwargs: other params of the approximator.

        """
        self._phi = Features(tilings=tilings)
        self._n = len(tilings)

        super().__init__(weights=weights, input_shape=(self._phi.size,),
                         output_shape=output_shape)

        self._add_save_attr(_phi='pickle', _n='primitive')

    def fit(self, x, y, alpha=1.0, **kwargs):
        """
        Fit the model.

        Args:
            x (np.ndarray): input;
            y (np.ndarray): target;
            alpha (float): learning rate;
            **kwargs: other parameters used by the fit method of the
                regressor.

        """
        y_hat = self.predict(x)
        delta_y = np.atleast_2d(y - y_hat)
        if self._w.shape[0] > 1:
            delta_y = delta_y.T

        phi = np.atleast_2d(self._phi(x))

        sum_phi = np.sum(phi, axis=0)
        n = np.sum(phi, axis=1, keepdims=True)
        phi_n = phi / n
        sum_phi[sum_phi == 0] = 1.

        delta_w = delta_y @ phi_n / sum_phi
        self._w += alpha * delta_w

    def predict(self, x, **predict_params):
        """
        Predict.

        Args:
            x (np.ndarray): input;
            **predict_params: other parameters used by the predict method
                of the regressor.

        Returns:
            The predictions of the model.

        """
        prediction = np.ones((x.shape[0], self._w.shape[0]))
        indexes = self._phi.compute_indexes(x)

        if x.shape[0] == 1:
            indexes = list([indexes])

        for i, idx in enumerate(indexes):
            prediction[i] = np.sum(self._w[:, idx], axis=-1)

        return prediction.squeeze()

    def diff(self, state, action=None):
        """
        Compute the derivative of the output w.r.t. ``state``, and ``action``
        if provided.

        Args:
            state (np.ndarray): the state;
            action (np.ndarray, None): the action.

        Returns:
            The derivative of the output w.r.t. ``state``, and ``action``
            if provided.

        """
        phi = self._phi(state)

        return super().diff(phi, action)

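# A minimal usage sketch for CMAC, not part of the original class: it fits a
# toy 1-D target with the averaged tile-coding update implemented in fit()
# above. The Tiles import path matches mushroom_rl; the tiling layout and
# iteration count are illustrative assumptions.
def _cmac_usage_sketch():
    from mushroom_rl.features.tiles import Tiles

    # Eight overlapping 1-D tilings of 8 tiles each over [0, 1].
    tilings = Tiles.generate(8, [8], np.array([0.]), np.array([1.]))
    cmac = CMAC(tilings)

    x = np.linspace(0., 1., 32).reshape(-1, 1)
    y = np.sin(2 * np.pi * x.ravel())

    # Each call applies one averaged CMAC correction over the batch.
    for _ in range(100):
        cmac.fit(x, y)

    # The residual should shrink toward the resolution limit of the tilings.
    return np.max(np.abs(cmac.predict(x) - y))
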
import numpy as np

from mushroom_rl.algorithms.value import SARSALambdaContinuous
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.features import Features
from mushroom_rl.features.tiles import Tiles
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.callbacks import CollectDataset
from mushroom_rl.utils.parameters import Parameter

# MDP
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# Policy
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)

# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

# Agent
learning_rate = Parameter(.1 / n_tilings)
approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)
agent = SARSALambdaContinuous(mdp.info, pi, LinearApproximator,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9, features=features)

# Algorithm
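# The snippet above is truncated after the '# Algorithm' header. A minimal
# continuation in the same tutorial style, using the standard mushroom_rl
# Core API; the episode count is an illustrative assumption, not the
# original value.
core = Core(agent, mdp)

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)

# Evaluate with the greedy policy (epsilon is already 0)
dataset = core.evaluate(n_episodes=1, render=False)
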
def learn(alg):
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 2
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [1, 1],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    if alg is StochasticAC:
        agent = alg(mdp.info, policy, alpha_theta, alpha_v, lambda_par=.5,
                    value_function_features=psi, policy_features=phi)
    elif alg is StochasticAC_AVG:
        agent = alg(mdp.info, policy, alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5, value_function_features=psi,
                    policy_features=phi)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return agent
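
# A minimal usage sketch, not part of the original module: learn() dispatches
# on the algorithm class, so both actor-critic variants can be exercised the
# same way. It assumes StochasticAC and StochasticAC_AVG are imported as in
# the function above.
if __name__ == '__main__':
    for alg in [StochasticAC, StochasticAC_AVG]:
        trained_agent = learn(alg)
        print('Trained', alg.__name__)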