def learn(alg, alg_params):
    """Train *alg* on a 1-D LQR task and return the learned policy.

    Seeds numpy/torch RNGs for reproducibility, builds linear mean and
    std approximators over the observation space (std weights fixed at
    2), wraps them in a StateStdGaussianPolicy, and runs 10 episodes
    with a fit every 5 episodes.
    """
    mdp = LQR.generate(dimensions=1)

    # Pin every RNG so repeated runs produce identical policies.
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Mean and std heads share the same linear-regressor configuration.
    regressor_kwargs = dict(
        input_shape=mdp.info.observation_space.shape,
        output_shape=mdp.info.action_space.shape,
        params=dict(input_dim=mdp.info.observation_space.shape),
    )
    mu = Regressor(LinearApproximator, **regressor_kwargs)
    sigma = Regressor(LinearApproximator, **regressor_kwargs)

    # Start exploration with a standard deviation of 2 everywhere.
    sigma.set_weights(2 * np.ones(sigma.weights_size))

    policy = StateStdGaussianPolicy(mu, sigma)
    agent = alg(mdp.info, policy, **alg_params)

    Core(agent, mdp).learn(n_episodes=10, n_episodes_per_fit=5)
    return policy
def test_lqr():
    """Regression checks for the LQR environment dynamics and rewards."""
    # Part 1: drive the default generated 2-D LQR with seeded random
    # actions; the state reached after 10 steps is pinned by the seed.
    np.random.seed(1)
    mdp = LQR.generate(2)
    mdp.reset()
    n_actions = mdp.info.action_space.shape[0]
    for _ in range(10):
        ns, r, ab, _ = mdp.step(np.random.rand(n_actions))
    assert np.allclose(ns, np.array([12.35564605, 14.98996889]))

    # Part 2: hand-built 3-state / 2-action episodic LQR with position
    # and action saturation limits; check each transition exactly.
    A = np.eye(3)
    B = np.array([[2 / 3, 0], [1 / 3, 1 / 3], [0, 2 / 3]])
    Q = np.array([[0.1, 0., 0.], [0., 0.9, 0.], [0., 0., 0.1]])
    R = np.array([[0.1, 0.], [0., 0.9]])
    mdp = LQR(A, B, Q, R, max_pos=11.0, max_action=0.5, episodic=True)
    mdp.reset()

    # (action, expected next state, expected reward, expected absorbing)
    cases = [
        (np.array([1.0, 0.3]),
         np.array([10.23333333, 10.16666667, 10.1]),
         -107.917, False),
        (np.array([0.4, -0.1]),
         np.array([10.5, 10.26666667, 10.03333333]),
         -113.72311111111117, False),
        (np.array([0.5, 0.6]),
         np.array([10.83333333, 10.6, 10.36666667]),
         -116.20577777777778, False),
        # Final step exceeds max_pos: penalty reward, episode absorbs.
        (np.array([0.3, -0.7]),
         np.array([11.03333333, 10.53333333, 10.03333333]),
         -1210.0, True),
    ]
    for action, exp_ns, exp_r, exp_ab in cases:
        ns, r, ab, _ = mdp.step(action)
        assert np.allclose(ns, exp_ns)
        assert np.allclose(r, exp_r)
        assert bool(ab) == exp_ab