import numpy as np

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.policy import DeterministicPolicy


def test_deterministic_policy():
    np.random.seed(88)

    n_dims = 5

    approximator = Regressor(LinearApproximator,
                             input_shape=(n_dims,),
                             output_shape=(2,))

    pi = DeterministicPolicy(approximator)

    w_new = np.random.rand(pi.weights_size)
    w_old = pi.get_weights()
    pi.set_weights(w_new)

    # Setting the policy weights must propagate to the underlying regressor
    assert np.array_equal(w_new, approximator.get_weights())
    assert not np.array_equal(w_old, w_new)
    assert np.array_equal(w_new, pi.get_weights())

    s_test_1 = np.random.randn(n_dims)
    s_test_2 = np.random.randn(n_dims)
    a_test = approximator.predict(s_test_1)

    assert pi.get_regressor() == approximator

    # A deterministic policy puts probability 1 on the regressor output and
    # 0 on every other action
    assert pi(s_test_1, a_test) == 1
    assert pi(s_test_2, a_test) == 0

    # Reference values are valid only under the seed fixed above
    a_stored = np.array([-1.86941072, -0.1789696])
    assert np.allclose(pi.draw_action(s_test_1), a_stored)
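# Running the test standalone (a pytest runner works just as well); note that
# the reference action a_stored above is only valid under the seed fixed
# inside the test, so reordering the random draws would invalidate it.
if __name__ == '__main__':
    test_deterministic_policy()
    print('test_deterministic_policy: OK')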
import numpy as np

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.distributions import GaussianCholeskyDistribution
from mushroom_rl.environments import LQR
from mushroom_rl.policy import DeterministicPolicy
from mushroom_rl.utils.dataset import compute_J


def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    # Search distribution over the policy weights, updated by the black-box
    # algorithm
    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
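# A minimal driver for the LQR experiment above. REPS and RWR are black-box
# policy-search algorithms shipped with MushroomRL; their hyperparameters
# (eps, beta) and the epoch/episode counts here are illustrative assumptions,
# not tuned values.
if __name__ == '__main__':
    from mushroom_rl.algorithms.policy_search import REPS, RWR

    for alg, alg_params in zip([REPS, RWR], [{'eps': 1.}, {'beta': 1.}]):
        experiment(alg, alg_params, n_epochs=4, fit_per_run=10,
                   ep_per_run=100)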
import numpy as np

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core, Logger
from mushroom_rl.distributions import GaussianDiagonalDistribution
from mushroom_rl.environments import ShipSteering
from mushroom_rl.features import Features
from mushroom_rl.features.tiles import Tiles
from mushroom_rl.policy import DeterministicPolicy
from mushroom_rl.utils.dataset import compute_J


def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)    # np.float was removed in NumPy 1.24
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles,
                             low=low, high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))
import numpy as np

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.distributions import GaussianDiagonalDistribution
from mushroom_rl.environments import ShipSteering
from mushroom_rl.features import Features
from mushroom_rl.features.tiles import Tiles
from mushroom_rl.policy import DeterministicPolicy
from mushroom_rl.utils.dataset import compute_J


def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)    # np.float was removed in NumPy 1.24
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles,
                             low=low, high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
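# Hedged driver for the ship-steering experiments above (positional arguments
# fit either variant, since only the parameter names differ). RWR with
# beta=.7 and the counts below are illustrative placeholders, not values from
# the original benchmark.
if __name__ == '__main__':
    from mushroom_rl.algorithms.policy_search import RWR

    experiment(RWR, {'beta': .7}, 25, 10, 20)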
import numpy as np
from tqdm import trange

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core, Logger
from mushroom_rl.distributions import GaussianDiagonalDistribution
from mushroom_rl.environments import Segway
from mushroom_rl.policy import DeterministicPolicy
from mushroom_rl.utils.callbacks import CollectDataset
from mushroom_rl.utils.dataset import compute_J


def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        # The flat parameter vector stacks the means first, then the standard
        # deviations
        p = dist.get_parameters()

        logger.epoch_info(i + 1, J=np.mean(J),
                          mu=p[:n_weights], sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
import numpy as np

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.distributions import GaussianDiagonalDistribution
from mushroom_rl.environments import Segway
from mushroom_rl.policy import DeterministicPolicy
from mushroom_rl.utils.callbacks import CollectDataset
from mushroom_rl.utils.dataset import compute_J


def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    # 'callbacks_fit' is the current Core argument name; older MushroomRL
    # releases called it 'callbacks'
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu: ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
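# Illustrative invocation of the Segway experiments above (both variants share
# this signature); the algorithm choice and every hyperparameter below are
# assumptions, not values taken from the original scripts.
if __name__ == '__main__':
    from mushroom_rl.algorithms.policy_search import REPS

    experiment(REPS, {'eps': .05}, n_epochs=10, n_episodes=100,
               n_ep_per_fit=25)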
import numpy as np
from tqdm import trange

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core, Logger
from mushroom_rl.distributions import GaussianCholeskyDistribution
from mushroom_rl.environments import LQR
from mushroom_rl.policy import DeterministicPolicy
from mushroom_rl.utils.dataset import compute_J


def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      distribution_parameters=distribution.get_parameters())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(
            i + 1, J=np.mean(J),
            distribution_parameters=distribution.get_parameters())
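# Why GaussianCholeskyDistribution here: it adapts a full covariance over the
# policy weights through its Cholesky factor, whereas the diagonal
# distribution in the other examples only adapts per-weight variances. A
# quick sketch to inspect the resulting parameter count, assuming the
# constructor used above; the expected size is n means plus n * (n + 1) / 2
# lower-triangular entries.
if __name__ == '__main__':
    n = 3
    d = GaussianCholeskyDistribution(np.zeros(n), np.eye(n))
    print('parameter vector size:', d.get_parameters().size)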
import numpy as np
import torch

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.distributions import GaussianDiagonalDistribution
from mushroom_rl.environments import LQR
from mushroom_rl.policy import DeterministicPolicy


def learn(alg, **alg_params):
    np.random.seed(1)
    torch.manual_seed(1)

    # MDP
    mdp = LQR.generate(dimensions=2)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, distribution, policy, **alg_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    return agent
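# Example use of the helper above, e.g. as a smoke test. RWR with beta=1. is
# an illustrative choice, and agent.distribution assumes the attribute exposed
# by MushroomRL's black-box policy-search agents.
if __name__ == '__main__':
    from mushroom_rl.algorithms.policy_search import RWR

    agent = learn(RWR, beta=1.)
    print('updated distribution parameters:',
          agent.distribution.get_parameters())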