def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
def flat_experiment(mdp, agent, n_epochs, n_iterations, ep_per_iteration,
                    ep_per_eval):
    np.random.seed()

    J_list = list()
    L_list = list()

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_iteration,
                   n_episodes_per_fit=ep_per_iteration, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))
        # print('J', n, ':', J_list[-1])

    return J_list, L_list
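# Usage sketch (not part of the original script): one plausible way to call
# flat_experiment, reusing the simple-chain MDP and Q-Learning agent from the
# experiment above. It assumes the same MushroomRL imports are available
# (generate_simple_chain, EpsGreedy, QLearning, Parameter, Core, compute_J,
# episodes_length); the epoch/iteration counts are illustrative only.
if __name__ == '__main__':
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)
    pi = EpsGreedy(epsilon=Parameter(value=.15))
    agent = QLearning(pi, mdp.info, learning_rate=Parameter(value=.2))

    Js, Ls = flat_experiment(mdp, agent, n_epochs=10, n_iterations=10,
                             ep_per_iteration=10, ep_per_eval=20)
    print('Mean J per epoch:', Js)
    print('Mean episode length per epoch:', Ls)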
def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def experiment(alg, n_runs, n_iterations, ep_per_run, use_tensorflow):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    if use_tensorflow:
        tensor_list = gaussian_tensor.generate(
            [3, 3, 6, 2],
            [[0., 150.], [0., 150.], [-np.pi, np.pi],
             [-np.pi / 12, np.pi / 12]])

        phi = Features(tensor_list=tensor_list, name='phi',
                       input_dim=mdp.info.observation_space.shape[0])
    else:
        basis = GaussianRBF.generate([3, 3, 6, 2],
                                     [[0., 150.], [0., 150.],
                                      [-np.pi, np.pi],
                                      [-np.pi / 12, np.pi / 12]])

        phi = Features(basis_list=basis)

    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = np.array([[.05]])
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):  # Python 2 xrange replaced with range
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
def experiment(alg, params, experiment_params, subdir, i):
    np.random.seed()

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    # sigma = np.array([[1e-4]])
    std = np.array([3e-2])
    policy = DiagonalGaussianPolicy(mu=approximator, std=std)
    # policy = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    agent = alg(policy, mdp.info, features=phi, **params)

    # Train
    # Note: n_runs, n_iterations and ep_per_run are not defined here; they
    # appear to be expected as module-level globals (possibly derived from
    # experiment_params by the calling script).
    parameter_dataset = CollectPolicyParameter(policy)
    core = Core(agent, mdp, callbacks=[parameter_dataset])

    dataset_eval = list()
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_runs):
        print('ITERATION :', n)
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(i))
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset_file', parameter_dataset)
def experiment(alg, params, subdir, exp_no):
    np.random.seed()

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator_params = dict(input_dim=input_shape)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    # Note: n_runs, n_iterations and ep_per_run are not defined here; they
    # appear to be expected as module-level globals set by the calling script.
    dataset_eval = list()
    core = Core(agent, mdp)
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(exp_no))
    np.save(subdir + str(exp_no) + '/dataset_eval_file', dataset_eval)
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = FQI(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(approximator, pi, mdp.info,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return np.mean(compute_J(dataset, mdp.info.gamma))
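# Usage sketch (not part of the original test): calling the helper above with
# FQI, which is used elsewhere in these scripts and accepts n_iterations as a
# keyword argument. The chosen value is illustrative only.
J = learn(FQI, dict(n_iterations=3))
print('Mean J with FQI:', J)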
def experiment(n_epochs, n_steps, n_eval_episodes):
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Agent
    n_tilings = 10
    alpha_theta = ExponentialDecayParameter(1, decay_exp=1.0)
    alpha_omega = ExponentialDecayParameter(1.5 / n_tilings, decay_exp=2 / 3)
    alpha_v = ExponentialDecayParameter(1 / n_tilings, decay_exp=2 / 3)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-3 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=n_eval_episodes)
    J = compute_J(dataset_eval, gamma=1.0)
    print('Total Reward per episode at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset_eval = core.evaluate(n_episodes=n_eval_episodes, render=False)
        J = compute_J(dataset_eval, gamma=1.0)
        print('Total Reward per episode at iteration ' + str(i) + ': ' +
              str(np.mean(J)))
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=fit_params,
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
def experiment(n_epochs, ep_per_epoch_train, ep_per_epoch_eval, n_iterations):
    np.random.seed()

    # MDP
    mdp = PreyPredator()

    # Features
    basis = PolynomialBasis.generate(1, mdp.info.observation_space.shape[0])
    phi = Features(basis_list=basis[1:])

    approximator = Regressor(LinearApproximator, input_shape=(phi.size,),
                             output_shape=mdp.info.action_space.shape)

    sigma = 1e-2 * np.eye(mdp.info.action_space.shape[0])
    policy = GaussianPolicy(approximator, sigma)

    lr = Parameter(1e-5)
    # agent = GPOMDP(policy, mdp.info, lr, phi)
    agent = KeyboardAgent()

    # Train
    core = Core(agent, mdp)
    dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    print('Reward at start: ', np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=ep_per_epoch_train,
                   n_episodes_per_fit=ep_per_epoch_train // n_iterations,
                   render=False)
        dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        p = policy.get_weights()
        print('mu: ', p)
        print('Reward at iteration ', i, ': ', np.mean(J))

    print('Press a button to visualize the prey-predator environment...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit,
               n_episodes_test, alg_params, policy_params):
    print(alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = alg(mdp.info, policy, critic_params, **alg_params)

    core = Core(agent, mdp)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        tqdm.write('END OF EPOCH ' + str(it))
        tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
        tqdm.write('#' * 98)

    print('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))
    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)
    agent = SARSA(pi, mdp.info, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([1.16106307e-03, 2.78128389e-01, 1.66771817e+00,
                       3.09031544e-01, 1.19725152e-01, 9.84770902e-01,
                       1.06111661e-02, 2.05891132e+00, 2.28767925e+00,
                       4.23911583e-01])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0
    assert max_J == 0.0011610630703530948
    assert mean_J == 0.0005805315351765474
    assert n_episodes == 2
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def experiment(alg, n_runs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = .1 * np.eye(1)
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(policy, mdp.info, agent_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):  # Python 2 xrange/print replaced for Python 3
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(dist, policy, mdp.info, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu: ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(n_epochs, n_iteration, n_ep_per_fit, n_eval_run):
    np.random.seed()

    # MDP
    mdp = SegwayLinearMotion()

    input_dim = mdp.info.observation_space.shape[0]

    mu = np.zeros(input_dim)
    sigma = 2e-0 * np.ones(input_dim)
    policy = SegwayControlPolicy(mu)
    dist = GaussianDiagonalDistribution(mu, sigma)

    beta = 2e-3
    agent = RWR(dist, policy, mdp.info, beta)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=n_eval_run, render=False)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start ', np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iteration * n_ep_per_fit,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        dataset_eval = core.evaluate(n_episodes=n_eval_run, render=False)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)

        p = dist.get_parameters()

        print('mu: ', p[:input_dim])
        print('sigma: ', p[input_dim:])
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    use_cuda = torch.cuda.is_available()

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': .001}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params,
                batch_size, initial_replay_size, max_replay_size,
                tau, critic_params, actor_params, actor_optimizer)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma)
        print('J: ', np.mean(J))

    print('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer. Only '
                              'used in rmspropcentered.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from '
                              'the gradient momentum in rmspropcentered.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                              'DQN, ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble "
                              "for Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of learning steps before each update '
                              'of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of learning steps before each '
                              'evaluation. This number represents an epoch.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of learning steps before each fit of '
                              'the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=1000000,
                         help='Number of steps until the exploration rate '
                              'stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at '
                              'the beginning of the episodes. The minimum '
                              'number is history_length. This number is 30 '
                              'in the DQN Deepmind paper, but they consider '
                              'the first 30 frames without frame skipping.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the '
                                'progress bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to '
                                'be run in debug mode.')

    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            name='test',
            load_path=args.load_path,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon}
        )

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params,
                    **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run

        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon}
        )

        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, batch_size, initial_replay_size, max_replay_size,
                warmup_transitions, tau, lr_alpha,
                actor_mu_params, actor_sigma_params, actor_optimizer,
                critic_params, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma)
        print('J: ', np.mean(J))

    print('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9,
                              features=features)

# Algorithm
collect_dataset = CollectDataset()
callbacks = [collect_dataset]
core = Core(agent, mdp, callbacks=callbacks)

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)

# Evaluate
core.evaluate(n_episodes=1, render=True)
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high
                                         + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
algorithm_params = {'learning_rate': learning_rate,
                    'lambda': .9}
fit_params = dict()
agent_params = {'approximator_params': approximator_params,
                'algorithm_params': algorithm_params,
                'fit_params': fit_params}
agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info, agent_params,
                              features)

# Algorithm
collect_dataset = CollectDataset()
callbacks = [collect_dataset]
core = Core(agent, mdp, callbacks=callbacks)

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)

# Evaluate
core.evaluate(n_episodes=1, render=True)
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from '
                              'the gradient momentum in rmspropcentered and '
                              'rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered and '
                              'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                              'DQN, ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble "
                              "for Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each '
                              'update of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each '
                              'evaluation. An epoch ends after this number '
                              'of steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit '
                              'of the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=1000000,
                         help='Number of collected samples until the '
                              'exploration rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each '
                              'evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at '
                              'the beginning of the episodes. The minimum '
                              'number is history_length. This number is '
                              'reported to be 30 in the DQN Deepmind paper '
                              'but, since they consider the first 30 frames '
                              'without frame skipping and that the number of '
                              'skipped frames is generally 4, we set it to '
                              '8.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--device', type=int, default=None,
                           help='ID of the GPU device to use. If None, CPU '
                                'is used.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the '
                                'progress bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to '
                                'be run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            load_path=args.load_path,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )

        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params,
                    **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render, quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run

        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        pathlib.Path(folder_name).mkdir(parents=True)

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )

        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            target_update_frequency=target_update_frequency//train_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
def experiment(policy, name, alg_version):
    np.random.seed()

    # MDP
    if name == "Taxi":
        mdp = generate_taxi('../grid.txt')
        max_steps = 100000
        evaluation_frequency = 2000
        test_samples = 10000
    elif name == "NChain-v0":
        mdp = generate_chain(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "Loop":
        mdp = generate_loop(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "SixArms":
        mdp = generate_arms(horizon=1000)
        max_steps = 25000
        evaluation_frequency = 500
        test_samples = 10000
    elif name == "RiverSwim":
        mdp = generate_river(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    else:
        raise NotImplementedError

    # Policy
    # epsilon = ExponentialDecayParameter(value=1., decay_exp=.5,
    #                                     size=mdp.info.observation_space.size)
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=.2,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    scores = list()
    scores_train = list()

    # Train
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=False)
        dataset = collect_dataset.get()
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))

    # np.save(env + '/' + alg_version + '_scores.npy', scores)

    return scores_train, scores
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    collect_qs_callback = CollectQs(agent.approximator)
    callbacks = [collect_dataset]
    if collect_qs:
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)
        collect_dataset.clean()
        mdp.reset()

        # Evaluate
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
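# A minimal usage sketch (assumed entry point, not in the original file): it
# calls the experiment above through the plain Q-learning branch, so the
# particle- and bootstrap-specific arguments are unused placeholders here.
# Argument values are illustrative; policy_dict and set_global_seeds are
# assumed to be defined at module level as used inside the function.
if __name__ == '__main__':
    train_scores, test_scores = experiment(
        algorithm='ql', name='Chain', update_mode=None, update_type=None,
        policy='eps-greedy', n_approximators=1, q_max=10., q_min=0.,
        lr_exp=.2, double=False, file_name='qs.npy', out_dir='./out',
        collect_qs=False, seed=0)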
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params,
                batch_size=batch_size, n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = (1,) + mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params,
                batch_size=batch_size, n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                history_length=1,
                target_update_frequency=target_update_frequency,
                max_no_op_actions=0, no_op_action_value=0,
                dtype=np.float32)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
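# A minimal usage sketch (assumed, not in the original script): a short run of
# the Acrobot DQN experiment defined above. The epoch and step counts are
# small, illustrative values.
if __name__ == '__main__':
    experiment(n_epochs=10, n_steps=1000, n_steps_test=2000)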
    # NOTE: this snippet starts mid-function; the Regressor construction that
    # precedes this closing argument is truncated in the source.
                            params=approximator_params)

    sigma = np.eye(2) * 1e-1
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=5)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = REINFORCE(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    print('Initial evaluation')
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):
        print('iteration', i)
        print('learn')
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        print('evaluate')
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    mdp.stop()
# Imports below the first two are added for completeness; the module paths
# assume the mushroom package layout used by the other examples in this file.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

from mushroom.algorithms.value import FQI
from mushroom.core import Core
from mushroom.environments import CarOnHill
from mushroom.policy import EpsGreedy
from mushroom.utils.dataset import compute_J
from mushroom.utils.parameters import Parameter

# MDP
mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(approximator, pi, mdp.info, n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)

core.learn(n_episodes=1000, n_episodes_per_fit=1000)

pi.set_epsilon(Parameter(0.))
initial_state = np.array([[-.5, 0.]])
dataset = core.evaluate(initial_states=initial_state)

print(compute_J(dataset, gamma=mdp.info.gamma))