def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
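# Illustrative invocation of the LQR distribution-based experiment above,
# assuming a black-box optimizer such as REPS with constructor
# alg(distribution, policy, mdp_info, eps); all numeric values are
# placeholders, not tuned settings.
if __name__ == '__main__':
    experiment(REPS, dict(eps=0.5), n_epochs=10, fit_per_run=10,
               ep_per_run=100)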
def flat_experiment(mdp, agent, n_epochs, n_iterations,
                    ep_per_iteration, ep_per_eval):
    np.random.seed()

    J_list = list()
    L_list = list()

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_iteration,
                   n_episodes_per_fit=ep_per_iteration, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))
        # print('J', n, ':', J_list[-1])

    return J_list, L_list
def ghavamzadeh_experiment(mdp, agent_plus, agent_cross, agent_high,
                           n_epochs, n_episodes, ep_per_eval,
                           ep_per_iteration_low):
    np.random.seed()

    computational_graph, control_blockH = build_ghavamzadeh_graph(
        mdp, agent_plus, agent_cross, agent_high, ep_per_iteration_low)
    core = HierarchicalCore(computational_graph)

    J_list = list()
    L_list = list()

    epsilon_update = EpsilonUpdate(agent_high.policy)

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    return J_list, L_list
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
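# Minimal entry point for the chain Q-learning experiment above; the guard is
# only a usage sketch, not part of the original snippet.
if __name__ == '__main__':
    experiment()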
def discretized_experiment(mdp, agent, n_actions, n_epochs, n_episodes,
                           ep_per_eval, display, print_j, quiet):
    np.random.seed()

    computational_graph = build_computational_graph_discretized(
        mdp, agent, n_actions)
    core = HierarchicalCore(computational_graph)

    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    if print_j:
        print('Reward at start :', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch ', n, ':', J_list[-1])

        if display:
            core.evaluate(n_episodes=1, render=True)

    return J_list, L_list
def hierarchical_experiment(mdp, agent_low, agent_high, n_epochs, n_episodes,
                            ep_per_fit_low, ep_per_fit_high, ep_per_eval):
    np.random.seed()

    computational_graph, control_block_h = build_computational_graph(
        mdp, agent_low, agent_high, ep_per_fit_low, ep_per_fit_high)
    core = HierarchicalCore(computational_graph)

    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        if n == 2:
            control_block_h.unset_mask()
        core.learn(n_episodes=n_episodes, skip=True, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

    return J_list, L_list
def two_level_ghavamzade_hierarchical_experiment(
        mdp, agent_l, agent_h, n_epochs, n_iterations, ep_per_epoch_train,
        ep_per_epoch_eval, ep_per_fit_low, ep_per_fit_high):
    np.random.seed()

    computational_graph, control_block_h = build_computational_graph(
        mdp, agent_l, agent_h, ep_per_fit_low, ep_per_fit_high)
    core = HierarchicalCore(computational_graph)

    J_list = list()

    dataset = core.evaluate(n_episodes=ep_per_epoch_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    print('J at start: ', np.mean(J))
    print('Mean gates passed: ', count_gates(dataset))

    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_epoch_train, skip=True,
                   quiet=False)
        dataset = core.evaluate(n_episodes=ep_per_epoch_eval, quiet=True,
                                render=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        print('J at iteration ', n, ': ', np.mean(J))
        print('Mean gates passed: ', count_gates(dataset))

    return J_list
def experiment(alg, params, experiment_params, subdir, i):
    np.random.seed()

    # n_runs, n_iterations and ep_per_run are assumed to be provided through
    # experiment_params, which the original snippet received but never read.
    n_runs = experiment_params['n_runs']
    n_iterations = experiment_params['n_iterations']
    ep_per_run = experiment_params['ep_per_run']

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=np.float)
    high = np.array(high, dtype=np.float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)
    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    # sigma = np.array([[1e-4]])
    std = np.array([3e-2])
    policy = DiagonalGaussianPolicy(mu=approximator, std=std)
    # policy = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    agent = alg(policy, mdp.info, features=phi, **params)

    # Train
    parameter_dataset = CollectPolicyParameter(policy)
    core = Core(agent, mdp, callbacks=[parameter_dataset])

    dataset_eval = list()
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_runs):
        print('ITERATION :', n)
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(i))
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset_file', parameter_dataset)
def experiment(alg, n_runs, n_iterations, ep_per_run, use_tensorflow):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    if use_tensorflow:
        tensor_list = gaussian_tensor.generate(
            [3, 3, 6, 2],
            [[0., 150.], [0., 150.], [-np.pi, np.pi],
             [-np.pi / 12, np.pi / 12]])
        phi = Features(tensor_list=tensor_list, name='phi',
                       input_dim=mdp.info.observation_space.shape[0])
    else:
        basis = GaussianRBF.generate([3, 3, 6, 2],
                                     [[0., 150.], [0., 150.],
                                      [-np.pi, np.pi],
                                      [-np.pi / 12, np.pi / 12]])
        phi = Features(basis_list=basis)

    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = np.array([[.05]])
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
def experiment(alg, params, subdir, exp_no):
    np.random.seed()

    # ep_per_run, n_runs and n_iterations are used below but not passed in:
    # they are expected to be defined at module level.

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=np.float)
    high = np.array(high, dtype=np.float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)
    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator_params = dict(input_dim=input_shape)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    dataset_eval = list()
    core = Core(agent, mdp)
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(exp_no))
    np.save(subdir + str(exp_no) + '/dataset_eval_file', dataset_eval)
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=np.float)
    high = np.array(high, dtype=np.float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)
    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = FQI(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(approximator, pi, mdp.info,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    # Test
    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return np.mean(compute_J(dataset, mdp.info.gamma))
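# Illustrative use of the helper above, assuming FQI as the batch algorithm
# (constructed as alg(approximator, pi, mdp_info, approximator_params=...,
# **alg_params)); the iteration count is a placeholder.
if __name__ == '__main__':
    print('Mean J:', learn(FQI, dict(n_iterations=10)))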
def experiment(n_epochs, n_steps, n_eval_episodes):
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Agent
    n_tilings = 10
    alpha_theta = ExponentialDecayParameter(1, decay_exp=1.0)
    alpha_omega = ExponentialDecayParameter(1.5 / n_tilings, decay_exp=2 / 3)
    alpha_v = ExponentialDecayParameter(1 / n_tilings, decay_exp=2 / 3)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-3 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=n_eval_episodes)
    J = compute_J(dataset_eval, gamma=1.0)
    print('Total Reward per episode at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset_eval = core.evaluate(n_episodes=n_eval_episodes, render=False)
        J = compute_J(dataset_eval, gamma=1.0)
        print('Total Reward per episode at iteration ' + str(i) + ': ' +
              str(np.mean(J)))
def experiment(mdp, agent_high, agent_low, n_epochs, n_episodes, ep_per_eval,
               ep_per_fit_low, display, print_j, quiet):
    np.random.seed()

    dataset_callback = CollectDataset()

    computational_graph = build_computational_graph(mdp, agent_low,
                                                    agent_high,
                                                    ep_per_fit_low,
                                                    [dataset_callback])
    core = HierarchicalCore(computational_graph)

    J_list = list()
    L_list = list()
    J_low_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    if print_j:
        print('Reward at start :', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)

        ll_dataset = dataset_callback.get()
        dataset_callback.clean()
        J_low = compute_J(ll_dataset, mdp.info.gamma)
        J_low_list.append(np.mean(J_low))
        if print_j:
            print('Low level reward at epoch', n, ':', np.mean(J_low))

        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch ', n, ':', J_list[-1])

        if display:
            core.evaluate(n_episodes=1, render=True)

    return J_list, L_list, J_low_list
def compute_mean_J(dataset_eval, n_runs, eval_run, gamma):
    J_runs_eps = compute_J(dataset_eval, gamma)
    J_avg = np.zeros(n_runs + 1)
    for i in range(n_runs + 1):
        J_avg[i] = np.mean(J_runs_eps[eval_run * i:eval_run * i + eval_run],
                           axis=0)

    return J_avg
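# Illustrative call of compute_mean_J, assuming dataset_eval concatenates
# eval_run evaluation episodes for the initial evaluation plus one block per
# learning run (as in the ShipSteering experiments above); the sizes are
# placeholders:
#
#     J_avg = compute_mean_J(dataset_eval, n_runs=25, eval_run=10, gamma=0.99)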
def fit(self, dataset):
    Jep = compute_J(dataset, self.mdp_info.gamma)

    Jep = np.array(Jep)
    theta = np.array(self._theta_list)

    self._update(Jep, theta)

    self._theta_list = list()
def experiment(n_epochs, ep_per_epoch_train, ep_per_epoch_eval, n_iterations):
    np.random.seed()

    # MDP
    mdp = PreyPredator()

    # Features
    basis = PolynomialBasis.generate(1, mdp.info.observation_space.shape[0])
    phi = Features(basis_list=basis[1:])

    approximator = Regressor(LinearApproximator, input_shape=(phi.size,),
                             output_shape=mdp.info.action_space.shape)

    sigma = 1e-2 * np.eye(mdp.info.action_space.shape[0])
    policy = GaussianPolicy(approximator, sigma)

    lr = Parameter(1e-5)
    # agent = GPOMDP(policy, mdp.info, lr, phi)
    agent = KeyboardAgent()

    # Train
    core = Core(agent, mdp)
    dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    print('Reward at start: ', np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=ep_per_epoch_train,
                   n_episodes_per_fit=ep_per_epoch_train // n_iterations,
                   render=False)
        dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        p = policy.get_weights()
        print('mu: ', p)
        print('Reward at iteration ', i, ': ', np.mean(J))

    print('Press a button to visualize the prey-predator environment...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_episodes_test, alg_params, policy_params):
    print(alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = alg(mdp.info, policy, critic_params, **alg_params)

    core = Core(agent, mdp)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        tqdm.write('END OF EPOCH ' + str(it))
        tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
        tqdm.write(
            '##################################################################################################')

    print('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
def hierarchical_experiment(mdp, agent_l, agent_m1, agent_m2, agent_m3,
                            agent_m4, agent_h, n_epochs, n_iterations,
                            ep_per_epoch_train, ep_per_epoch_eval,
                            ep_per_fit_low, ep_per_fit_mid):
    np.random.seed()

    computational_graph, control_block_h = build_computational_graph(
        mdp, agent_l, agent_m1, agent_m2, agent_m3, agent_m4, agent_h,
        ep_per_fit_low, ep_per_fit_mid)
    core = HierarchicalCore(computational_graph)

    J_list = list()

    dataset = core.evaluate(n_episodes=ep_per_epoch_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    print('J at start: ', np.mean(J))
    print('Mean gates passed: ', count_gates(dataset))

    for n in range(n_epochs):
        curr_learning_rate = agent_h.alpha
        agent_h.alpha = Parameter(value=0.0)
        core.learn(n_episodes=n_iterations * ep_per_epoch_train, skip=True,
                   quiet=False)
        dataset = core.evaluate(n_episodes=ep_per_epoch_eval, quiet=True,
                                render=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        print('J at iteration ', n, ': ', np.mean(J))
        print('Mean gates passed: ', count_gates(dataset))
        print('Policy Parameters M1', agent_m1.policy.get_weights())
        print('Policy Parameters M2', agent_m2.policy.get_weights())
        print('Policy Parameters M3', agent_m3.policy.get_weights())
        print('Policy Parameters M4', agent_m4.policy.get_weights())
        agent_h.alpha = curr_learning_rate

    return J_list
def experiment(alg, n_runs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = .1 * np.eye(1)
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(policy, mdp.info, agent_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
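# Illustrative call of the LQR policy-gradient experiment above, assuming a
# vanilla policy-gradient algorithm such as GPOMDP constructed as
# alg(policy, mdp_info, learning_rate); the epoch and episode counts are
# placeholders.
if __name__ == '__main__':
    experiment(GPOMDP, n_epochs=10, n_iterations=10, ep_per_run=100)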
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)
    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
def experiment(n_epochs, n_iteration, n_ep_per_fit, n_eval_run):
    np.random.seed()

    # MDP
    mdp = SegwayLinearMotion()

    input_dim = mdp.info.observation_space.shape[0]

    mu = np.zeros(input_dim)
    sigma = 2e-0 * np.ones(input_dim)
    policy = SegwayControlPolicy(mu)
    dist = GaussianDiagonalDistribution(mu, sigma)
    beta = 2e-3
    agent = RWR(dist, policy, mdp.info, beta)

    # Train
    core = Core(agent, mdp)

    dataset_eval = core.evaluate(n_episodes=n_eval_run, render=False)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start ', np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iteration * n_ep_per_fit,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        dataset_eval = core.evaluate(n_episodes=n_eval_run, render=False)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)

        p = dist.get_parameters()
        print('mu: ', p[:input_dim])
        print('sigma: ', p[input_dim:])
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def _print_fit_info(self, dataset, x, v_target, old_pol_dist):
    if not self._quiet:
        logging_verr = []
        torch_v_targets = torch.tensor(v_target, dtype=torch.float)
        for idx in range(len(self._V)):
            v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float)
            v_err = F.mse_loss(v_pred, torch_v_targets)
            logging_verr.append(v_err.item())

        logging_ent = self.policy.entropy(x)
        new_pol_dist = self.policy.distribution(x)
        logging_kl = torch.mean(torch.distributions.kl.kl_divergence(
            new_pol_dist, old_pol_dist))
        avg_rwd = np.mean(compute_J(dataset))
        tqdm.write("Iterations Results:\n\trewards {} vf_loss {}\n\tentropy "
                   "{} kl {}".format(avg_rwd, logging_verr, logging_ent,
                                     logging_kl))
        tqdm.write(
            '--------------------------------------------------------------------------------------------------')
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def experiment():
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    rbfs = GaussianRBF.generate(10, [10, 10],
                                mdp.info.observation_space.low,
                                mdp.info.observation_space.high)
    features = Features(basis_list=rbfs)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = dict(n_iterations=10)  # n_iterations value is assumed
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = LSPI(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=20)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=20)

    return np.mean(compute_J(dataset, 1.))
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(dist, policy, mdp.info, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()
        print('mu: ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
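# Illustrative invocation of the Segway black-box experiment above, assuming
# RWR with constructor alg(distribution, policy, mdp_info, beta); numeric
# values are placeholders.
if __name__ == '__main__':
    experiment(RWR, dict(beta=0.01), n_epochs=25, n_episodes=100,
               n_ep_per_fit=25)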
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

# The remaining imports assume the package layout used by the other snippets.
from mushroom.algorithms.value import FQI
from mushroom.core import Core
from mushroom.environments import CarOnHill
from mushroom.policy import EpsGreedy
from mushroom.utils.dataset import compute_J
from mushroom.utils.parameters import Parameter

mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(approximator, pi, mdp.info, n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)

core.learn(n_episodes=1000, n_episodes_per_fit=1000)

pi.set_epsilon(Parameter(0.))
initial_state = np.array([[-.5, 0.]])
dataset = core.evaluate(initial_states=initial_state)

print(compute_J(dataset, gamma=mdp.info.gamma))
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = (1,) + mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params,
                batch_size=batch_size, n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size, history_length=1,
                target_update_frequency=target_update_frequency,
                max_no_op_actions=0, no_op_action_value=0, dtype=np.float32)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)
    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high
                                         + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)