import numpy as np

# NOTE: the import paths below are assumptions; adjust them to the actual
# layout of the mushroom library and of the local hierarchical package
# (HierarchicalCore, the graph builders, EpsilonUpdate, ...).
from mushroom.core import Core
from mushroom.utils.dataset import compute_J, episodes_length


def flat_experiment(mdp, agent, n_epochs, n_iterations, ep_per_iteration,
                    ep_per_eval):
    np.random.seed()

    J_list = list()
    L_list = list()

    core = Core(agent, mdp)

    # Evaluate the initial policy before any learning
    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        # One epoch = n_iterations fits, each on ep_per_iteration episodes
        core.learn(n_episodes=n_iterations * ep_per_iteration,
                   n_episodes_per_fit=ep_per_iteration, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

    return J_list, L_list
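
# Hypothetical driver for flat_experiment: `build_mdp` and `build_agent`
# are placeholders for whatever this repository constructs elsewhere; only
# the seed-averaging logic is the point of this sketch.
def run_flat(n_runs=10):
    J_runs = list()
    for _ in range(n_runs):
        mdp = build_mdp()         # placeholder, not part of this repo's API
        agent = build_agent(mdp)  # placeholder, not part of this repo's API
        J_list, _ = flat_experiment(mdp, agent, n_epochs=25, n_iterations=10,
                                    ep_per_iteration=20, ep_per_eval=50)
        J_runs.append(J_list)

    J_runs = np.array(J_runs)

    # Mean and standard deviation of the discounted return, per epoch
    return J_runs.mean(axis=0), J_runs.std(axis=0)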

def hierarchical_experiment(mdp, agent_low, agent_high, n_epochs, n_episodes,
                            ep_per_fit_low, ep_per_fit_high, ep_per_eval):
    np.random.seed()

    computational_graph, control_block_h = build_computational_graph(
        mdp, agent_low, agent_high, ep_per_fit_low, ep_per_fit_high)

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    # Evaluate the initial hierarchical policy before any learning
    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        # Keep the high-level control block masked for the first two
        # epochs, then let its agent act
        if n == 2:
            control_block_h.unset_mask()
        core.learn(n_episodes=n_episodes, skip=True, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

    return J_list, L_list
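
# The mask above freezes the high-level block while the low level is
# pre-trained. A minimal sketch of that idea, assuming a block with a
# boolean `mask` flag and a fixed fallback output -- this is NOT the
# actual ControlBlock of this repository:
class MaskableBlockSketch:
    def __init__(self, agent, default_output):
        self.agent = agent
        self.mask = True
        self._default = default_output

    def unset_mask(self):
        # From this point on, the wrapped agent chooses the output
        self.mask = False

    def __call__(self, state):
        return self._default if self.mask else self.agent.draw_action(state)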

def ghavamzadeh_experiment(mdp, agent_plus, agent_cross, agent_high, n_epochs,
                           n_episodes, ep_per_eval, ep_per_iteration_low):
    np.random.seed()

    computational_graph, control_block_h = build_ghavamzadeh_graph(
        mdp, agent_plus, agent_cross, agent_high, ep_per_iteration_low)

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    epsilon_update = EpsilonUpdate(agent_high.policy)

    # Evaluate the initial hierarchical policy before any learning
    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        # After the fifth epoch, attach the callback that updates the
        # high-level exploration rate
        if n == 4:
            control_block_h.callbacks = [epsilon_update]

    return J_list, L_list
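
# EpsilonUpdate is only attached after epoch 4, so high-level exploration
# stays fixed during the warm-up. A plausible sketch of such a callback,
# assuming the policy exposes set_epsilon() as mushroom's EpsGreedy does;
# the decay schedule here is an assumption, not this repo's implementation:
from mushroom.utils.parameters import Parameter


class EpsilonUpdateSketch:
    def __init__(self, policy, decay=.99, min_epsilon=.1):
        self._policy = policy
        self._epsilon = 1.
        self._decay = decay
        self._min = min_epsilon

    def __call__(self, dataset):
        # Called with the dataset collected by the block; only the firing
        # matters here, the data itself is unused
        self._epsilon = max(self._min, self._epsilon * self._decay)
        self._policy.set_epsilon(Parameter(self._epsilon))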

def discretized_experiment(mdp, agent, n_actions, n_epochs, n_episodes,
                           ep_per_eval, display, print_j, quiet):
    np.random.seed()

    computational_graph = build_computational_graph_discretized(
        mdp, agent, n_actions)

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    if print_j:
        print('Reward at start:', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch', n, ':', J_list[-1])

    # Optionally render one episode once learning is done
    if display:
        core.evaluate(n_episodes=1, render=True)

    return J_list, L_list
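
# Hypothetical invocation (kept as a comment because `mdp` and `agent`
# are built elsewhere in this repository; the numbers are placeholders):
#
#   J_list, L_list = discretized_experiment(mdp, agent, n_actions=8,
#                                           n_epochs=25, n_episodes=200,
#                                           ep_per_eval=50, display=False,
#                                           print_j=True, quiet=True)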

def experiment(mdp, agent_high, agent_low, n_epochs, n_episodes, ep_per_eval,
               ep_per_fit_low, display, print_j, quiet):
    np.random.seed()

    # Collect the low-level transitions seen during learning
    dataset_callback = CollectDataset()

    computational_graph = build_computational_graph(mdp, agent_low,
                                                    agent_high,
                                                    ep_per_fit_low,
                                                    [dataset_callback])

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()
    J_low_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    if print_j:
        print('Reward at start:', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)

        # Discounted return of the low-level agent, collected while learning
        ll_dataset = dataset_callback.get()
        dataset_callback.clean()
        J_low = compute_J(ll_dataset, mdp.info.gamma)
        J_low_list.append(np.mean(J_low))
        if print_j:
            print('Low level reward at epoch', n, ':', np.mean(J_low))

        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch', n, ':', J_list[-1])

    # Optionally render one episode once learning is done
    if display:
        core.evaluate(n_episodes=1, render=True)

    return J_list, L_list, J_low_list
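
# A sketch of how the two curves this function returns could be compared;
# matplotlib usage and axis labels are assumptions, not part of this repo:
def plot_learning_curves(J_list, J_low_list):
    import matplotlib.pyplot as plt

    fig, (ax_high, ax_low) = plt.subplots(2, 1)

    # The high-level curve has one extra point: the pre-learning evaluation
    ax_high.plot(J_list)
    ax_high.set_ylabel('high-level J')

    ax_low.plot(J_low_list)
    ax_low.set_ylabel('low-level J')
    ax_low.set_xlabel('epoch')

    plt.show()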

def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=fit_params,
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
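
# Minimal entry point for the snippet above:
if __name__ == '__main__':
    steps = experiment()
    print('Mean episode length at test time:', steps)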