Example #1
def experiment2():
    np.random.seed(3)
    print('mushroom     :')

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)
    dataset = collect_dataset.get()
    return agent.Q.table
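
The policy above is an epsilon-greedy over the agent's tabular Q with a constant Parameter(value=.15). A minimal sketch of the selection rule that EpsGreedy stands for (plain NumPy, not the library class):

import numpy as np

def eps_greedy_action(q_row, epsilon, rng):
    # With probability epsilon pick a random action, otherwise the greedy one.
    if rng.uniform() < epsilon:
        return int(rng.integers(len(q_row)))
    return int(np.argmax(q_row))

rng = np.random.default_rng(0)
q_row = np.array([0.1, 0.7, 0.2])
actions = [eps_greedy_action(q_row, epsilon=.15, rng=rng) for _ in range(1000)]
print(np.bincount(actions, minlength=3) / len(actions))  # mass concentrates on action 1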
Example #2
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
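
Both epsilon and the learning rate here are ExponentialDecayParameter instances indexed by visit counts. A minimal sketch of the decay rule such a parameter is assumed to implement (value / n(idx)**decay_exp; the library's bookkeeping may differ):

import numpy as np

class DecayingRate:
    # Per-index rate decaying as value / n(idx) ** decay_exp, where n(idx)
    # counts how many times that index has been queried so far.
    def __init__(self, value, decay_exp, size):
        self._value = value
        self._decay_exp = decay_exp
        self._n = np.zeros(size)

    def __call__(self, *idx):
        self._n[idx] += 1
        return self._value / self._n[idx] ** self._decay_exp

alpha = DecayingRate(value=1., decay_exp=.5, size=(3, 2))
print([round(alpha(0, 1), 3) for _ in range(4)])  # [1.0, 0.707, 0.577, 0.5]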
Example #3
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP

    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp, size=mdp.info.size)

    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q, mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #4
def experiment(policy, value):
    np.random.seed(45)

    # MDP
    mdp = generate_taxi('tests/taxi/grid.txt', rew=(0, 1, 5))

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = SARSA(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 2000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
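
The return value sums column 2 of the collected dataset, i.e. the reward of each transition, and divides by the number of steps. A self-contained sketch of that computation, assuming the (state, action, reward, next_state, absorbing, last) sample layout these examples index into:

import numpy as np

dataset = [
    (0, 1, 0.0, 1, False, False),
    (1, 1, 1.0, 2, False, False),
    (2, 0, 5.0, 2, True, True),
]
rewards = np.array(dataset, dtype=object)[:, 2].astype(float)
print(rewards.sum() / len(dataset))  # average reward per step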
Example #5
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
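
compute_J(dataset, gamma) turns the collected transitions into per-episode discounted returns. A sketch of that computation under the same sample layout, with episode boundaries taken from the 'last' flag at index 5 (the library implementation may differ in details):

def discounted_returns(dataset, gamma):
    Js, J, t = [], 0., 0
    for step in dataset:
        J += gamma ** t * step[2]   # step[2] is the reward
        t += 1
        if step[5]:                 # 'last' flag closes the episode
            Js.append(J)
            J, t = 0., 0
    return Js

dataset = [(0, 0, 1., 1, False, False),
           (1, 0, 1., 2, False, True),
           (2, 0, 3., 3, False, True)]
print(discounted_returns(dataset, gamma=.9))  # [1.9, 3.0]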
Example #6
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=tol,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
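
CollectDataset and CollectMaxQ are callbacks handed to Core and queried afterwards with get() / get_values(), then reset with clean(). A minimal sketch of what such a callback could look like (illustrative only; Core is assumed to call it with the newly collected samples):

class DatasetCollector:
    def __init__(self):
        self._dataset = []

    def __call__(self, samples):
        # Assumed to be invoked by Core with the list of new samples.
        self._dataset += samples

    def get(self):
        return self._dataset

    def clean(self):
        self._dataset = []

cb = DatasetCollector()
cb([(0, 1, 1.0, 1, False, False)])
print(len(cb.get()))  # 1
cb.clean()
print(len(cb.get()))  # 0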
Example #7
def experiment(mdp, agent_high, agent_low, n_epochs, n_episodes, ep_per_eval,
               ep_per_fit_low, display, print_j, quiet):
    np.random.seed()

    dataset_callback = CollectDataset()

    computational_graph = build_computational_graph(mdp, agent_low, agent_high,
                                                    ep_per_fit_low,
                                                    [dataset_callback])

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    J_low_list = list()
    L = episodes_length(dataset)
    L_list.append(np.mean(L))
    if print_j:
        print('Reward at start :', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)

        ll_dataset = dataset_callback.get()
        dataset_callback.clean()
        J_low = compute_J(ll_dataset, mdp.info.gamma)
        J_low_list.append(np.mean(J_low))
        if print_j:
            print('Low level reward at epoch', n, ':', np.mean(J_low))

        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch ', n, ':', J_list[-1])

        if display:
            core.evaluate(n_episodes=1, render=True)

    return J_list, L_list, J_low_list
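
episodes_length(dataset) is used above to track the mean episode length of the evaluation runs. A sketch of that helper under the same sample layout (the 'last' flag at index 5 closes an episode):

def episode_lengths(dataset):
    lengths, steps = [], 0
    for step in dataset:
        steps += 1
        if step[5]:
            lengths.append(steps)
            steps = 0
    return lengths

dataset = [(0, 0, 0., 1, False, False),
           (1, 0, 0., 2, False, True),
           (2, 0, 0., 3, False, True)]
print(episode_lengths(dataset))  # [2, 1]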
Example #8
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(dist, policy, mdp.info, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu:    ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
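
The algorithm passed in as alg is presumably a distribution-based policy search: it samples the weights of the deterministic linear policy from the diagonal Gaussian built above. A sketch of that sampling step with illustrative names (not the library's API):

import numpy as np

rng = np.random.default_rng(0)
n_weights = 3
mu = np.zeros(n_weights)          # distribution mean over policy weights
sigma = 2.0 * np.ones(n_weights)  # per-weight standard deviation

def sample_weights():
    return rng.normal(mu, sigma)

def linear_action(weights, state):
    return float(weights @ state)  # deterministic linear policy

w = sample_weights()
print(linear_action(w, np.array([0.1, -0.2, 0.05])))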
Example #9
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10], mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=0)

    dataset = collect_dataset.get()
    return np.mean(compute_J(dataset, 1.))
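
Tiles.generate(10, [10, 10], low, high) builds ten 10x10 tilings over the observation space, and Features turns the active tiles into a sparse feature vector. A rough sketch of the idea behind a single tiling (not the library implementation; shifting the offset between tilings is what smooths the coarse coding):

import numpy as np

def tile_index(state, low, high, n_tiles, offset=0.0):
    # Index of the active cell in one n_tiles x n_tiles tiling,
    # shifted by `offset` in units of the normalized range.
    ratio = (state - low) / (high - low)
    cells = np.clip(((ratio + offset) * n_tiles).astype(int), 0, n_tiles - 1)
    return int(cells[0] * n_tiles + cells[1])

low = np.array([-1.2, -0.07])   # MountainCar-like observation bounds
high = np.array([0.6, 0.07])
state = np.array([-0.5, 0.0])
print([tile_index(state, low, high, 10, off) for off in (0.0, 0.05)])  # [35, 45]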
Example #10
def experiment(policy, value):
    np.random.seed()

    # MDP
    mdp = generate_taxi('grid.txt')

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = SARSA(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 300000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
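
The SARSA agent fitted one step at a time applies the standard on-policy tabular update. A minimal sketch of that single update (library details such as dataset handling may differ):

import numpy as np

def sarsa_step(Q, s, a, r, s_next, a_next, alpha, gamma):
    td_target = r + gamma * Q[s_next, a_next]
    Q[s, a] += alpha * (td_target - Q[s, a])

Q = np.zeros((3, 2))
sarsa_step(Q, s=0, a=0, r=1.0, s_next=1, a_next=1, alpha=.15, gamma=.99)
print(Q[0, 0])  # 0.15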
Example #11
def experiment2():
    np.random.seed(3)
    print('mushroom     :')

    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Visualize
    dataset = collect_dataset.get()
    VisualizeControlBlock(dataset)
    return agent.Q.table
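
QLearning fitted with n_steps_per_fit=1 applies one tabular Q-learning update per environment step. The rule, as a standalone sketch:

import numpy as np

def q_learning_step(Q, s, a, r, s_next, alpha, gamma):
    td_target = r + gamma * np.max(Q[s_next])
    Q[s, a] += alpha * (td_target - Q[s, a])

Q = np.zeros((5, 2))
q_learning_step(Q, s=0, a=1, r=1.0, s_next=1, alpha=.2, gamma=.9)
print(Q[0, 1])  # 0.2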
Example #12
def experiment(policy, name, alg_version):
    np.random.seed()

    # MDP

    if name == "Taxi":
        mdp = generate_taxi('../grid.txt')
        max_steps = 100000
        evaluation_frequency = 2000
        test_samples = 10000
    elif name == "NChain-v0":
        mdp = generate_chain(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "Loop":
        mdp = generate_loop(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "SixArms":
        mdp = generate_arms(horizon=1000)
        max_steps = 25000
        evaluation_frequency = 500
        test_samples = 10000
    elif name == "RiverSwim":
        mdp = generate_river(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    else:
        raise NotImplementedError
    # Policy
    # epsilon = ExponentialDecayParameter(value=1., decay_exp=.5,
    #                                     size=mdp.info.observation_space.size)
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=.2,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)
    scores = list()
    scores_train = list()
    # Train
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency,
                   n_steps_per_fit=1,
                   quiet=False)
        dataset = collect_dataset.get()
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()
        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))
        #np.save(env + '/'+alg_version+'_scores.npy', scores)

    return scores_train, scores
Example #13
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings-1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
Example #14
tilings = Tiles.generate(n_tilings, [10, 10], mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size, ),
                           output_shape=(mdp.info.action_space.n, ),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
fit_params = dict()
agent_params = {
    'approximator_params': approximator_params,
    'algorithm_params': algorithm_params,
    'fit_params': fit_params
}
agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info, agent_params,
                              features)

# Algorithm
collect_dataset = CollectDataset()
callbacks = [collect_dataset]
core = Core(agent, mdp, callbacks=callbacks)

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)

# Evaluate
core.evaluate(n_episodes=1, render=True)
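
SARSALambdaContinuous wraps a LinearApproximator over the tile features, i.e. one weight vector per action with q(s, a) = w_a . phi(s). A small sketch of such a linear action-value model (illustrative names, without the eligibility traces the lambda variant adds):

import numpy as np

class LinearQ:
    def __init__(self, n_features, n_actions):
        self.w = np.zeros((n_actions, n_features))

    def predict(self, phi_s, action):
        return self.w[action] @ phi_s

    def update(self, phi_s, action, target, alpha):
        error = target - self.predict(phi_s, action)
        self.w[action] += alpha * error * phi_s

q = LinearQ(n_features=4, n_actions=3)
phi_s = np.array([1., 0., 0., 1.])
q.update(phi_s, action=2, target=1.0, alpha=.1)
print(q.predict(phi_s, 2))  # 0.2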
Example #15
def experiment():
    np.random.seed()

    # Model Block
    mdp = ShipSteeringMultiGate()

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)', phi=phi)

    # Function Block 2
    function_block2 = squarednormBlock(name='f2 (squared norm)')

    # Function Block 3
    function_block3 = addBlock(name='f3 (summation)')

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([38, 38])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([75, 75]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, sigma=sigma1)

    # Policy 2
    sigma2 = Parameter(value=.01)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(1, ),
                              output_shape=mdp.info.action_space.shape)
    pi2 = GaussianPolicy(mu=approximator2, sigma=sigma2)

    # Agent 1
    learning_rate = AdaptiveParameter(value=10)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, 150, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=50)
    agent1 = GPOMDP(policy=pi1,
                    mdp_info=mdp_info_agent1,
                    params=agent_params,
                    features=features)

    # Agent 2
    learning_rate = AdaptiveParameter(value=.001)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = GPOMDP(policy=pi2,
                    mdp_info=mdp_info_agent2,
                    params=agent_params,
                    features=None)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=5,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    dataset_callback = CollectDataset()
    parameter_callback2 = CollectPolicyParameter(pi2)
    control_block2 = ControlBlock(
        name='Control Block 2',
        agent=agent2,
        n_eps_per_fit=10,
        callbacks=[dataset_callback, parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, control_block1, control_block2, function_block1,
        function_block2, function_block3, reward_acc
    ]
    #order = [0, 1, 7, 2, 4, 5, 6, 3]
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    function_block3.add_input(reward_ph)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block3)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    #dataset_learn_visual = core.learn(n_episodes=2000)
    dataset_learn_visual = list()
    for n in range(4):
        dataset_learn = core.learn(n_episodes=500)
        last_ep_dataset = pick_last_ep(dataset_learn)
        dataset_learn_visual += last_ep_dataset
        del dataset_learn

    # Evaluate
    dataset_eval = core.evaluate(n_episodes=10)

    # Visualize
    low_level_dataset = dataset_callback.get()
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()
    visualize_policy_params(parameter_dataset1, parameter_dataset2)
    visualize_control_block(low_level_dataset, ep_count=20)
    visualize_ship_steering(dataset_learn_visual, name='learn', n_gates=4)

    visualize_ship_steering(dataset_eval, 'evaluate', n_gates=4)
    plt.show()

    return
Example #16
File: run.py Project: czgdp1807/wql
def experiment(algorithm, name, update_mode, update_type, policy, n_approximators, q_max, q_min,
               lr_exp, R, log_lr, r_max_m, delayed_m, delayed_epsilon, delta, debug, double,
               regret_test, a, b, mbie_C, value_iterations, tolerance, file_name, out_dir,
               collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)
    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../grid.txt', horizon=5000, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Gridworld':
        mdp = generate_gridworld(horizon=100, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.4
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.8
    elif name == 'ThreeArms':
        horizon = 100
        mdp = generate_three_arms(horizon=horizon, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    if regret_test:

        max_steps = int(args.max_steps_regret * 1e6)
        evaluation_frequency = max_steps // 100
        test_samples = 1000
        if debug:
            max_steps = 100000
            evaluation_frequency = max_steps // 100
            test_samples = 1000
        
    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn('QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'

        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                                      size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(value=1.5 * q_max, decay_exp=.5,
                                                      size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn('Bootstrapped QL available with only boot and weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=(q_max - q_min)/2,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'ucb']:
            warnings.warn('Particle QL available with only ucb and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R/(1-mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)

        epsilon_train = Parameter(0)
    elif algorithm == 'r-max':
        thr_1 = int(np.ceil((4 * mdp.info.size[0] * 1.0/(1-mdp.info.gamma) * R )**3))

        algorithm_params = dict(
            rmax=R,
            s_a_threshold=r_max_m
        )
        agent = RMaxAgent(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'mbie':
        algorithm_params = dict(
            rmax=R,
            C=mbie_C,
            value_iterations=value_iterations,
            tolerance=tolerance
        )
        agent = MBIE_EB(mdp.info, **algorithm_params)

        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        theoretic_m = delayed_m
        if regret_test:
            gamma = mdp.info.gamma
            Vmax = R / (1 - gamma)
            epsilon = args.delayed_ratio * Vmax
            delayed_epsilon = epsilon*(1-gamma)
            delta = 0.1
            S, A = mdp.info.size

            theoretic_m = (1 + gamma*Vmax)**2 / (2*delayed_epsilon**2) * np.log(3*S*A/delta * (1 + S*A/(delayed_epsilon*(1-gamma))))
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Vmax:{}".format(Vmax))
                print("Gamma:{}".format(mdp.info.gamma))
                print("Epsilon:{}".format(epsilon))
                #print("k:{}".format(k))
                print("m:{}".format(theoretic_m))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()
            def evaluate_policy(P, R, policy):

                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s,s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] * np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)

                return V
        algorithm_params = dict(
            R=R,
            m=theoretic_m,
            delta=delta,
            epsilon=delayed_epsilon,
            **algorithm_params)

        agent = DelayedQLearning(mdp.info, **algorithm_params)
        if regret_test:
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, args.freq_collection)
            if debug:
                print("Q:")
                print(agent.get_approximator()[:, :])
                print("Policy:")
                print(agent.get_policy())
                print("V:{}".format(evaluate_policy(mdp.p,mdp.r,agent.get_policy())))
                input()

        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'gaussian-ql':
        if policy not in ['weighted-gaussian', 'ucb']:
            warnings.warn('Particle QL available with only ucb and weighted policies!')
            policy = 'weighted-gaussian'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R/(1-mdp.info.gamma))
        else:
            pi = policy_dict[policy]()
        q_0 = (q_max - q_min) / 2
        sigma_0 = (q_max - q_min) / np.sqrt(12)
        C = 2 * R / (np.sqrt(2 * np.pi) * (1 - mdp.info.gamma) * sigma_0)
        sigma_lr = None
        if log_lr:
            sigma_lr = LogarithmicDecayParameter(value=1., C=C,
                                                 size=mdp.info.size)
        init_values = (q_0, sigma_0)
        if regret_test:
            sigma_lr = None
            gamma = mdp.info.gamma
            T = max_steps
            S, A = mdp.info.size
            a = (2 + gamma) / (2 * (1 - gamma))
            b = a - 1
            c = 1
            d = b
            q_max = R / (1 - gamma)
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            #first_fac = np.sqrt(b + T)
            #second_fac = np.sqrt(a * np.log(S*A*T / delta))
            #sigma2_factor = min(np.sqrt(b + T), np.sqrt(a * np.log(S*A*T / delta)))

            q_0 = q_max
            sigma1_0 = 0
            #sigma2_0 = (R + gamma * q_max) / (standard_bound * np.sqrt(c-1)) * sigma2_factor

            sigma2_0 = (gamma * q_max) / (c * standard_bound) * np.sqrt(a * np.log(S * A * T / delta))
            init_values = (q_0, sigma1_0, sigma2_0)
            learning_rate = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            learning_rate_sigma1 = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                        size=mdp.info.size)
            algorithm_params = dict(learning_rate=learning_rate,
                                    sigma_1_learning_rate=learning_rate_sigma1)

            sigma_lr = BetaParameter(c=c, d=d, size=mdp.info.size)
            def evaluate_policy(P, R, policy):

                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s,s1] = np.sum(policy[s, :] * P[s, :, s1])

                    R_pi[s] = np.sum(policy[s, :] * np.sum(P[s, :, :] * R[s, :, :],axis=-1))
                I = np.diag(np.ones(S))

                V = np.linalg.solve(I - gamma * P_pi, R_pi)
                return V
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Gamma:{}".format(mdp.info.gamma))
                print("mu0:{}".format(q_0))
                print("Sigma1_0:{}".format(sigma1_0))
                print("Sigma2_0:{}".format(sigma2_0))
                print("a:{}".format(a))
                print("b:{}".format(b))
                print("c:{}".format(c))
                print("d:{}".format(d))
                print("T:{}".format(T))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

        algorithm_params = dict(
            update_mode=update_mode,
            update_type=update_type,
            sigma_learning_rate=sigma_lr,
            init_values=init_values,
            delta=delta,
            q_max=q_max,
            **algorithm_params)
        if double and not regret_test:
            agent = GaussianDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = GaussianQLearning(pi, mdp.info, **algorithm_params)
        if regret_test:
            if debug:
                freq = 10
            else:
                freq = args.freq_collection
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, freq)
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p,mdp.r,agent.get_policy())))
            input()
        if policy == 'ucb':
            q = agent.approximator
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            def quantile_func(state):
                means = np.array(q.predict(state, idx=0))
                if regret_test:
                    sigmas1 = np.array(q.predict(state, idx=1))
                    sigmas2 = np.array(q.predict(state, idx=2))
                    sigmas = sigmas2
                    #print(sigmas1, sigmas2)
                else:
                    sigmas = np.array(q.predict(state, idx=1))
                out = sigmas * standard_bound + means
                return out

            def mu(state):
                q_list = q.predict(state, idx=0)
                means = np.array(q_list)

                return means
            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        if algorithm not in ['r-max']:
            collect_qs_callback = CollectQs(agent.approximator)
            callbacks += [collect_qs_callback]

    if regret_test:
        callbacks += [collect_vs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        if regret_test:
            collect_vs_callback.on()
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        #print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()
        if regret_test:
            vs = collect_vs_callback.get_values()
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print("Finished {} steps.".format(n_epoch * evaluation_frequency))
            np.save(out_dir + "/vs_" + algorithm+"_"+str(seed), vs)
            np.save(out_dir+"/scores_online" + str(seed), train_scores)
            collect_vs_callback.off()
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        s = mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        print('Evaluation #%d:%s ' %(n_epoch, scores))
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
            input()
        test_scores.append(scores)
        if regret_test:
            np.save(out_dir + "/scores_offline" + str(seed), test_scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
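
The evaluate_policy helpers defined inside the regret test solve the Bellman expectation equation exactly, V = (I - gamma * P_pi)^-1 R_pi, via np.linalg.solve. The same computation on a tiny standalone 2-state example:

import numpy as np

gamma = 0.9
P_pi = np.array([[0.8, 0.2],
                 [0.1, 0.9]])   # state-to-state transition matrix under pi
R_pi = np.array([1.0, 0.0])     # expected immediate reward per state under pi
V = np.linalg.solve(np.eye(2) - gamma * P_pi, R_pi)
print(V)  # exact state values of pi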
Example #17
def experiment(algorithm,
               name,
               update_mode,
               update_type,
               policy,
               n_approximators,
               q_max,
               q_min,
               lr_exp,
               file_name,
               out_dir,
               particles,
               R=1,
               m=1,
               collect_qs=False,
               seed=0):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'particle-ql':
        delta = 0.1
        if policy not in ['weighted', 'vpi', 'ucb']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                init_values=particles,
                                **algorithm_params)

        agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        if policy == 'ucb':
            q = agent.approximator
            quantiles = [
                i * 1. / (n_approximators - 1) for i in range(n_approximators)
            ]
            for p in range(n_approximators):
                if quantiles[p] >= 1 - delta:
                    particle_bound = p
                    break

            def quantile_func(state, quantile):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                out = np.zeros(qs.shape[1])
                out[:] = qs[particle_bound, :]
                return out

            def mu(state):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                return np.mean(qs, axis=0)

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        algorithm_params = dict(R=R, m=m, **algorithm_params)

        agent = DelayedQLearning(mdp.info, **algorithm_params)
        pi = agent

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        collect_qs_callback = CollectQs(agent.approximator)
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores
Example #18
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'

        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5, size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max,
                decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn(
                'Bootstrapped QL available with only boot and weighted policies!'
            )
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    collect_qs_callback = CollectQs(agent.approximator)
    callbacks = [collect_dataset]
    if collect_qs:
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1, quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)

        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)
    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)
    return train_scores, test_scores