Example #1
# Assumed imports for the mushroom RL library; exact module paths may vary by version.
import numpy as np

from mushroom.core import Core
from mushroom.environments import FiniteMDP
from mushroom.policy import EpsGreedy
from mushroom.utils.callbacks import CollectQ
from mushroom.utils.parameters import Parameter, ExponentialDecayParameter


def experiment(algorithm_class, decay_exp):
    # Reseed from OS entropy so independent (e.g. parallel) runs differ.
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy: epsilon = 1 keeps the behaviour policy fully exploratory
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent: one learning rate per state-action pair, decaying as n^(-decay_exp)
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Callbacks and core
    collect_Q = CollectQ(agent.approximator)  # record the Q table during learning
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
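
Below is a hedged sketch of how this function is typically driven: the caller picks a TD algorithm class and a decay exponent, runs several independent trials, and averages the collected Q tables. Everything in it is an illustrative assumption, not part of the snippet above: the QLearning class, the joblib parallelism, the decay exponent .51, the run count, and the output file name.

# Usage sketch; all names below are illustrative assumptions, not from the snippet.
from joblib import Parallel, delayed

from mushroom.algorithms.value import QLearning

if __name__ == '__main__':
    n_runs = 10  # hypothetical number of independent runs
    # Run independent trials in parallel, then average the per-step Q tables.
    Qs = Parallel(n_jobs=-1)(
        delayed(experiment)(QLearning, .51) for _ in range(n_runs))
    np.save('q_mean.npy', np.mean(Qs, axis=0))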
Example #2
# Assumed imports, as above; RQLearning and the variance-based parameters are taken
# to live in these mushroom modules, which may differ across library versions.
import numpy as np

from mushroom.algorithms.value import RQLearning
from mushroom.core import Core
from mushroom.environments import FiniteMDP
from mushroom.policy import EpsGreedy
from mushroom.utils.callbacks import CollectParameters, CollectQ
from mushroom.utils.parameters import Parameter, ExponentialDecayParameter
from mushroom.utils.variance_parameters import (VarianceIncreasingParameter,
                                                WindowedVarianceIncreasingParameter)


def experiment1(decay_exp, beta_type):
    # Reseed from OS entropy so independent (e.g. parallel) runs differ.
    np.random.seed()

    # MDP
    p = np.load('p.npy')
    rew = np.load('rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy: epsilon = 1 keeps the behaviour policy fully exploratory
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent: alpha is one learning rate per state-action pair, decaying as
    # n^(-decay_exp)
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)

    # beta: a variance-based increasing parameter; 'Win' uses a sliding-window
    # variance estimate
    if beta_type == 'Win':
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=10.,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=10.)

    # Older mushroom API: parameters are wrapped in a dict and passed
    # positionally (Example #1 passes them as keyword arguments instead)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Callbacks and core
    collect_q = CollectQ(agent.Q)
    collect_lr_1 = CollectParameters(beta, np.array([0]))  # beta in the 1st state
    collect_lr_5 = CollectParameters(beta, np.array([4]))  # beta in the 5th state
    callbacks = [collect_q, collect_lr_1, collect_lr_5]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_q.get_values()
    lr_1 = collect_lr_1.get_values()
    lr_5 = collect_lr_5.get_values()

    return Qs, lr_1, lr_5
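
A matching sketch for experiment1, again with illustrative assumptions only (run count, arguments, and output file names), reusing the imports from the snippet above:

# Usage sketch for experiment1; run count, arguments and file names are assumptions.
from joblib import Parallel, delayed

if __name__ == '__main__':
    n_runs = 10  # hypothetical number of independent runs
    out = Parallel(n_jobs=-1)(
        delayed(experiment1)(.51, 'Win') for _ in range(n_runs))
    # Unpack the per-run (Qs, lr_1, lr_5) tuples and average over runs.
    Qs, lr_1, lr_5 = (np.array(x) for x in zip(*out))
    np.save('q_mean.npy', Qs.mean(0))
    np.save('lr_1_mean.npy', lr_1.mean(0))
    np.save('lr_5_mean.npy', lr_5.mean(0))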