def log_prob(w, D):
    # Fit in an unconstrained space: w[0] is mapped to a learning rate in (0, 1)
    # via a sigmoid, and w[1] to a positive inverse softmax temperature via a
    # numerically stable exponential.
    agent = RWSoftmaxAgent(task=MyBanditTask(),
                           learning_rate=sigmoid(w[0]),
                           inverse_softmax_temp=stable_exp(w[1]))
    for t in range(D.shape[0]):
        x = D[t, :7]     # state (one-hot)
        u = D[t, 7:11]   # action (one-hot)
        r = D[t, 11]     # reward
        x_ = D[t, 12:]   # next state (one-hot)
        agent.log_prob(x, u)               # accumulate log-likelihood and derivatives
        agent.learning(x, u, r, x_, None)  # Rescorla-Wagner update
    # Jacobian of the (sigmoid, exp) reparameterization, so the gradient is
    # returned with respect to w rather than the native parameters.
    J = np.diag([grad.sigmoid(w[0]), grad.exp(w[1])])
    return -agent.logprob_, -J @ agent.grad_
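Because this variant returns the negative log-likelihood together with its gradient in the unconstrained space w, it can be handed directly to a gradient-based optimizer. A minimal sketch (not part of the original example), assuming the data matrix D from above and SciPy's minimize:

import numpy as np
from scipy.optimize import minimize

w0 = np.zeros(2)  # unconstrained starting point for (learning rate, temperature)
res = minimize(log_prob, w0, args=(D,), jac=True, method='L-BFGS-B')
lr_hat = sigmoid(res.x[0])      # map the fit back to a learning rate
ist_hat = stable_exp(res.x[1])  # and to an inverse softmax temperature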
Example #2
def log_prob(w, D):
    # Variant that takes the parameters directly (no reparameterization) and
    # returns only the summed log-likelihood.
    agent = RWSoftmaxAgent(task=MyBanditTask(),
                           learning_rate=w[0],
                           inverse_softmax_temp=w[1])
    L = 0
    for t in range(D.shape[0]):
        x = D[t, :7]     # state (one-hot)
        u = D[t, 7:11]   # action (one-hot)
        r = D[t, 11]     # reward
        x_ = D[t, 12:]   # next state (one-hot)
        L += u @ agent.log_prob(x)         # log-probability of the chosen action
        agent.learning(x, u, r, x_, None)
    return L
Example #3
def log_prob(w, D):
    # Log-likelihood for the two-armed bandit (3 states, 2 actions), with
    # clipped transforms: a learning rate in (0, 1) and a non-negative inverse
    # softmax temperature capped at 10.
    lr = sigmoid(w[0], a_min=-6, a_max=6)
    ist = relu(w[1], a_max=10)
    agent = RWSoftmaxAgent(TwoArmedBandit(), lr, ist)
    L = 0
    for t in range(D.shape[0]):
        x = D[t, :3]    # state (one-hot)
        u = D[t, 3:5]   # action (one-hot)
        r = D[t, 5]     # reward
        x_ = D[t, 6:]   # next state (one-hot)
        L += u @ agent.log_prob(x)   # log-probability of the chosen action
        agent.learning(x, u, r, x_, None)
    return L
Example #4
def log_prob(w, D):
    # Negative log-likelihood and gradient for the two-armed bandit, with the
    # transform inputs clipped for numerical stability.
    lr = sigmoid(w[0], a_min=-6, a_max=6)
    ist = stable_exp(w[1], a_min=-10, a_max=10)
    agent = RWSoftmaxAgent(TwoArmedBandit(), lr, ist)
    for t in range(D.shape[0]):
        x = D[t, :3]    # state (one-hot)
        u = D[t, 3:5]   # action (one-hot)
        r = D[t, 5]     # reward
        x_ = D[t, 6:]   # next state (one-hot)
        agent.log_prob(x, u)
        agent.learning(x, u, r, x_, None)
    # Elementwise chain rule through the (sigmoid, exp) reparameterization.
    J = np.array([grad.sigmoid(w[0]), grad.exp(w[1])])
    return -agent.logprob_, -J * agent.grad_
Example #5
def rwsoftmax_loglik(w, D):
    # nx (number of states), nu (number of actions), task, fu, and
    # reparam_jac_rwsm are assumed to be defined in the enclosing scope.
    X1 = D[:, :nx]                           # states
    U1 = D[:, nx:nx + nu]                    # actions
    R = D[:, nx + nu]                        # rewards
    X2 = D[:, nx + nu + 1:nx + nu + 1 + nx]  # next states
    # Map the unconstrained parameters to (learning rate, inverse temperature)
    # and build the Jacobian of that reparameterization.
    w = fu.transform(w, [fu.sigmoid, np.exp]).flatten()
    J = reparam_jac_rwsm(w)
    q = RWSoftmaxAgent(task=task(),
                       learning_rate=w[0],
                       inverse_softmax_temp=w[1])
    ntrials = X1.shape[0]
    for t in range(ntrials):
        q.log_prob(X1[t], U1[t])
        q.learning(X1[t], U1[t], R[t], X2[t], None)
    L = q.logprob_
    # Return negative log-likelihood, gradient, and Hessian in w-space.
    return -L, -J @ q.grad_, -J.T @ q.hess_ @ J
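Since rwsoftmax_loglik also returns an analytic Hessian, it can drive a second-order optimizer. A rough sketch, assuming the same data matrix D and SciPy; nll_and_grad and nll_hess are hypothetical wrappers introduced only to match minimize's interface (the likelihood is simply evaluated twice per step for clarity):

import numpy as np
from scipy.optimize import minimize

def nll_and_grad(w, D):
    nll, g, _ = rwsoftmax_loglik(w, D)
    return nll, g

def nll_hess(w, D):
    return rwsoftmax_loglik(w, D)[2]

res = minimize(nll_and_grad, np.zeros(2), args=(D,), jac=True,
               hess=nll_hess, method='trust-exact')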
Example #6
def test_rwsoftmaxagent():
    lr = 0.1
    B = 1.5
    task = TwoArmedBandit()
    q = RWSoftmaxAgent(task, learning_rate=lr, inverse_softmax_temp=B)

    x = np.array([1., 0., 0.])
    u1 = np.array([1., 0.])
    u2 = np.array([0., 1.])
    x_1 = np.array([0., 1., 0.])
    x_2 = np.array([0., 0., 1.])
    r1 = 1.0
    r2 = 0.0

    q.log_prob(x, u1)
    q.learning(x, u1, r1, x_1, None)
    q.log_prob(x, u2)
    q.learning(x, u2, r2, x_2, None)
    q.log_prob(x, u2)
    q.learning(x, u2, r1, x_1, None)
    q.log_prob(x, u1)
    q.learning(x, u1, r2, x_2, None)
    q.log_prob(x, u1)
    q.learning(x, u1, r1, x_1, None)

    fitr_grad = q.grad_   # analytic gradient accumulated by the agent
    fitr_hess = q.hess_   # analytic Hessian accumulated by the agent

    def f(w):
        # Replay the same trial sequence with the non-derivative-tracking
        # methods, so the log-likelihood can be differentiated externally
        # (e.g., by autograd's jacobian/hessian below).
        m = RWSoftmaxAgent(task, learning_rate=w[0], inverse_softmax_temp=w[1])
        m._log_prob_noderivatives(x, u1)
        m.critic._update_noderivatives(x, u1, r1, x_1, None)
        m._log_prob_noderivatives(x, u2)
        m.critic._update_noderivatives(x, u2, r2, x_2, None)
        m._log_prob_noderivatives(x, u2)
        m.critic._update_noderivatives(x, u2, r1, x_1, None)
        m._log_prob_noderivatives(x, u1)
        m.critic._update_noderivatives(x, u1, r2, x_2, None)
        m._log_prob_noderivatives(x, u1)
        m.critic._update_noderivatives(x, u1, r1, x_1, None)
        return m.logprob_

    # Derivatives of the same log-likelihood computed by automatic
    # differentiation, for comparison against the analytic results.
    agJ = jacobian(f)(np.array([lr, B]))
    agH = hessian(f)(np.array([lr, B]))

    # Analytic and automatic derivatives should agree to numerical tolerance.
    assert np.linalg.norm(agJ - fitr_grad) < 1e-6
    assert np.linalg.norm(agH - fitr_hess) < 1e-6
Example #7
def f():
    # task, X (states), U (actions), R (rewards), and X_ (next states) are
    # assumed to be defined in the enclosing scope.
    agent = RWSoftmaxAgent(task, learning_rate=0.4, inverse_softmax_temp=2.6)
    for t in range(X.shape[0]):
        agent.log_prob(X[t], U[t])
        agent.learning(X[t], U[t], R[t], X_[t], None)
    return agent.logprob_
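The snippet above closes over variables that are not shown. A hypothetical setup, consistent with the one-hot encoding used in the unit test in Example #6 (the values are chosen purely for illustration):

import numpy as np

task = TwoArmedBandit()
X  = np.array([[1., 0., 0.]] * 3)                          # start state on each trial
U  = np.array([[1., 0.], [0., 1.], [1., 0.]])              # chosen arms
R  = np.array([1., 0., 1.])                                # rewards
X_ = np.array([[0., 1., 0.], [0., 0., 1.], [0., 1., 0.]])  # outcome states

print(f())  # total log-likelihood of these choices under lr=0.4, temperature=2.6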