def test_asymmetric_rwlearner():
    """Smoke test: a single update of the asymmetric RW learner runs without error."""
    task = TwoArmedBandit()
    learner = AsymmetricRescorlaWagnerLearner(
        task, learning_rate_pos=0.1, learning_rate_neg=0.1)
    state = task.observation()
    action = task.random_action()
    next_state, reward, _ = task.step(action)
    learner.update(state, action, reward, next_state, None)
def test_forgetful_rwlearner():
    """Smoke test: a single update of the forgetful instrumental RW learner runs without error."""
    task = TwoArmedBandit()
    learner = ForgetfulInstrumentalRescorlaWagnerLearner(
        task, learning_rate=0.1, memory_decay=0.1)
    state = task.observation()
    action = task.random_action()
    next_state, reward, _ = task.step(action)
    learner.update(state, action, reward, next_state, None)
def test_grad_instrumantalrwupdate():
    """Check fitr's analytic dQ/d(learning_rate) and Hessian against autograd.

    Runs the same fixed five-trial sequence through (a) the derivative-tracking
    `update` and (b) a derivative-free replay wrapped for autograd, then
    compares gradient and Hessian of Q w.r.t. the learning rate.
    (NOTE(review): "instrumantal" typo kept — it is the test's public name.)
    """
    lr = 0.1
    task = TwoArmedBandit()
    x = np.array([1., 0., 0.])
    u1 = np.array([1., 0.])
    u2 = np.array([0., 1.])
    x_1 = np.array([0., 1., 0.])
    x_2 = np.array([0., 0., 1.])
    r1, r2 = 1.0, 0.0
    # One shared trial sequence so both passes see identical data.
    trials = [(x, u1, r1, x_1),
              (x, u2, r2, x_2),
              (x, u2, r1, x_1),
              (x, u1, r2, x_2),
              (x, u1, r1, x_1)]

    q = InstrumentalRescorlaWagnerLearner(task, learning_rate=lr)
    for xt, ut, rt, xt_ in trials:
        q.update(xt, ut, rt, xt_, None)

    def fq(alpha):
        # Derivative-free replay: autograd differentiates through this.
        m = InstrumentalRescorlaWagnerLearner(task, learning_rate=alpha)
        for xt, ut, rt, xt_ in trials:
            m._update_noderivatives(xt, ut, rt, xt_, None)
        return m.Q

    agQ = jacobian(fq)(lr)
    ahQ = hessian(fq)(lr)
    assert np.linalg.norm(q.dQ['learning_rate'] - agQ) < 1e-6
    assert np.linalg.norm(q.hess_Q['learning_rate'] - ahQ) < 1e-6
def test_rwstickysoftmaxagent():
    """Check the sticky-softmax RW agent's analytic log-prob gradient and Hessian.

    Replays a fixed five-trial sequence with derivative tracking, then
    replays it derivative-free inside an autograd-wrapped function of
    (learning_rate, inverse_softmax_temp, perseveration) and compares.
    """
    lr, B, p = 0.1, 1.5, 0.01
    task = TwoArmedBandit()
    x = np.array([1., 0., 0.])
    u1 = np.array([1., 0.])
    u2 = np.array([0., 1.])
    x_1 = np.array([0., 1., 0.])
    x_2 = np.array([0., 0., 1.])
    r1, r2 = 1.0, 0.0
    # Shared trial sequence for both the analytic and the autograd pass.
    trials = [(x, u1, r1, x_1),
              (x, u2, r2, x_2),
              (x, u2, r1, x_1),
              (x, u1, r2, x_2),
              (x, u1, r1, x_1)]

    q = RWStickySoftmaxAgent(task, learning_rate=lr,
                             inverse_softmax_temp=B, perseveration=p)
    for xt, ut, rt, xt_ in trials:
        q.log_prob(xt, ut)
        q.learning(xt, ut, rt, xt_, None)

    def f(w):
        # Derivative-free replay for autograd: w = [lr, B, p].
        m = RWStickySoftmaxAgent(task, learning_rate=w[0],
                                 inverse_softmax_temp=w[1], perseveration=w[2])
        for xt, ut, rt, xt_ in trials:
            m._log_prob_noderivatives(xt, ut)
            m.critic._update_noderivatives(xt, ut, rt, xt_, None)
        return m.logprob_

    w0 = np.array([lr, B, p])
    ag = jacobian(f)(w0)
    aH = hessian(f)(w0)
    assert np.linalg.norm(q.grad_ - ag) < 1e-6
    assert np.linalg.norm(q.hess_ - aH) < 1e-6
def test_grad_Qx():
    """Check ValueFunction.grad_Qx against an autograd elementwise gradient."""
    state = np.array([1., 0., 0.])
    task = TwoArmedBandit()
    vf = ValueFunction(task)
    vf.Q = np.array([[1., 2., 3.], [4., 5., 6.]])

    def qx_of(Q):
        vf.Q = Q
        return vf.Qx(state)

    autograd_gQ = elementwise_grad(qx_of)(vf.Q)
    analytic_gQ = vf.grad_Qx(state)
    assert np.linalg.norm(autograd_gQ - analytic_gQ) < 1e-5
def test_grad_Vx():
    """Check ValueFunction.grad_Vx against an autograd elementwise gradient."""
    state = np.array([1., 0., 0.])
    task = TwoArmedBandit()
    vf = ValueFunction(task)
    vf.V = np.array([1., 2., 3.])

    def vx_of(V):
        vf.V = V
        return vf.Vx(state)

    autograd_gV = elementwise_grad(vx_of)(vf.V)
    analytic_gV = vf.grad_Vx(state)
    assert np.linalg.norm(autograd_gV - analytic_gV) < 1e-5
def log_prob(w, D):
    """Total log-likelihood of dataset D under an RW-softmax agent.

    Parameters are given on an unconstrained scale: w[0] is squashed through
    a clipped sigmoid into a learning rate, w[1] through a capped relu into
    an inverse softmax temperature. Each row of D is a flattened trial:
    columns [0:3] state, [3:5] action, [5] reward, [6:] next state.
    NOTE(review): a later definition in this file reuses the name
    `log_prob`, shadowing this one at module scope — confirm intent.
    """
    learning_rate = sigmoid(w[0], a_min=-6, a_max=6)
    inverse_temp = relu(w[1], a_max=10)
    agent = RWSoftmaxAgent(TwoArmedBandit(), learning_rate, inverse_temp)
    total = 0
    for row in range(D.shape[0]):
        state, action = D[row, :3], D[row, 3:5]
        reward, next_state = D[row, 5], D[row, 6:]
        # Log-probability of the taken action = action one-hot dotted with
        # the per-action log-policy at this state.
        total += action @ agent.log_prob(state)
        agent.learning(state, action, reward, next_state, None)
    return total
def log_prob(w, D):
    """Negative log-likelihood and its analytic gradient for an RW-softmax agent.

    w is on the unconstrained scale: w[0] maps through a clipped sigmoid to
    the learning rate, w[1] through a clipped stable exp to the inverse
    softmax temperature. Each row of D is a flattened trial: columns [0:3]
    state, [3:5] action, [5] reward, [6:] next state.

    Returns a 2-tuple (negative log-likelihood, negative gradient), with the
    agent's gradient pulled back through the parameter transforms via the
    diagonal chain-rule Jacobian J.
    """
    learning_rate = sigmoid(w[0], a_min=-6, a_max=6)
    inverse_temp = stable_exp(w[1], a_min=-10, a_max=10)
    agent = RWSoftmaxAgent(TwoArmedBandit(), learning_rate, inverse_temp)
    for row in range(D.shape[0]):
        state, action = D[row, :3], D[row, 3:5]
        reward, next_state = D[row, 5], D[row, 6:]
        agent.log_prob(state, action)
        agent.learning(state, action, reward, next_state, None)
    # Diagonal Jacobian of the (sigmoid, exp) reparameterization.
    J = np.array([grad.sigmoid(w[0]), grad.exp(w[1])])
    return -agent.logprob_, -J * agent.grad_
def test_rwsoftmaxagent():
    """Check the RW-softmax agent's analytic log-prob gradient and Hessian
    against autograd, over a fixed five-trial sequence.

    BUG FIX: the original final assertions were
    ``assert (np.linalg.norm(agJ - q.grad_))`` — asserting the *norm itself*,
    which is truthy for any nonzero discrepancy. The test therefore passed
    exactly when the analytic and autograd derivatives DISAGREED and failed
    when they matched perfectly. Restored the ``< 1e-6`` tolerance used by
    the sibling gradient tests in this file.
    """
    lr = 0.1
    B = 1.5
    task = TwoArmedBandit()
    q = RWSoftmaxAgent(task, learning_rate=lr, inverse_softmax_temp=B)
    x = np.array([1., 0., 0.])
    u1 = np.array([1., 0.])
    u2 = np.array([0., 1.])
    x_1 = np.array([0., 1., 0.])
    x_2 = np.array([0., 0., 1.])
    r1 = 1.0
    r2 = 0.0
    # Fixed trial sequence, replayed with derivative tracking.
    trials = [(x, u1, r1, x_1),
              (x, u2, r2, x_2),
              (x, u2, r1, x_1),
              (x, u1, r2, x_2),
              (x, u1, r1, x_1)]
    for xt, ut, rt, xt_ in trials:
        q.log_prob(xt, ut)
        q.learning(xt, ut, rt, xt_, None)

    def f(w):
        # Derivative-free replay for autograd: w = [learning_rate, inv_temp].
        m = RWSoftmaxAgent(task, learning_rate=w[0], inverse_softmax_temp=w[1])
        for xt, ut, rt, xt_ in trials:
            m._log_prob_noderivatives(xt, ut)
            m.critic._update_noderivatives(xt, ut, rt, xt_, None)
        return m.logprob_

    agJ = jacobian(f)(np.array([lr, B]))
    agH = hessian(f)(np.array([lr, B]))
    assert np.linalg.norm(agJ - q.grad_) < 1e-6
    assert np.linalg.norm(agH - q.hess_) < 1e-6
def test_two_armed_bandit():
    """Smoke test: TwoArmedBandit yields an observation and a random action.

    DEFECT FIXED: the original test made no assertions at all, so it could
    never fail. It now at least verifies both calls return a value.
    """
    task = TwoArmedBandit()
    x = task.observation()
    u = task.random_action()
    assert x is not None
    assert u is not None
import cProfile
import numpy as np
from fitr import utils as fu
from fitr.environments import TwoArmedBandit
from fitr.agents import RWSoftmaxAgent

# Seeded task/agent pair used to synthesize a dataset for profiling.
task = TwoArmedBandit(rng=np.random.RandomState(743))
agent = RWSoftmaxAgent(task, learning_rate=0.4, inverse_softmax_temp=2.6)

# Generate data and unpack the trial tensor into per-variable arrays:
# states, actions, rewards, next states (last two slots unused here).
data = agent.generate_data(ntrials=1000)
unpacked_data = data.unpack_tensor(task.nstates, task.nactions)
X, U, R, X_, _, _ = [np.squeeze(di) for di in unpacked_data]


def f():
    # Full likelihood pass over the generated data with a fresh agent.
    fresh_agent = RWSoftmaxAgent(task, learning_rate=0.4,
                                 inverse_softmax_temp=2.6)
    for t in range(X.shape[0]):
        fresh_agent.log_prob(X[t], U[t])
        fresh_agent.learning(X[t], U[t], R[t], X_[t], None)
    return fresh_agent.logprob_


# Profile a single log_prob call, then the full likelihood loop.
cProfile.run('agent.log_prob(X[5], U[5])', sort='time')
cProfile.run('f()', sort='time')