def parallel_sampling_keepU(self, step, eta, run, rate, T=500, r0=.5, tm=20, ts=2,
                             reset=0, gamma=1, steps=100, start_state=0, mode='fix',
                             epsilon=0, max_iter=np.inf):
     """Run non-episodic sampling while keeping the membrane state u across trials.

     Each trial simulates the spiking population once, reads out a policy,
     applies the weight update for every (state, action) pair, and records
     per-trial statistics (return, KL divergence, RMSE).

     Parameters
     ----------
     step : simulation time step forwarded to the population simulator.
     eta : learning rate for update_weights_nonepisodic4DKL.
     run : int; seeds numpy's RNG and is forwarded to the simulator.
     rate : firing rate; also sets the initial u of the units indexed by self.r.
     T : scalar or sequence; if a sequence, T[1] is used as the simulated
         duration (Tmax) while the full T is forwarded to get_policy.
     r0, mode : forwarded to unlearn() for (re-)initializing the weights.
     tm, ts : time constants forwarded to the simulator
              (presumably membrane and synaptic -- confirm).
     reset : reset amplitude, scaled by 1/pop_size before the simulator call.
     gamma : discount factor.
     steps : number of trials to run.
     start_state, epsilon, max_iter : forwarded to R4Pi for policy evaluation.

     Returns
     -------
     numpy object array [pi, r, DKL, RMSE, copy of self.W] with per-trial
     policies, returns, mean KL divergences (2 values each), RMSEs, and the
     final weight matrix.
     """
     np.random.seed(run)
     # maybe put this outside function to continue learning instead of fresh start
     self.unlearn(r0, mode)
     # probability mass leaving the recurrent part of W
     # NOTE(review): assumes the last pop_size rows/cols are special (terminal
     # pool?) -- confirm against the class definition
     pTerminal = 1 - np.sum(self.W[:-self.pop_size, :-self.pop_size], axis=0)
     pi = np.zeros((steps, self.num_states), dtype=int)    # policy per trial
     r = np.zeros(steps, dtype=float)                      # return per trial
     DKL = np.zeros((steps, 2), dtype=float)               # mean KL divergences per trial
     RMSE = np.zeros((steps, 2), dtype=float)              # weight RMSE per trial
     # T may be a plain number (Python 2: includes `long`) or a sequence
     if isinstance(T, (int, long, float, complex)):
         Tmax = T
     else:
         Tmax = T[1]
     uinit = np.zeros(self.K)
     uinit[self.r] = rate / 1000.   # units indexed by self.r start at the target rate
     for t in xrange(steps):
         print 'run', run, '  trial', t, '/', steps
         stdout.flush()
         res = cfn.runpopU_js(self.W / self.pop_size, uinit, step, self.pop_size, rate,
                              Tmax, tm, ts, 1. * reset / self.pop_size, run)
         # "keepU": reuse the final membrane state as the next trial's initial state
         uinit = res[1]
         pi[t] = self.get_policy(step, res[0], T)
         # weight update over every (state, action) pair
         for s in xrange(self.num_states):
             for a in xrange(self.num_actions):
                 ns, rr = self.next_SR(s, a)
                 pTerminal = self.update_weights_nonepisodic4DKL(
                     pTerminal, s, a, rr, ns, eta, gamma)
         r[t] = self.R4Pi(pi[t], start_state, gamma, epsilon, max_iter)
         dd = self.DKL4weights(self.W, pTerminal, gamma)
         DKL[t] = np.array([np.mean(dd[0]), np.mean(dd[1])])
         RMSE[t] = np.sqrt(self.MSE4weights(self.W, pTerminal, gamma))
     return np.array([pi, r, DKL, RMSE, np.copy(self.W)])
 def parallel_sampling_keepU(self, step, eta, run, rate, T=500, r0=.5, tm=20, ts=2,
                             reset=0, gamma=1, steps=100, start_state=0, mode='fix',
                             epsilon=0, max_iter=np.inf):
     np.random.seed(run)
     # maybe put this outside function to continue learning instead of fresh start
     self.unlearn(r0, mode)
     pTerminal = 1 - np.sum(self.W[:-self.pop_size, :-self.pop_size], axis=0)
     pi = np.zeros((steps, self.num_states), dtype=int)
     r = np.zeros(steps, dtype=float)
     DKL = np.zeros((steps, 2), dtype=float)
     RMSE = np.zeros((steps, 2), dtype=float)
     if isinstance(T, (int, long, float, complex)):
         Tmax = T
     else:
         Tmax = T[1]
     uinit = np.zeros(self.K)
     uinit[self.r] = rate / 1000.
     for t in xrange(steps):
         print 'run', run, '  trial', t, '/', steps
         stdout.flush()
         res = cfn.runpopU_js(self.W / self.pop_size, uinit, step, self.pop_size, rate,
                              Tmax, tm, ts, 1. * reset / self.pop_size, run)
         uinit = res[1]
         pi[t] = self.get_policy(step, res[0], T)
         for s in xrange(self.num_states):
             for a in xrange(self.num_actions):
                 ns, rr = self.next_SR(s, a)
                 pTerminal = self.update_weights_nonepisodic4DKL(
                     pTerminal, s, a, rr, ns, eta, gamma)
         r[t] = self.R4Pi(pi[t], start_state, gamma, epsilon, max_iter)
         dd = self.DKL4weights(self.W, pTerminal, gamma)
         DKL[t] = np.array([np.mean(dd[0]), np.mean(dd[1])])
         RMSE[t] = np.sqrt(self.MSE4weights(self.W, pTerminal, gamma))
     return np.array([pi, r, DKL, RMSE, np.copy(self.W)])
# Beispiel #3 ("Example #3" separator left over from the code-aggregation
# source; commented out so it is not parsed as the bare name `Beispiel`)
# 0
    def parallel_sampling_keepU(self, step, eta, run, rate, T=500, r0=.5, tm=20, ts=2,
                                reset=0, gamma=1, trials=1000, mode='fix',  maxsteps=300,
                                initpv=None, initW=None, samples=100):
        np.random.seed(run)
        if initW is None:
            self.unlearn(r0, mode)
        else:
            self.W = initW
        if isinstance(T, (int, long, float, complex)):
            Tmax = T
        else:
            Tmax = T[1]
        seq = [[None]] * trials  # sequences might have differnt length -> list instead array
        scount = np.zeros((trials, self.K))
        uinit = np.zeros(self.K)
        uinit[self.r] = rate / 1000.
        for t in xrange(trials):
            print 'run', run, '  trial', t, '/', trials
            stdout.flush()
            pvls = [[8, 16]] if initpv is None else [initpv]
            a = []
            r = []
            res = cfn.runpopU_js(self.W / self.pop_size, uinit, step, self.pop_size, rate,
                                 Tmax, tm, ts, 1. * reset / self.pop_size, run)
            uinit = res[1]
            scount[t] = np.sum(res[0], axis=0)
            for counter in xrange(maxsteps):
                a += [self.get_a(scount[t], *pvls[-1])]
                pvls += [cf.get_next_pv(pvls[-1][0], pvls[-1][1], a[-1])]
                r += [cf.get_R(*pvls[-1])]
            seq[t] = [pvls[:-1], a, r]
            for i in range(samples):
                p = 16 * np.random.rand()
                v = 32 * np.random.rand()
                for a in range(3):
                    pvs = cf.get_next_pvs(p, v, a, self.steps)
                    rr = cf.get_R(*pvs)
                    self.update_weights_continuous([p, v], a, rr, pvs,
                                                   eta, gamma**self.steps)

        return np.array([scount, np.array(seq), np.copy(self.W)])
#          yerr=np.std(perf, axis=0) / np.sqrt(len(perf)))
# NOTE(review): orphaned keyword-argument fragment -- the leading call
# (presumably pl.errorbar(...)) was lost in extraction; commented out.
# format the performance-over-trials figure and save it to disk
pl.xticks([0, 50, 100], [0, 50, 100])
pl.yticks([0, .5, 1.0], [0, .5, 1.0])
pl.ylim([0, 1])
pl.xlabel('Trials')
pl.ylabel('Performance')
simpleaxis(pl.gca())  # project helper; presumably hides top/right spines -- confirm
pl.tight_layout(0)  # zero padding around the axes
pl.savefig('learn_performance.pdf', dpi=600)


# initialize our network at two different points in the state space of its neural
# activities that correspond to representing the same (approximate) value function

uu0 = .2  # baseline initial activity level
# 30 simulation runs starting from a uniform initial state (u = uu0 everywhere)
res0 = [cfn.runpopU_js(W, uu0 * np.ones(net.K), step, 1, rate, 1000, 20, 2, ref, run)[0]
        for run in range(30)]
# 30 runs from a structured initial state: units are grouped in triples that
# get (-12, .75*uu0, 2.25*uu0), plus one extra trailing unit set to 1.
# NOTE(review): this vector has net.K + 1 entries while res0 uses net.K, and
# it assumes net.K is divisible by 3 -- confirm both are intended.
res1 = [cfn.runpopU_js(W, np.hstack([np.ravel(
    np.outer(np.ones(net.K / 3), [-12, uu0 * .75, 2.25 * uu0])), np.array([1])]),
    step, 1, rate, 1000, 20, 2, ref, run)[0]
    for run in range(30)]

pl.figure(figsize=(6, 6))
# plot decoded Q-value traces for four probe states:
# solid lines = res0 (uniform init), dashed lines = res1 (structured init)
for i in range(4):
    p, v = [(0, 17), (8, 17), (5, 22), (2, 18), (10, 27)][i]  # (p, v) probe states
    pl.plot(smooth_spikes(np.sum([
        net.get_Q(s, p, v) for s in np.mean(res0, 0)], 1),
        40., step, 3 * uu0) / rate, c=col[i], zorder=10)
    pl.plot(smooth_spikes(np.sum([
        net.get_Q(s, p, v) for s in np.mean(res1, 0)], 1),
        40., step, 3 * uu0) / rate, '--', c=col[i], zorder=10)
#          yerr=np.std(perf, axis=0) / np.sqrt(len(perf)))
# NOTE(review): orphaned keyword-argument fragment of a lost call
# (presumably pl.errorbar(...)); commented out.
# style the performance-vs-trials axes and write the figure to disk
xt = [0, 50, 100]
yt = [0, .5, 1.0]
pl.xticks(xt, xt)
pl.yticks(yt, yt)
pl.ylim([0, 1])
pl.xlabel('Trials')
pl.ylabel('Performance')
simpleaxis(pl.gca())
pl.tight_layout(0)
pl.savefig('learn_performance.pdf', dpi=600)

# initialize our network at two different points in the state space of its neural
# activities that correspond to representing the same (approximate) value function

uu0 = .2  # baseline initial activity level
# 30 simulation runs starting from a uniform initial state (u = uu0 everywhere)
res0 = [
    cfn.runpopU_js(W, uu0 * np.ones(net.K), step, 1, rate, 1000, 20, 2, ref,
                   run)[0] for run in range(30)
]
# 30 runs from a structured initial state: units are grouped in triples that
# get (-12, .75*uu0, 2.25*uu0), plus one extra trailing unit set to 1.
# NOTE(review): this vector has net.K + 1 entries while res0 uses net.K, and
# it assumes net.K is divisible by 3 -- confirm both are intended.
res1 = [
    cfn.runpopU_js(
        W,
        np.hstack([
            np.ravel(np.outer(np.ones(net.K / 3),
                              [-12, uu0 * .75, 2.25 * uu0])),
            np.array([1])
        ]), step, 1, rate, 1000, 20, 2, ref, run)[0] for run in range(30)
]

pl.figure(figsize=(6, 6))
for i in range(4):
    p, v = [(0, 17), (8, 17), (5, 22), (2, 18), (10, 27)][i]
    pl.plot(smooth_spikes(