def update_weights_continuous(self, s, a, r, ns, eta, gamma=1):
    """
    Update weights given state, action, reward and next state
    using a delta-rule update.

    s, a, r, ns: state, action, reward, next state
    eta: learning rate
    gamma: discount factor
    """
    # (neuron index / pop_size) / num_actions = state index
    # (neuron index / pop_size) % num_actions = action index
    # state index / 17 = position index
    # state index % 17 = velocity index
    p, v = s
    pp, vv = ns
    Swphi = np.dot((self.W + self.competition).T[:, :-1],
                   np.array([0 if (a != i / self.pop_size % 3)
                             else cf.phi((p - i / self.pop_size / (3 * self.num_v) + 8) % 16 - 8) *
                                  cf.phi(v - (i / self.pop_size / 3) % self.num_v)
                             for i in xrange(self.K - 1)]))
    Intphia = [cf.phi((pp - i / self.pop_size / (3 * self.num_v) + 8) % 16 - 8) *
               cf.phi(vv - (i / self.pop_size / 3) % self.num_v) * gamma
               for i in xrange(self.K - 1)]
    Intphia += [r]
    factor = eta * (np.array(Intphia) - Swphi)
    for i in xrange(self.num_states):
        # skip update for neurons 'far away'
        if abs(i / self.num_v - s[0]) > 2 or abs(i % self.num_v - s[1]) > 2:
            continue
        for k in xrange(self.pop_size):
            self.W[self.sa[i, a, k]] += factor * \
                cf.phi((p - i / self.num_v + 8) % 16 - 8) * \
                cf.phi(v - i % self.num_v)
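# The expression cf.phi((p - pos_index + 8) % 16 - 8) * cf.phi(v - vel_index)
# recurs throughout the update above. Below is a minimal, self-contained sketch
# of that separable tuning-curve feature. Note that _phi_sketch is only a
# stand-in assumption for cf.phi (whose actual form is defined elsewhere in the
# repo), and _state_feature_sketch is a hypothetical helper, not part of this class.
import numpy as np


def _phi_sketch(d, sigma=1.0):
    # Illustrative bell-shaped tuning curve over the distance d (assumption,
    # not the repo's cf.phi).
    return np.exp(-d ** 2 / (2.0 * sigma ** 2))


def _state_feature_sketch(p, v, pos_index, vel_index):
    # Separable place-cell-like activation of one discrete cell for the
    # continuous state (p, v). Position lives on a ring of 16 units, so the
    # distance to the preferred position is wrapped into [-8, 8); the velocity
    # distance is not wrapped.
    wrapped_dp = (p - pos_index + 8) % 16 - 8
    return _phi_sketch(wrapped_dp) * _phi_sketch(v - vel_index)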
def get_Q(self, x, p, v):
    """
    Return one Q-value per action for the continuous state (p, v):
    rectify the activities x, average them over each state-action
    population, and weight the result by the tuning-curve activation
    of every discrete state for (p, v).
    """
    return np.dot(np.mean((x * (x > 0))[self.sa], axis=2).T,
                  np.array([cf.phi((p - i / self.num_v + 8) % 16 - 8) *
                            cf.phi(v - i % self.num_v)
                            for i in xrange(self.num_states)]))
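# Hypothetical usage sketch (not from the repo): with `agent` an instance of
# this class, `x` a vector of instantaneous activities, and (p, v) the current
# continuous position and velocity, the Q-values returned by get_Q can drive
# greedy action selection over the three actions, e.g.:
#
#     q_values = agent.get_Q(x, p, v)   # one Q estimate per action
#     a = int(np.argmax(q_values))      # greedy action index in {0, 1, 2}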