def gradient(self, x, g, returnError=True):
    x = np.asarray(x)
    g = np.asarray(g)

    if self.flattenOut:
        g = g.ravel()

    # packed views of the hidden and visible gradient matrices
    views = util.packedViews(self.layerDims, dtype=self.dtype)
    pg = views[0]
    hgs = views[1:-1]
    vg = views[-1]

    # forward pass
    z1 = util.bias(x)
    z1s = [z1]
    zPrimes = []
    for hw, phi in zip(self.hws, self.transFunc):
        h = z1.dot(hw)
        z1 = util.bias(phi(h))
        z1s.append(z1)

        zPrime = phi(h, 1)
        zPrimes.append(zPrime)

    y = z1.dot(self.vw)
    if self.flattenOut:
        y = y.ravel()

    # error components
    e = util.colmat(y - g)
    delta = np.sign(e) / e.size

    # visible layer gradient
    vg[...] = z1.T.dot(delta)
    vg += self.penaltyGradient(-1)

    # backward pass for hidden layers
    w = self.vw
    for l in range(self.nHLayers - 1, -1, -1):
        delta = delta.dot(w[:-1, :].T) * zPrimes[l]
        hgs[l][...] = z1s[l].T.dot(delta)
        hgs[l] += self.penaltyGradient(l)
        w = self.hws[l]

    if returnError:
        error = np.mean(np.abs(e)) + self.penaltyError()
        return error, pg
    else:
        return pg
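# Illustrative sketch (not part of the class above; helper name and setup are
# hypothetical, only numpy is assumed): a central-difference check of the
# absolute-error backprop math used in gradient() above, written against a tiny
# standalone one-hidden-layer tanh network with delta = sign(e) / e.size.
def _absErrGradientCheckSketch(eps=1.0e-6, seed=0):
    rng = np.random.RandomState(seed)
    x = rng.randn(8, 3)
    g = rng.randn(8, 2)

    hw = rng.randn(4, 5) * 0.1   # 3 inputs + bias -> 5 hidden
    vw = rng.randn(6, 2) * 0.1   # 5 hidden + bias -> 2 outputs

    def bias(a):
        return np.hstack((a, np.ones((a.shape[0], 1))))

    def err(hw, vw):
        z1 = bias(np.tanh(bias(x).dot(hw)))
        return np.mean(np.abs(z1.dot(vw) - g))

    # analytic gradients, mirroring delta = sign(e) / e.size above
    h = bias(x).dot(hw)
    z1 = bias(np.tanh(h))
    e = z1.dot(vw) - g
    delta = np.sign(e) / e.size
    vg = z1.T.dot(delta)
    hg = bias(x).T.dot(delta.dot(vw[:-1, :].T) * (1.0 - np.tanh(h)**2))

    # central-difference approximation of the visible-layer gradient;
    # the hidden-layer gradient hg can be checked the same way
    numVg = np.empty_like(vw)
    for i in range(vw.shape[0]):
        for j in range(vw.shape[1]):
            vp = vw.copy(); vp[i, j] += eps
            vm = vw.copy(); vm[i, j] -= eps
            numVg[i, j] = (err(hw, vp) - err(hw, vm)) / (2.0 * eps)

    # should be tiny, except exactly at the kinks of the absolute value
    print(np.max(np.abs(vg - numVg)))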
def train(self, x, g):
    x = np.asarray(x)
    g = np.asarray(g)

    x1 = util.bias(x)

    penaltyMat = self.penalty * np.eye(x1.shape[1], dtype=self.dtype)
    penaltyMat[-1, -1] = 0.0

    a = x1.T @ x1 + penaltyMat
    b = x1.T @ g

    if self.pseudoInv is None:
        if np.linalg.cond(a) < 1.0 / np.finfo(self.dtype).eps:
            pseudoInv = True
        else:
            pseudoInv = False
    else:
        pseudoInv = self.pseudoInv

    if pseudoInv:
        #self.weights, residuals, rank, s = \
        #    np.linalg.lstsq(a, b)
        #self.weights = np.linalg.pinv(a) @ b
        #self.weights = sp.linalg.pinv2(a) @ b
        # since x1.T @ x1 is symmetric, pinvh is equivalent but faster than pinv2
        self.weights = sp.linalg.pinvh(a) @ b
    else:
        #self.weights = sp.linalg.solve(a, b, sym_pos=True)
        self.weights = np.linalg.solve(a, b)
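# Standalone sketch (hypothetical names; only numpy/scipy assumed) of the
# regularized normal equations that train() above solves: A = X1'X1 + penalty*I
# with the bias entry left unpenalized, b = X1'g.  It also shows that the
# symmetric pseudo-inverse route (sp.linalg.pinvh) and a direct solve agree
# when A is well-conditioned.
import numpy as np
import scipy as sp
import scipy.linalg

rng = np.random.RandomState(42)
x = rng.randn(100, 4)
g = rng.randn(100, 2)
penalty = 0.1

x1 = np.hstack((x, np.ones((x.shape[0], 1))))   # append bias column

penaltyMat = penalty * np.eye(x1.shape[1])
penaltyMat[-1, -1] = 0.0                        # do not penalize the bias

a = x1.T @ x1 + penaltyMat
b = x1.T @ g

wSolve = np.linalg.solve(a, b)
wPinvh = sp.linalg.pinvh(a) @ b

print(np.allclose(wSolve, wPinvh))              # True when a is well-conditioned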
def evalRecs(self, x, context=None, returnContext=False):
    x = util.segmat(x)
    x1 = util.bias(x)

    nSeg = x1.shape[0]
    nObs = x1.shape[1]
    nIn1 = x1.shape[2]

    r = np.empty((nSeg, nObs, self.nHidden), dtype=self.dtype)

    if context is None:
        context = np.zeros((nSeg, self.nHidden), dtype=self.dtype)

    x1c = np.empty((nSeg, nIn1 + self.nHidden), dtype=self.dtype)

    for t in range(nObs):
        x1c[:, :nIn1] = x1[:, t]
        x1c[:, nIn1:] = context

        r[:, t] = self.phi(x1c.dot(self.hw))
        context[...] = r[:, t]

    if returnContext:
        return r, context
    else:
        return r
def gradient(self, x, g, returnError=True):
    x = np.asarray(x)
    g = np.asarray(g)

    probs = self.probs(x)
    delta = (probs - g) / probs.size

    penMask = np.ones_like(self.weights)
    penMask[-1, :] = 0.0

    grad = (util.bias(x).T.dot(delta) +
            self.elastic * 2.0 * self.penalty * penMask *
                self.weights / self.weights.size +               # L2-norm penalty
            (1.0 - self.elastic) * self.penalty * penMask *
                np.sign(self.weights) / self.weights.size)       # L1-norm penalty

    gf = grad.ravel()

    if returnError:
        pf = self.weights[:-1, :].ravel()
        err = (-np.mean(g * np.log(util.capZero(probs))) +
               self.elastic * self.penalty * pf.dot(pf) / pf.size +        # L2-norm penalty
               (1.0 - self.elastic) * self.penalty * np.mean(np.abs(pf)))  # L1-norm penalty
        return err, gf
    else:
        return gf
def evalRecs(self, x, contexts=None, returnContexts=False):
    x = util.segmat(x)
    x1 = util.bias(x)

    nSeg = x1.shape[0]
    nObs = x1.shape[1]

    r1Prev = x1
    rs = []

    if contexts is None:
        contexts = [np.zeros((nSeg, self.nRecHiddens[l]), dtype=self.dtype)
                    for l in range(self.nRecLayers)]

    for l in range(self.nRecLayers):
        nIn1 = r1Prev.shape[2]

        r = np.empty((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)
        r1c = np.empty((nSeg, nIn1 + self.nRecHiddens[l]), dtype=self.dtype)

        context = contexts[l]

        for t in range(nObs):
            r1c[:, :nIn1] = r1Prev[:, t]
            r1c[:, nIn1:] = context

            r[:, t] = self.phi(r1c.dot(self.hws[l]))
            context[...] = r[:, t]

        r1Prev = util.bias(r)
        rs.append(r)

    if returnContexts:
        return rs, contexts
    else:
        return rs
def gradient(self, x, g, returnError=True):
    x = np.asarray(x)
    g = np.asarray(g)

    probs = self.probs(x)
    delta = (probs - g) / probs.size
    grad = util.bias(x).T.dot(delta)

    gf = grad.ravel()

    if returnError:
        err = -np.mean(g * np.log(util.capZero(probs)))
        return err, gf
    else:
        return gf
def gradient(self, x, g, returnError=True):
    x = np.asarray(x)
    g = np.asarray(g)

    if self.flattenOut:
        g = g.ravel()

    x1 = util.bias(x)
    y = x1 @ self.weights
    if self.flattenOut:
        y = y.ravel()

    e = util.colmat(y - g)
    delta = 2.0 * e / e.size

    penMask = np.ones_like(self.weights)
    penMask[-1, :] = 0.0

    grad = (x1.T @ delta +
            self.elastic * 2.0 * self.penalty * penMask *
                self.weights / self.weights.size +
            (1.0 - self.elastic) * self.penalty * penMask *
                np.sign(self.weights) / self.weights.size)

    gf = grad.ravel()

    if returnError:
        wf = self.weights[:-1, :].ravel()
        error = (np.mean(e**2) +
                 self.elastic * self.penalty * (wf @ wf) / wf.size +         # L2-norm penalty
                 (1.0 - self.elastic) * self.penalty * np.mean(np.abs(wf)))  # L1-norm penalty
        return error, gf
    else:
        return gf
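# Standalone sketch (hypothetical names, numpy only) of the elastic-net penalty
# structure used above: 'elastic' blends a squared (L2) term with an absolute
# (L1) term, and the mask zeroes the last row so the bias is never penalized.
# A single 1/size scaling is used here for both the error and gradient terms,
# and the gradient is verified against central differences.
import numpy as np

rng = np.random.RandomState(0)
weights = rng.randn(6, 3)            # last row is the bias row
penalty, elastic, eps = 0.1, 0.7, 1.0e-6

penMask = np.ones_like(weights)
penMask[-1, :] = 0.0

def penaltyError(w):
    wf = w[:-1, :].ravel()
    return (elastic * penalty * wf.dot(wf) / wf.size +
            (1.0 - elastic) * penalty * np.mean(np.abs(wf)))

wf = weights[:-1, :].ravel()
penaltyGrad = (elastic * 2.0 * penalty * penMask * weights / wf.size +
               (1.0 - elastic) * penalty * penMask * np.sign(weights) / wf.size)

# central-difference check of a single non-bias entry
i, j = 1, 2
wp = weights.copy(); wp[i, j] += eps
wm = weights.copy(); wm[i, j] -= eps
numeric = (penaltyError(wp) - penaltyError(wm)) / (2.0 * eps)
print(np.isclose(penaltyGrad[i, j], numeric))   # True away from zeros of |.|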
def gradient(self, x, g, unrollSteps=10, returnError=True):
    x = util.segmat(x)
    g = util.segmat(g)

    # packed views of the hidden and visible gradient matrices
    pg, hg, vg = util.packedViews((self.hw.shape, self.vw.shape),
                                  dtype=self.dtype)

    x1 = util.bias(x)

    nSeg = x1.shape[0]
    nObs = x1.shape[1]
    nIn1 = x1.shape[2]

    h = np.empty((nSeg, nObs, self.nHidden), dtype=self.dtype)
    r = np.empty((nSeg, nObs, self.nHidden), dtype=self.dtype)
    x1c = np.empty((nSeg, nObs, nIn1 + self.nHidden), dtype=self.dtype)
    context = np.zeros((nSeg, self.nHidden), dtype=self.dtype)

    for t in range(nObs):
        x1c[:, t, :nIn1] = x1[:, t]
        x1c[:, t, nIn1:] = context

        h[:, t] = x1c[:, t].dot(self.hw)
        r[:, t] = self.phi(h[:, t])
        context[...] = r[:, t]

    r1 = util.bias(r)
    y = r1.dot(self.vw)

    rPrime = self.phi(h, 1)

    # error components, ditch transient
    e = (y - g)[:, self.transient:]
    delta = np.zeros(g.shape, dtype=self.dtype)
    delta[:, self.transient:] = 2.0 * e / e.size

    # visible layer gradient
    r1f = r1.reshape((-1, r1.shape[-1]))
    deltaf = delta.reshape((-1, delta.shape[-1]))
    vg[...] = r1f.T.dot(deltaf)

    vwDelta = delta.dot(self.vw[:-1].T)

    gamma = np.zeros((nSeg, unrollSteps, self.nHidden), dtype=self.dtype)
    #delta = np.zeros((nSeg, nObs-self.transient, self.nHidden), dtype=self.dtype)
    delta = np.zeros((nSeg, nObs, self.nHidden), dtype=self.dtype)

    ##hg[...] = 0.0

    # backward pass for hidden layer, unrolled through time
    #for t in range(nObs-self.transient-1, 0, -1):
    for t in range(nObs - 1, 0, -1):
        rPrimet = rPrime[:, t][:, None, :]
        #x1ct = x1c[:,t][:,None,:]
        ##x1ct = x1c[:,t]

        beta = gamma[:, :-1]
        beta = beta.dot(self.rw.T)

        gamma[:, 0] = vwDelta[:, t]
        gamma[:, 1:] = beta
        gamma *= rPrimet

        ##x1ctf = np.tile(x1ct, unrollSteps).reshape((-1, x1ct.shape[-1]))
        ##gammaf = gamma.reshape((-1, gamma.shape[-1]))

        delta[:, t] = gamma.sum(axis=1)

        #hg += x1ctf.T.dot(gammaf)
        ##hg += x1ct.T.dot(gamma.sum(axis=1))
        ##hg += x1ct.T.dot(gamma.swapaxes(0,1)).sum(axis=1)

    x1cf = x1c.reshape((-1, x1c.shape[-1]))
    deltaf = delta.reshape((-1, delta.shape[-1]))
    #hg[...] = x1c.reshape((-1, x1c.shape[-1])).T.dot(delta.reshape((-1, d.shape[-1])))
    hg[...] = x1cf.T.dot(deltaf)

    if returnError:
        return np.mean(e**2), pg
    else:
        return pg
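# Note on the truncated backpropagation-through-time loop above: at each step t
# (walking backward), gamma[:, 0] receives the output-layer error projected onto
# the hidden units at time t (vwDelta[:, t]), while gamma[:, 1:] carries the
# errors injected at times t+1 .. t+unrollSteps-1, pushed back one more step
# through the recurrent weights (beta = gamma[:, :-1].dot(self.rw.T)).  After
# scaling by the transfer-function derivative rPrime at time t, gamma[:, k]
# holds the error contribution that originated k steps in the future, and
# delta[:, t] = gamma.sum(axis=1) is the truncated total: contributions older
# than unrollSteps steps are simply dropped rather than propagated further.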
def gradient(self, x, g, returnError=True):
    x = util.segmat(x)
    g = util.colmat(g)

    # packed views of the hidden and visible gradient matrices
    views = util.packedViews(self.layerDims, dtype=self.dtype)
    pg = views[0]

    if self.nHidden is None:
        cgs = views[1:-1]
        hg = None
        vg = views[-1]
    else:
        cgs = views[1:-2]
        hg = views[-2]
        vg = views[-1]

    # forward pass
    c = x
    c1s = []
    cPrimes = []
    for l, cw in enumerate(self.cws):
        width = self.convWidths[l]
        phi = self.transFunc[l]

        c = util.timeEmbed(c, lags=width - 1, axis=1)

        c1 = util.bias(c)
        c1s.append(c1)

        h = util.segdot(c1, cw)

        cPrime = phi(h, 1)
        cPrimes.append(cPrime)

        c = phi(h)

    c1 = util.bias(c)

    # evaluate hidden and visible layers
    if self.nHidden is None:
        y = util.segdot(c1, self.vw)
    else:
        h = util.segdot(c1, self.hw)
        z1 = util.bias(self.transFunc[-1](h))
        zPrime = self.transFunc[-1](h, 1)
        y = util.segdot(z1, self.vw)

    # error components
    trim = (g.shape[1] - y.shape[1]) // 2
    gTrim = g[:, :(g.shape[1] - trim)]
    gTrim = gTrim[:, -y.shape[1]:]

    e = util.colmat(y - gTrim)
    delta = 2.0 * e / e.size

    if self.nHidden is None:
        # visible layer gradient
        c1f = c1.reshape((-1, c1.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        vg[...] = c1f.T.dot(deltaf)
        vg += self.penaltyGradient(-1)

        delta = util.segdot(delta, self.vw[:-1].T)
    else:
        # visible layer gradient
        z1f = z1.reshape((-1, z1.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        vg[...] = z1f.T.dot(deltaf)
        vg += self.penaltyGradient(-1)

        # hidden layer gradient
        c1f = c1.reshape((-1, c1.shape[-1]))
        delta = util.segdot(delta, self.vw[:-1].T) * zPrime
        deltaf = delta.reshape((-1, delta.shape[-1]))
        hg[...] = c1f.T.dot(deltaf)
        hg += self.penaltyGradient(-2)

        delta = util.segdot(delta, self.hw[:-1].T)

    # backward pass for convolutional layers
    for l in range(self.nConvLayers - 1, -1, -1):
        c1 = c1s[l]
        cPrime = cPrimes[l]

        delta = delta[:, :cPrime.shape[1]] * cPrime

        c1f = c1.reshape((-1, c1.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        cgs[l][...] = c1f.T.dot(deltaf)
        cgs[l] += self.penaltyGradient(l)

        if l > 0:  # won't propagate back to inputs
            delta = util.segdot(delta, self.cws[l][:-1].T)
            delta = deltaDeEmbedSum(delta, self.convWidths[l])

    if returnError:
        error = np.mean(e**2) + self.penaltyError()
        return error, pg
    else:
        return pg
def gradient(self, x, g, unrollSteps=10, returnError=True):
    x = util.segmat(x)
    g = util.segmat(g)

    if isinstance(unrollSteps, (int,)):
        unrollSteps = [unrollSteps,] * self.nRecLayers

    views = util.packedViews(self.layerDims, dtype=self.dtype)
    pg = views[0]
    hgs = views[1:-1]
    vg = views[-1]

    x1 = util.bias(x)

    nSeg = x1.shape[0]
    nObs = x1.shape[1]

    r1Prev = x1
    r1cs = []
    rPrimes = []

    for l in range(self.nRecLayers):
        nIn1 = r1Prev.shape[2]

        r = np.empty((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)
        h = np.empty((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)
        r1c = np.empty((nSeg, nObs, nIn1 + self.nRecHiddens[l]), dtype=self.dtype)
        context = np.zeros((nSeg, self.nRecHiddens[l]), dtype=self.dtype)

        for t in range(nObs):
            r1c[:, t, :nIn1] = r1Prev[:, t]
            r1c[:, t, nIn1:] = context

            h[:, t] = r1c[:, t].dot(self.hws[l])
            r[:, t] = self.phi(h[:, t])
            context[...] = r[:, t]

        r1Prev = util.bias(r)
        r1cs.append(r1c)

        rPrime = self.phi(h, 1)
        rPrimes.append(rPrime)

    # evaluate visible layer
    r1 = r1Prev
    y = r1.dot(self.vw)

    # error components, ditch transient
    e = (y - g)[:, self.transient:]
    delta = np.zeros(g.shape, dtype=self.dtype)
    delta[:, self.transient:] = 2.0 * e / e.size

    # visible layer gradient
    r1f = r1.reshape((-1, r1.shape[-1]))
    deltaf = delta.reshape((-1, delta.shape[-1]))
    vg[...] = r1f.T.dot(deltaf)

    # backward pass through each layer
    w = self.vw
    for l in range(self.nRecLayers - 1, -1, -1):
        r1c = r1cs[l]
        rwsTrans = self.rws[l].T
        rPrime = rPrimes[l]

        deltaPrev = delta.dot(w[:-1].T)

        gamma = np.zeros((nSeg, unrollSteps[l], self.nRecHiddens[l]), dtype=self.dtype)
        #delta = np.zeros((nSeg, nObs-self.transient, self.nRecHiddens[l]), dtype=self.dtype)
        delta = np.zeros((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)

        # unrolled through time
        #for t in range(nObs-self.transient-1, 0, -1):
        for t in range(nObs - 1, 0, -1):
            rPrimet = rPrime[:, t][:, None, :]

            beta = gamma[:, :-1]
            beta = beta.dot(rwsTrans)

            gamma[:, 0] = deltaPrev[:, t]
            gamma[:, 1:] = beta
            gamma *= rPrimet

            delta[:, t] = gamma.sum(axis=1)

        r1cf = r1c.reshape((-1, r1c.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        hgs[l][...] = r1cf.T.dot(deltaf)
        #print('hg %d: %f' % (l, np.sqrt(np.mean(hgs[l]**2))))

        w = self.iws[l]

    if returnError:
        return np.mean(e**2), pg
    else:
        return pg
def gradient(self, x, g, returnError=True):
    """Compute the gradient of the cross-entropy error with respect
    to the network weights for each layer, given inputs and targets.
    Useful for optimization routines that make use of first-order
    gradients.

    Args:
        x:              Input data.

        g:              Target values.

        returnError:    If True (default) then also return the
                        cross-entropy error.  This can improve
                        performance in some optimization routines
                        by avoiding an additional forward pass.

    Returns:
        If returnError is True, then return a tuple containing the
        error followed by a 1d numpy array containing the gradient
        of the packed weights.  If returnError is False, then only
        return the gradient.
    """
    x = np.asarray(x)
    g = np.asarray(g)

    # packed views of the hidden and visible gradient matrices
    views = util.packedViews(self.layerDims, dtype=self.dtype)
    pg = views[0]
    hgs = views[1:-1]
    vg = views[-1]

    # forward pass
    z1 = util.bias(x)
    z1s = [z1]
    zPrimes = []
    for hw, phi in zip(self.hws, self.transFunc):
        h = z1.dot(hw)
        z1 = util.bias(phi(h))
        z1s.append(z1)

        zPrime = phi(h, 1)
        zPrimes.append(zPrime)

    v = z1.dot(self.vw)
    probs = util.softmax(v)

    # error components
    delta = util.colmat(probs - g) / probs.size

    # visible layer gradient
    vg[...] = z1.T.dot(delta)
    vg += self.penaltyGradient(-1)

    # backward pass for hidden layers
    w = self.vw
    for l in range(self.nHLayers - 1, -1, -1):
        delta = delta.dot(w[:-1, :].T) * zPrimes[l]
        hgs[l][...] = z1s[l].T.dot(delta)
        hgs[l] += self.penaltyGradient(l)
        w = self.hws[l]

    if returnError:
        error = -np.mean(g * np.log(util.capZero(probs))) + self.penaltyError()
        return error, pg
    else:
        return pg
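# Illustrative sketch (hypothetical names; only numpy and scipy assumed) of how
# an (error, gradient) pair like the one returned above plugs into
# scipy.optimize.minimize with jac=True, so the optimizer reuses a single
# forward pass per evaluation.  For simplicity this uses a standalone softmax
# regression (no hidden layers) with the same cross-entropy / (probs - g) math.
import numpy as np
import scipy.optimize

rng = np.random.RandomState(1)
x = rng.randn(200, 5)
labels = rng.randint(0, 3, 200)
g = np.eye(3)[labels]                                    # one-hot targets

x1 = np.hstack((x, np.ones((x.shape[0], 1))))            # bias column

def errorAndGradient(wf):
    w = wf.reshape((x1.shape[1], g.shape[1]))
    v = x1 @ w
    v -= v.max(axis=1, keepdims=True)                    # numerical stability
    probs = np.exp(v)
    probs /= probs.sum(axis=1, keepdims=True)
    err = -np.mean(g * np.log(np.clip(probs, 1.0e-300, None)))
    grad = x1.T @ ((probs - g) / probs.size)
    return err, grad.ravel()

w0 = np.zeros(x1.shape[1] * g.shape[1])
result = scipy.optimize.minimize(errorAndGradient, w0, jac=True, method='L-BFGS-B')
print(result.fun)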