import numpy as np
import cudarray as ca  # the `ca` array module these snippets assume

# Assumed definition: smallest positive float32, used to clip predictions
# away from zero before taking logs.
_FLT_MIN = np.finfo(np.float32).tiny


def bprop(self, y_grad, h_grad):
    n = self.n_hidden
    # The hidden-state gradient combines the output gradient with the
    # gradient arriving from the next time step.
    h_grad = h_grad + y_grad
    # h = u * c + (1 - u) * h_tm1, so split the gradient over the
    # candidate c, the update gate u, and the previous state h_tm1.
    c_grad = h_grad * self._tmp_u
    u_grad = h_grad * (self._tmp_c - self._tmp_h_tm1)
    h_grad *= 1 - self._tmp_u
    c_grad = ca.ascontiguousarray(ca.transpose(c_grad))
    u_grad = ca.ascontiguousarray(ca.transpose(u_grad))
    # Backprop through the gate activations; bias gradients are the row
    # sums of the activation gradients.
    c_grad = self.act_c.bprop(c_grad)
    ca.sum(c_grad, axis=1, keepdims=True, out=self.b_c.grad_array)
    u_grad = self.act_u.bprop(u_grad)
    ca.sum(u_grad, axis=1, keepdims=True, out=self.b_u.grad_array)
    r_grad = c_grad * self._tmp_h_c
    r_grad = self.act_r.bprop(r_grad)
    ca.sum(r_grad, axis=1, keepdims=True, out=self.b_r.grad_array)
    # Stack the reset/update/candidate gradients so the weight gradients
    # come out of single matrix products.
    stack_grad = ca.empty((n * 3, y_grad.shape[0]))
    stack_grad[:n, :] = r_grad
    stack_grad[n:n * 2, :] = u_grad
    stack_grad[n * 2:n * 3, :] = c_grad
    ca.dot(self._tmp_x.T, stack_grad.T, out=self.w_x.grad_array)
    x_grad = ca.dot(stack_grad.T, self.w_x.array.T)
    # The candidate path sees h_tm1 gated by the reset gate r.
    stack_grad[n * 2:n * 3, :] *= self._tmp_r
    ca.dot(self._tmp_h_tm1.T, stack_grad.T, out=self.w_h.grad_array)
    h_grad += ca.dot(stack_grad.T, self.w_h.array.T)
    # Clip the recurrent gradient to limit exploding gradients.
    ca.clip(h_grad, -self.clip, self.clip, out=h_grad)
    return {'x_grad': x_grad, 'h_grad': h_grad}

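# A minimal finite-difference sanity check for bprop-style code, sketched
# in NumPy. Everything here (check_grad and the tanh example) is a
# hypothetical stand-in, not part of the layer API above.
def check_grad(f, grad, x, eps=1e-6, tol=1e-4):
    # Compare an analytic gradient against central differences.
    num_grad = np.zeros_like(x)
    for i in np.ndindex(*x.shape):
        x_p, x_m = x.copy(), x.copy()
        x_p[i] += eps
        x_m[i] -= eps
        num_grad[i] = (f(x_p) - f(x_m)) / (2 * eps)
    return np.max(np.abs(num_grad - grad(x))) < tol

# Example: d/dx sum(tanh(x)) = 1 - tanh(x)**2.
assert check_grad(lambda x: np.sum(np.tanh(x)),
                  lambda x: 1 - np.tanh(x) ** 2,
                  np.random.randn(4, 3))
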
def fprop(self):
    pred = self.x.out
    target = self.target.out
    if self.clip:
        # Keep predictions strictly inside (0, 1) so both logs are finite.
        ca.clip(pred, _FLT_MIN, .9999999, pred)
    self.out = -ca.sum(target * ca.log(pred)
                       + (1 - target) * ca.log(1 - pred))

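# A NumPy reference for the same binary cross-entropy sum; a sketch for
# checking the cudarray version above, not part of the original module.
def binary_cross_entropy_sum(pred, target, eps=np.finfo(np.float32).tiny):
    pred = np.clip(pred, eps, 1 - eps)  # keep both logs finite
    return -np.sum(target * np.log(pred) + (1 - target) * np.log(1 - pred))
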
def categorical_cross_entropy(y_pred, y_true, eps=1e-15):
    # Assumes one-hot encoding.
    y_pred = ca.clip(y_pred, eps, 1 - eps)
    # XXX: do we need to normalize?
    y_pred /= ca.sum(y_pred, axis=1, keepdims=True)
    loss = -ca.sum(y_true * ca.log(y_pred), axis=1)
    return loss

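# Worked example (NumPy stand-in for ca): with one-hot targets the loss
# reduces to -log of the probability assigned to the true class.
y_pred = np.clip(np.array([[0.7, 0.2, 0.1]]), 1e-15, 1 - 1e-15)
y_pred /= np.sum(y_pred, axis=1, keepdims=True)
y_true = np.array([[1.0, 0.0, 0.0]])
print(-np.sum(y_true * np.log(y_pred), axis=1))  # [0.3567] == -log(0.7)
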
def fprop(self):
    ca.clip(self.x.out, self.a_min, self.a_max, out=self.out)

def fprop(self):
    ca.clip(self.x.array, self.a_min, self.a_max, out=self.array)

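# Both fprop variants clamp elementwise into [a_min, a_max]; cudarray's
# clip is assumed to mirror NumPy semantics, illustrated here with NumPy:
print(np.clip(np.array([-2.0, 0.5, 3.0]), -1.0, 1.0))  # [-1.   0.5  1. ]
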
def grad(self):
    pred = self.x.out
    target = self.target.out
    if self.clip:
        ca.clip(pred, _FLT_MIN, .9999999, pred)
    # dL/dpred for L = -sum(t * log(p) + (1 - t) * log(1 - p)).
    self.x.out_grad = -(target / pred - (1 - target) / (1 - pred))

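# Quick NumPy check that the analytic gradient above matches a central
# difference (p and t are arbitrary test values, not library state).
p, t, eps = 0.3, 1.0, 1e-6

def bce(q):
    return -(t * np.log(q) + (1 - t) * np.log(1 - q))

assert abs(-(t / p - (1 - t) / (1 - p))
           - (bce(p + eps) - bce(p - eps)) / (2 * eps)) < 1e-5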