from itertools import product

import numpy

# reshape_transposed and nn_units are helpers from the surrounding package;
# this excerpt assumes they are importable alongside these definitions.


def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                        factor_ortho=0, weights_transposed=False):
    gradient = gradient.copy()
    # Elastic-net style regularization: l1_vs_l2 blends the L2 term (weight)
    # with the L1 term (0.5 * sign(weight)).
    gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight +
                              0.5 * l1_vs_l2 * numpy.sign(weight))
    if factor_ortho:
        # Orthogonality penalty: shift each row's gradient by the sum of the
        # other rows, scaled by factor_ortho / n_rows.
        col_sums = (reshape_transposed(weight).sum(axis=1)
                    if weights_transposed else weight.sum(axis=0))
        for i, row in enumerate(gradient):
            row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
    gradient *= lr
    return gradient
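# A minimal usage sketch for numpy_gradient_step, checking the two pure
# regularization modes on toy arrays. `_demo_gradient_step` is a hypothetical
# helper added for illustration; it assumes numpy_gradient_step is in scope.
def _demo_gradient_step():
    rng = numpy.random.RandomState(0)
    weight = rng.randn(4, 6)
    raw_grad = rng.randn(4, 6)
    # Pure L2 decay (l1_vs_l2=0): the regularizer adds factor_l12 * weight.
    step = numpy_gradient_step(weight, raw_grad, lr=0.1,
                               factor_l12=0.01, l1_vs_l2=0.0)
    assert numpy.allclose(step, 0.1 * (raw_grad + 0.01 * weight))
    # Pure L1 decay (l1_vs_l2=1): it adds 0.5 * factor_l12 * sign(weight).
    step = numpy_gradient_step(weight, raw_grad, lr=0.1,
                               factor_l12=0.01, l1_vs_l2=1.0)
    assert numpy.allclose(
        step, 0.1 * (raw_grad + 0.005 * numpy.sign(weight)))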
def numpy_err_input_update(self):
    """Backpropagate error (will compute err_input)."""
    if not self.need_err_input:
        return
    from scipy.signal import convolve2d

    self.err_input.map_invalidate()
    self.err_output.map_read()
    self.weights.map_read()

    batch_size = self.input.mem.shape[0]
    sy = self.input.mem.shape[1]
    sx = self.input.mem.shape[2]
    n_channels = self.input.mem.size // (batch_size * sx * sy)
    sx_full = self.padding[0] + sx + self.padding[2]
    sy_full = self.padding[1] + sy + self.padding[3]

    weights = (reshape_transposed(self.weights.mem)
               if self.weights_transposed else self.weights.mem)

    if not self.err_input_beta:
        self.err_input.mem[:] = 0
    else:
        self.err_input.mem *= self.err_input_beta
    err_input = numpy.zeros_like(self.err_input.mem)

    # Scatter the strided output error into a dense map so that a plain
    # full convolution can be applied below.
    sparse_err_output = numpy.zeros(
        (batch_size, sy_full - self.ky + 1, sx_full - self.kx + 1,
         self.n_kernels), dtype=self.err_output.dtype)
    for (batch, i, j, k), err in numpy.ndenumerate(self.err_output.mem):
        sparse_err_output[batch, i * self.sliding[1],
                          j * self.sliding[0], k] = err

    err_sample = numpy.zeros(
        (sy_full - self.ky + 1, sx_full - self.kx + 1),
        dtype=err_input.dtype)
    for batch, k in product(range(batch_size), range(self.n_kernels)):
        err_sample[:] = sparse_err_output[batch, :, :, k]
        cur_kernel = weights[k].reshape(self.ky, self.kx, n_channels)
        for ch in range(n_channels):
            err_input_full = convolve2d(err_sample, cur_kernel[:, :, ch],
                                        mode='full')
            # Crop the padded borders back to the input size.
            err_input[batch, :, :, ch] += err_input_full[
                self.padding[1]:(sy_full - self.padding[3]),
                self.padding[0]:(sx_full - self.padding[2])]

    self.err_input.mem += err_input * self.err_input_alpha
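# A minimal sketch of the identity numpy_err_input_update relies on: for a
# stride-1, unpadded, single-channel layer whose forward pass is a 'valid'
# cross-correlation, the input error is the 'full' convolution of the output
# error with the unflipped kernel. `_demo_err_input_identity` is a
# hypothetical helper added for illustration only.
def _demo_err_input_identity():
    from scipy.signal import convolve2d
    rng = numpy.random.RandomState(1)
    kernel = rng.randn(3, 3)
    err_out = rng.randn(3, 3)  # error at the 3x3 output of a 5x5 input
    # Explicit accumulation: each output cell scatters its error over the
    # input window it was computed from.
    err_in = numpy.zeros((5, 5))
    for i in range(3):
        for j in range(3):
            err_in[i:i + 3, j:j + 3] += err_out[i, j] * kernel
    # convolve2d flips the kernel, which matches the scatter above.
    assert numpy.allclose(err_in, convolve2d(err_out, kernel, mode='full'))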
def numpy_run(self):
    """Forward propagation from batch on CPU only."""
    self.input.map_read()
    self.weights.map_read()
    self.bias.map_read()
    self.output.map_invalidate()

    sx_full = self.padding[0] + self._sx + self.padding[2]
    sy_full = self.padding[1] + self._sy + self.padding[3]
    nx = (sx_full - self.kx) // self.sliding[0] + 1
    ny = (sy_full - self.ky) // self.sliding[1] + 1

    weights = (reshape_transposed(self.weights.mem)
               if self.weights_transposed else self.weights.mem)
    assert self.kx > 0 and self.ky > 0

    # One pass per batch item is enough: the windowed dot product below
    # already sums over the input channels.
    for batch in range(self._batch_size):
        for k, kernel in enumerate(weights):
            kernel_3d = kernel.reshape(self.ky, self.kx, self._n_channels)
            for i in range(ny):
                for j in range(nx):
                    # Window position in padded coordinates...
                    full_i1 = i * self.sliding[1]
                    full_i2 = full_i1 + self.ky
                    full_j1 = j * self.sliding[0]
                    full_j2 = full_j1 + self.kx
                    # ...clipped to the real (unpadded) input...
                    in_i1 = min(max(full_i1 - self.padding[1], 0), self._sy)
                    in_i2 = min(max(full_i2 - self.padding[1], 0), self._sy)
                    in_j1 = min(max(full_j1 - self.padding[0], 0), self._sx)
                    in_j2 = min(max(full_j2 - self.padding[0], 0), self._sx)
                    # ...and the matching sub-window of the kernel.
                    cut_i1, cut_i2 = (in_i1 - full_i1 + self.padding[1],
                                      in_i2 - full_i1 + self.padding[1])
                    cut_j1, cut_j2 = (in_j1 - full_j1 + self.padding[0],
                                      in_j2 - full_j1 + self.padding[0])
                    if in_i2 - in_i1 > 0 and in_j2 - in_j1 > 0:
                        cut = self.input.mem[batch, in_i1:in_i2,
                                             in_j1:in_j2]
                        cutted_kernel = kernel_3d[cut_i1:cut_i2,
                                                  cut_j1:cut_j2, :]
                        assert cut.size == cutted_kernel.size
                        conv = numpy.sum(cut.ravel() *
                                         cutted_kernel.ravel())
                    else:
                        # The window lies entirely inside the zero padding.
                        conv = 0
                    self.output.mem[batch, i, j, k] = conv

    # add bias and apply activation function
    self.apply_activation()
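# A small cross-check of the windowed dot product in numpy_run, assuming
# stride 1 and no padding: sliding an unflipped kernel with an elementwise
# product is scipy's 'valid' cross-correlation. `_demo_forward_window_dot`
# is a hypothetical helper added for illustration only.
def _demo_forward_window_dot():
    from scipy.signal import correlate2d
    rng = numpy.random.RandomState(2)
    x = rng.randn(6, 7)
    kernel = rng.randn(3, 3)
    ny, nx = x.shape[0] - 3 + 1, x.shape[1] - 3 + 1
    out = numpy.empty((ny, nx))
    for i in range(ny):
        for j in range(nx):
            out[i, j] = numpy.sum(x[i:i + 3, j:j + 3] * kernel)
    assert numpy.allclose(out, correlate2d(x, kernel, mode='valid'))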
def numpy_weights_update(self):
    self.input.map_read()
    self.err_output.map_read()
    self.weights.map_write()
    self.gradient_weights.map_write()
    self.accumulated_gradient_weights.map_write()

    dtype = self.weights.dtype
    sy = self.input.shape[1]
    sx = self.input.shape[2]
    n_channels = self.input.size // (self.input.shape[0] * sx * sy)
    sx_full = self.padding[0] + sx + self.padding[2]
    sy_full = self.padding[1] + sy + self.padding[3]
    nx = (sx_full - self.kx) // self.sliding[0] + 1
    ny = (sy_full - self.ky) // self.sliding[1] + 1
    sample_shape = (nx * ny, self.kx * self.ky * n_channels)

    # calculate gradient for weights
    gd_weights = (reshape_transposed(self.gradient_weights.mem)
                  if self.weights_transposed
                  else self.gradient_weights.mem)
    gd_weights[:] = 0
    cut = numpy.zeros((self.ky, self.kx, n_channels),
                      dtype=self.input.mem.dtype)
    sample = numpy.empty(sample_shape, dtype=dtype)
    for batch in range(self.current_batch_size):
        # input data unrolling (im2col): one row per sliding window
        for by in range(ny):
            for bx in range(nx):
                y1, y2 = (by * self.sliding[1],
                          by * self.sliding[1] + self.ky)
                x1, x2 = (bx * self.sliding[0],
                          bx * self.sliding[0] + self.kx)
                i1, i2 = (min(max(y1 - self.padding[1], 0), sy),
                          min(max(y2 - self.padding[1], 0), sy))
                j1, j2 = (min(max(x1 - self.padding[0], 0), sx),
                          min(max(x2 - self.padding[0], 0), sx))
                cut_i1, cut_i2 = (i1 - y1 + self.padding[1],
                                  i2 - y1 + self.padding[1])
                cut_j1, cut_j2 = (j1 - x1 + self.padding[0],
                                  j2 - x1 + self.padding[0])
                cut[:] = 0  # reuse the preallocated window buffer
                cut[cut_i1:cut_i2, cut_j1:cut_j2, :] = \
                    self.input.mem[batch, i1:i2, j1:j2, :].reshape(
                        i2 - i1, j2 - j1, n_channels)
                sample[by * nx + bx] = cut.ravel()
        err_out_shape = self.err_output.mem.shape
        out = self.err_output.mem[batch].reshape(
            err_out_shape[1] * err_out_shape[2], self.n_kernels)
        gd_weights += numpy.dot(out.transpose(), sample)
    if self.weights_transposed:
        gd_weights = reshape_transposed(gd_weights)

    # update weights
    lr = self.learning_rate
    factor_l12 = self.weights_decay
    l1_vs_l2 = self.l1_vs_l2
    gradient = -nn_units.GradientDescentBase.numpy_gradient_step(
        self.weights.mem, gd_weights, lr, factor_l12, l1_vs_l2,
        self.factor_ortho, self.weights_transposed)
    if self.accumulate_gradient == self.OP_NONE:
        pass
    elif self.accumulate_gradient == self.OP_STORE:
        self.accumulated_gradient_weights.mem[:] = gradient
    elif self.accumulate_gradient == self.OP_ADD:
        self.accumulated_gradient_weights.mem[:] += gradient
    elif self.accumulate_gradient == self.OP_FLUSH:
        gradient += self.accumulated_gradient_weights.mem
        self.accumulated_gradient_weights.mem[:] = 0
    else:
        raise ValueError("Incorrect accumulate_gradient attribute value")
    if self.gradient_weights_with_moment:
        gradient += (self.gradient_weights_with_moment.mem *
                     self.gradient_moment)
    self.gradient_weights.mem[:] = gradient
    if self.apply_gradient:
        self.weights.mem += gradient
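# A toy version of the unrolling in numpy_weights_update, assuming one
# channel, one kernel, stride 1 and no padding: the weight gradient is the
# flattened output error times the im2col matrix.
# `_demo_weights_gradient_im2col` is a hypothetical helper for illustration.
def _demo_weights_gradient_im2col():
    rng = numpy.random.RandomState(3)
    x = rng.randn(5, 5)
    err_out = rng.randn(3, 3)  # error for the 3x3 output of a 3x3 kernel
    ny = nx = 3
    # im2col: one row per sliding window, one column per kernel element.
    sample = numpy.empty((ny * nx, 9))
    for by in range(ny):
        for bx in range(nx):
            sample[by * nx + bx] = x[by:by + 3, bx:bx + 3].ravel()
    gd_weights = numpy.dot(err_out.reshape(1, ny * nx), sample)
    # Direct accumulation over windows gives the same gradient.
    direct = numpy.zeros((3, 3))
    for i in range(ny):
        for j in range(nx):
            direct += err_out[i, j] * x[i:i + 3, j:j + 3]
    assert numpy.allclose(gd_weights.reshape(3, 3), direct)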