def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    inputs = buffers.inputs.default
    targets = buffers.inputs.targets
    predictions = buffers.outputs.predictions
    loss = buffers.outputs.loss

    # reshape
    flat_inputs = flatten_all_but_last(inputs)
    flat_probs = flatten_all_but_last(predictions)
    flat_loss = flatten_all_but_last(loss)
    flat_targets = flatten_all_but_last(targets)

    # softmax
    _h.softmax_m(flat_inputs, flat_probs)

    # the multinomial cross entropy error is given by
    # - sum over i: p_i * ln(y_i)
    # now our targets are indices so all p_i = 0 except for i=t
    _h.fill(loss, 0.)
    _h.index_m_by_v(flat_probs, flat_targets, flat_loss)
    _h.clip_t(flat_loss, 1e-6, 1.0, flat_loss)
    _h.log_t(loss, loss)
    _h.mult_st(-1, loss, loss)
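# In NumPy terms, the handler calls above amount to a row-wise softmax followed
# by picking the probability of the target class and taking its negative log.
# Minimal sketch, assuming plain NumPy arrays; `np_softmax_ce` is a
# hypothetical helper used only for illustration.
import numpy as np

def np_softmax_ce(flat_inputs, flat_targets):
    shifted = flat_inputs - flat_inputs.max(axis=1, keepdims=True)  # stability
    exp = np.exp(shifted)
    probs = exp / exp.sum(axis=1, keepdims=True)                    # softmax_m
    rows = np.arange(flat_inputs.shape[0])
    picked = probs[rows, flat_targets.astype(int).ravel()]          # index_m_by_v
    loss = -np.log(np.clip(picked, 1e-6, 1.0))                      # clip, log, negate
    return probs, loss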
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    probs = buffers.outputs.predictions
    dinputs = buffers.input_deltas.default
    dloss = buffers.output_deltas.loss
    temp_dinputs = buffers.internals.temp_dinputs
    softmax_deriv = buffers.internals.softmax_deriv

    # reshape
    flat_probs = flatten_all_but_last(probs)
    flat_softmax_deriv = flatten_all_but_last(softmax_deriv)
    flat_dloss = flatten_all_but_last(dloss)
    flat_dinputs = flatten_all_but_last(dinputs)
    flat_temp_dinputs = flatten_all_but_last(temp_dinputs)

    # general softmax derivative
    _h.softmax_deriv_m(flat_probs, flat_temp_dinputs, flat_softmax_deriv)

    # Multiply with the sequence-wise loss.
    # Multiplication requires "manual broadcasting" so that it works with
    # the PyCuda handler.
    for time in range(softmax_deriv.shape[0]):
        sub_softmax_deriv = softmax_deriv[time, :, :]
        _h.mult_mv(sub_softmax_deriv, flat_dloss, sub_softmax_deriv)

    _h.add_tt(flat_softmax_deriv, flat_dinputs, flat_dinputs)
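# Hedged NumPy sketch of the general softmax derivative used above: for softmax
# outputs y and upstream deltas d, the row-wise Jacobian-vector product is
# y * (d - sum(y * d)). The helper name is illustrative only.
import numpy as np

def np_softmax_deriv(flat_probs, flat_upstream_deltas):
    inner = (flat_probs * flat_upstream_deltas).sum(axis=1, keepdims=True)
    return flat_probs * (flat_upstream_deltas - inner)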
def forward_pass(self, buffers, training_pass=True):
    _h = self.handler
    sigma_b, centered, x_hat = buffers.internals
    gamma, beta, mu, sigma = buffers.parameters

    # Note: we flatten time for all buffers, so we skip the flat_ prefix
    inputs = flatten_all_but_last(buffers.inputs.default)
    centered = flatten_all_but_last(centered)
    x_hat = flatten_all_but_last(x_hat)
    out = flatten_all_but_last(buffers.outputs.default)
    m = inputs.shape[0]

    if training_pass:
        mu_b = sigma_b  # temporarily reuse this buffer under another name
        # Calculate the (negative) batch mean
        _h.sum_t(inputs, 0, mu_b)
        _h.mult_st(-1.0 / m, mu_b, mu_b)

        # Adjust mu as an exponential moving average
        # TODO: Find better way
        _h.mult_st(self.decay, mu, mu)
        _h.mult_add_st(1.0 - self.decay, mu_b, mu)
        mu = mu_b

    # Calculate the centered activations
    _h.add_mv(inputs, mu.reshape((1, mu.size)), centered)

    if training_pass:
        sigma2 = sigma_b    # temporarily reuse this buffer under another name
        centered2 = x_hat   # temporarily reuse this buffer under another name
        # Calculate the variance
        _h.mult_tt(centered, centered, centered2)
        _h.sum_t(centered2, 0, sigma2)
        _h.mult_st(1.0 / m, sigma2, sigma2)  # TODO: m-1 instead?
        _h.add_st(self.epsilon, sigma2, sigma2)  # (numerically stabilized)

        # Standard deviation
        _h.sqrt_t(sigma2, sigma_b)

        # Adjust sigma as an exponential moving average
        # FIXME: This is clearly a hack and wrong
        _h.mult_st(self.decay, sigma, sigma)
        _h.mult_add_st(1.0 - self.decay, sigma_b, sigma)
        sigma = sigma_b

    # compute normalized inputs
    _h.divide_mv(centered, sigma.reshape((1, sigma.size)), x_hat)

    # Compute outputs
    _h.mult_mv(x_hat, gamma.reshape((1, gamma.size)), out)
    _h.add_mv(out, beta.reshape((1, beta.size)), out)
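# Minimal NumPy sketch (hypothetical reference, not the layer itself) of the
# training-pass arithmetic above: batch mean, centering, stabilized standard
# deviation, normalization, and the scale-and-shift with gamma and beta. The
# running-average bookkeeping for mu and sigma is left out.
import numpy as np

def np_batch_norm_forward(inputs, gamma, beta, epsilon=1e-5):
    mu_b = inputs.mean(axis=0)                                 # batch mean
    centered = inputs - mu_b                                   # centered activations
    sigma_b = np.sqrt((centered ** 2).mean(axis=0) + epsilon)  # stabilized std
    x_hat = centered / sigma_b                                 # normalized inputs
    return gamma * x_hat + beta                                # scale and shift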
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    inputs = buffers.inputs.default
    labels = buffers.inputs.labels
    if 'mask' in buffers.inputs.keys():
        mask = buffers.inputs.mask
    else:
        mask = None
    predictions = buffers.outputs.predictions
    loss = buffers.outputs.loss
    temp_dinputs = buffers.internals.temp_dinputs

    # reshape
    flat_inputs = flatten_all_but_last(inputs)
    flat_probs = flatten_all_but_last(predictions)

    # softmax
    _h.softmax_m(flat_inputs, flat_probs)

    # At this point the softmax is computed and the CTC code begins.
    # The predictions buffer has already been filled above.
    # Here we compute the loss and save the deltas to temp_dinputs so they can
    # be reused in the backward pass. All of this is done sequence by sequence
    # and currently does not parallelize.
    for sequence in range(inputs.shape[1]):
        if mask is not None:
            this_mask = mask[:, sequence, 0].astype(int)  # TODO: astype OK?
            mask_zero_index = _h.get_final_zeros_index_v(this_mask)
            # these_predictions = predictions[this_mask, sequence, :]
            these_predictions = predictions[0:mask_zero_index, sequence, :]
        else:
            these_predictions = predictions[:, sequence, :]

        these_uncut_labels = labels[:, sequence, 0].astype(np.int64)
        final_zero_index = _h.get_final_zeros_index_v(these_uncut_labels)
        these_cut_labels = these_uncut_labels[0:final_zero_index]

        these_deltas = _h.allocate(these_predictions.shape)
        this_error = _h.calculate_ctc(these_predictions, these_cut_labels,
                                      these_deltas)
        # TODO: fold the "minus one" into calculate_ctc?
        _h.mult_st(-1, these_deltas, these_deltas)

        # TODO annoying: a single float does not work on the GPU
        loss[sequence, 0] = np.array(this_error, dtype=loss.dtype)

        if mask is not None:
            # temp_dinputs[this_mask.astype(bool), sequence, :] = these_deltas
            temp_dinputs[0:mask_zero_index, sequence, :] = these_deltas
        else:
            temp_dinputs[:, sequence, :] = these_deltas
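# Hedged NumPy sketch of the per-sequence label handling above: labels are
# zero-padded, and `first_trailing_zero` (a hypothetical stand-in for
# get_final_zeros_index_v, assuming it returns the index where the trailing
# padding begins) is used to cut them before calling calculate_ctc.
import numpy as np

def first_trailing_zero(v):
    nonzero = np.nonzero(v)[0]
    return int(nonzero[-1]) + 1 if nonzero.size else 0

example_labels = np.array([3, 1, 4, 1, 0, 0, 0])
cut_labels = example_labels[:first_trailing_zero(example_labels)]
# cut_labels == array([3, 1, 4, 1]); only these would be passed to calculate_ctc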
def conv2d_forward_batch(self, inputs, params, bias, outputs,
                         padding, stride):
    num_filters = params.shape[0]
    num_images, input_rows, input_cols, num_input_maps = inputs.shape
    kernel_shape = params.shape[1:]
    num_output_pixels = outputs.shape[1] * outputs.shape[2]
    num_kernel_params = np.prod(kernel_shape)
    out_shape = (num_output_pixels, num_filters)
    num_cuda_kernels = num_output_pixels * num_input_maps

    for i in range(num_images):
        col = self.zeros((num_output_pixels, num_kernel_params))
        _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                          np.int32(input_rows), np.int32(input_cols),
                          np.int32(kernel_shape[0]), np.int32(kernel_shape[1]),
                          np.int32(padding), np.int32(padding),
                          np.int32(stride[0]), np.int32(stride[1]),
                          np.int32(outputs.shape[2]), np.int32(num_input_maps),
                          col.gpudata,
                          block=(NUM_CUDA_THREADS, 1, 1),
                          grid=(get_blocks(num_cuda_kernels), 1))
        reshaped_params = params.reshape(num_filters, num_kernel_params)
        culinalg.dot(col, reshaped_params, transb='T',
                     out=outputs[i].reshape(out_shape))

    flat_outputs = flatten_all_but_last(outputs)
    self.add_mv(flat_outputs, bias, flat_outputs)
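# CPU reference sketch (NumPy only, hypothetical) of the im2col + GEMM scheme
# above for a single image, assuming zero padding and that the patch layout
# matches the filter layout; it mirrors building `col`, multiplying by the
# reshaped filter bank, and adding the bias.
import numpy as np

def np_conv2d_single(image, params, bias, stride=(1, 1)):
    in_rows, in_cols, in_maps = image.shape
    n_filters, k_rows, k_cols, _ = params.shape
    out_rows = (in_rows - k_rows) // stride[0] + 1
    out_cols = (in_cols - k_cols) // stride[1] + 1
    col = np.zeros((out_rows * out_cols, k_rows * k_cols * in_maps))
    for r in range(out_rows):
        for c in range(out_cols):
            patch = image[r * stride[0]:r * stride[0] + k_rows,
                          c * stride[1]:c * stride[1] + k_cols, :]
            col[r * out_cols + c] = patch.ravel()
    out = col.dot(params.reshape(n_filters, -1).T) + bias  # GEMM, then bias
    return out.reshape(out_rows, out_cols, n_filters)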
def backward_pass(self, buffers):
    _h = self.handler
    sigma_b, centered, x_hat = buffers.internals
    gamma = buffers.parameters.gamma
    dgamma = buffers.gradients.gamma
    dbeta = buffers.gradients.beta

    # Note: we flatten time for all buffers, so we skip the flat_ prefix
    x_hat = flatten_all_but_last(x_hat)
    outdeltas = flatten_all_but_last(buffers.output_deltas.default)
    indeltas = flatten_all_but_last(buffers.input_deltas.default)
    m = outdeltas.shape[0]

    big_tmp = _h.allocate(x_hat.shape)    # big
    small_tmp = _h.allocate(gamma.shape)  # small

    # ------------- Gradients ---------------
    # Calculate dgamma
    tmp = big_tmp
    dgamma_tmp = small_tmp
    _h.mult_tt(outdeltas, x_hat, tmp)
    _h.sum_t(tmp, axis=0, out=dgamma_tmp)
    _h.add_tt(dgamma_tmp, dgamma, dgamma)

    _h.mult_st(1.0 / m, dgamma_tmp, dgamma_tmp)
    term1 = big_tmp
    _h.mult_mv(x_hat, dgamma_tmp.reshape((1, gamma.size)), term1)

    # Calculate dbeta
    dbeta_tmp = small_tmp
    _h.sum_t(outdeltas, axis=0, out=dbeta_tmp)
    _h.add_tt(dbeta_tmp, dbeta, dbeta)
    _h.mult_st(1.0 / m, dbeta_tmp, dbeta_tmp)

    # ------------- Deltas ---------------
    term2 = big_tmp
    term3 = big_tmp
    _h.subtract_tt(outdeltas, term1, term2)
    _h.subtract_mv(term2, dbeta_tmp.reshape((1, dbeta.size)), term3)

    # get normalization factor (gamma / sigma_b)
    coeff = small_tmp
    _h.divide_tt(gamma, sigma_b, coeff)

    term4 = big_tmp
    _h.mult_mv(term3, coeff.reshape((1, coeff.size)), term4)
    _h.add_tt(term4, indeltas, indeltas)
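# NumPy sketch (hypothetical reference) of the delta computation above, which
# works out to dx = (gamma / sigma_b) * (dy - x_hat * mean(dy * x_hat) - mean(dy)),
# with the means taken over the batch axis.
import numpy as np

def np_batch_norm_backward(outdeltas, x_hat, gamma, sigma_b):
    m = outdeltas.shape[0]
    dgamma = (outdeltas * x_hat).sum(axis=0)
    dbeta = outdeltas.sum(axis=0)
    indeltas = (gamma / sigma_b) * (
        outdeltas - x_hat * (dgamma / m) - (dbeta / m))
    return indeltas, dgamma, dbeta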
def forward_pass(self, buffers, training_pass=True):
    # prepare
    _h = self.handler
    inputs = buffers.inputs.default
    targets = buffers.inputs.targets
    predictions = buffers.outputs.predictions
    loss = buffers.outputs.loss

    # reshape
    flat_inputs = flatten_all_but_last(inputs)
    flat_probs = flatten_all_but_last(predictions)
    flat_loss = flatten_all_but_last(loss)
    flat_targets = flatten_all_but_last(targets)

    # softmax
    _h.softmax_m(flat_inputs, flat_probs)

    # the multinomial cross entropy error is given by
    # - sum over i: p_i * ln(y_i)
    _h.copy_to(flat_probs, flat_loss)
    _h.clip_t(flat_loss, 1e-6, 1.0, flat_loss)
    _h.log_t(flat_loss, flat_loss)
    _h.mult_tt(flat_loss, flat_targets, flat_loss)
    _h.mult_st(-1, loss, loss)
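# NumPy sketch of the one-hot variant above: with target distributions t and
# predicted probabilities y, the element-wise loss is -t * log(clip(y, 1e-6, 1)).
# The helper name is illustrative only.
import numpy as np

def np_softmax_ce_onehot(flat_probs, flat_targets):
    return -flat_targets * np.log(np.clip(flat_probs, 1e-6, 1.0))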
def backward_pass(self, buffers):
    # prepare
    _h = self.handler
    targets = buffers.inputs.targets
    probs = buffers.outputs.predictions
    dinputs = buffers.input_deltas.default
    dloss = buffers.output_deltas.loss
    t_bin = buffers.internals.t_bin

    # reshape
    flat_probs = flatten_all_but_last(probs)
    flat_targets = flatten_all_but_last(targets)
    flat_t_bin = flatten_all_but_last(t_bin)
    flat_dloss = flatten_all_but_last(dloss)
    flat_dinputs = flatten_all_but_last(dinputs)

    # derivative of multinomial cross-entropy error wrt softmax:
    # y - t
    _h.binarize_v(flat_targets, flat_t_bin)
    _h.mult_st(-1, flat_t_bin, flat_t_bin)
    _h.add_tt(flat_t_bin, flat_probs, flat_t_bin)
    _h.mult_mv(flat_t_bin, flat_dloss, flat_t_bin)
    _h.add_tt(flat_t_bin, flat_dinputs, flat_dinputs)
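# NumPy sketch of the y - t rule above, assuming index targets: binarize_v
# corresponds to building a one-hot matrix, and the resulting delta is the
# predictions minus that one-hot encoding, scaled by the incoming loss deltas.
import numpy as np

def np_softmax_ce_backward(flat_probs, flat_targets, flat_dloss):
    t_bin = np.zeros_like(flat_probs)
    rows = np.arange(flat_probs.shape[0])
    t_bin[rows, flat_targets.astype(int).ravel()] = 1.0
    return (flat_probs - t_bin) * flat_dloss  # broadcasts over the feature axis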