Example #1
    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs = buffers.inputs.default
        targets = buffers.inputs.targets
        probabilities = buffers.outputs.probabilities
        loss = buffers.outputs.loss

        # reshape
        flat_inputs = flatten_all_but_last(inputs)
        flat_probs = flatten_all_but_last(probabilities)
        flat_loss = flatten_all_but_last(loss)
        flat_targets = flatten_all_but_last(targets)

        # softmax
        _h.softmax_m(flat_inputs, flat_probs)

        # the multinomial cross entropy error is given by
        # - sum over i: p_i * ln(y_i)
        # now our targets are indices so all p_i = 0 except for i=t
        _h.fill(loss, 0.)
        _h.index_m_by_v(flat_probs, flat_targets, flat_loss)
        _h.clip_t(flat_loss, 1e-6, 1.0, flat_loss)
        _h.log_t(loss, loss)
        _h.mult_st(-1, loss, loss)
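For reference, the computation above can be written directly in NumPy: a row-wise softmax followed by the negative log of the (clipped) probability assigned to each target index. This is only an illustrative sketch of what the handler calls compute; the function and argument names below are made up for the example and are not part of the library.

    import numpy as np

    def softmax_ce_forward_sketch(flat_inputs, flat_targets):
        # row-wise softmax (shifted for numerical stability)
        shifted = flat_inputs - flat_inputs.max(axis=1, keepdims=True)
        exp = np.exp(shifted)
        probs = exp / exp.sum(axis=1, keepdims=True)
        # pick the probability of the target class in each row,
        # clip as above, and take the negative log
        idx = flat_targets.astype(int).ravel()
        picked = probs[np.arange(probs.shape[0]), idx]
        loss = -np.log(np.clip(picked, 1e-6, 1.0))
        return probs, loss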
Example #2
    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs = buffers.inputs.default
        targets = buffers.inputs.targets
        predictions = buffers.outputs.predictions
        loss = buffers.outputs.loss

        # reshape
        flat_inputs = flatten_all_but_last(inputs)
        flat_probs = flatten_all_but_last(predictions)
        flat_loss = flatten_all_but_last(loss)
        flat_targets = flatten_all_but_last(targets)

        # softmax
        _h.softmax_m(flat_inputs, flat_probs)

        # the multinomial cross entropy error is given by
        # - sum over i: p_i * ln(y_i)
        # now our targets are indices so all p_i = 0 except for i=t
        _h.fill(loss, 0.)
        _h.index_m_by_v(flat_probs, flat_targets, flat_loss)
        _h.clip_t(flat_loss, 1e-6, 1.0, flat_loss)
        _h.log_t(loss, loss)
        _h.mult_st(-1, loss, loss)
Example #3
    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        probs = buffers.outputs.predictions

        dinputs = buffers.input_deltas.default
        dloss = buffers.output_deltas.loss
        temp_dinputs = buffers.internals.temp_dinputs
        softmax_deriv = buffers.internals.softmax_deriv

        # reshape
        flat_probs = flatten_all_but_last(probs)
        flat_softmax_deriv = flatten_all_but_last(softmax_deriv)
        flat_dloss = flatten_all_but_last(dloss)
        flat_dinputs = flatten_all_but_last(dinputs)
        flat_temp_dinputs = flatten_all_but_last(temp_dinputs)

        # general softmax derivative
        _h.softmax_deriv_m(flat_probs, flat_temp_dinputs, flat_softmax_deriv)

        # Multiply with the sequence-wise loss deltas.
        # The multiplication needs "manual broadcasting" so that it also works
        # with the PyCuda handler.
        for time in range(softmax_deriv.shape[0]):
            sub_softmax_deriv = softmax_deriv[time,:,:]
            _h.mult_mv(sub_softmax_deriv, flat_dloss, sub_softmax_deriv)

        _h.add_tt(flat_softmax_deriv, flat_dinputs, flat_dinputs)
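The softmax_deriv_m call above backpropagates the deltas stored in temp_dinputs through the softmax. Assuming it implements the usual softmax Jacobian-vector product (an assumption based on the name, not on the handler's documentation), a plain NumPy sketch of that single step is:

    import numpy as np

    def softmax_backward_sketch(flat_probs, flat_temp_dinputs):
        # row-wise Jacobian-vector product of the softmax:
        # dx_i = y_i * (dy_i - sum_j y_j * dy_j)
        inner = np.sum(flat_probs * flat_temp_dinputs, axis=1, keepdims=True)
        return flat_probs * (flat_temp_dinputs - inner)

The per-time-step loop in the example then scales each slice of the result by the sequence-wise loss delta, an explicit broadcast that the PyCuda handler cannot do implicitly.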
Example #4
    def forward_pass(self, buffers, training_pass=True):
        _h = self.handler
        sigma_b, centered, x_hat = buffers.internals
        gamma, beta, mu, sigma = buffers.parameters
        # Note: we flatten time for all buffers, so we skip the flat_ prefix
        inputs = flatten_all_but_last(buffers.inputs.default)
        centered = flatten_all_but_last(centered)
        x_hat = flatten_all_but_last(x_hat)
        out = flatten_all_but_last(buffers.outputs.default)
        m = inputs.shape[0]

        if training_pass:
            mu_b = sigma_b  # temporarily reuse this buffer under another name
            # Calculate the (negative) batch mean
            _h.sum_t(inputs, 0, mu_b)
            _h.mult_st(-1.0 / m, mu_b, mu_b)

            # Adjust mu as an exponential moving average
            # TODO: Find better way
            _h.mult_st(self.decay, mu, mu)
            _h.mult_add_st(1.0 - self.decay, mu_b, mu)

            mu = mu_b

        # Calculate the centered activations
        _h.add_mv(inputs, mu.reshape((1, mu.size)), centered)

        if training_pass:
            sigma2 = sigma_b  # temporarily reuse this buffer under another name
            centered2 = x_hat  # temporarily reuse this buffer under another name
            # Calculate the variance
            _h.mult_tt(centered, centered, centered2)
            _h.sum_t(centered2, 0, sigma2)
            _h.mult_st(1.0 / m, sigma2, sigma2)  # TODO m-1 instead?
            _h.add_st(self.epsilon, sigma2, sigma2)  # (numerically stabilized)

            # Standard deviation
            _h.sqrt_t(sigma2, sigma_b)

            # Adjust sigma as an exponential moving average
            # FIXME: This is clearly a hack and wrong
            _h.mult_st(self.decay, sigma, sigma)
            _h.mult_add_st(1.0 - self.decay, sigma_b, sigma)

            sigma = sigma_b

        # compute normalized inputs
        _h.divide_mv(centered, sigma.reshape((1, sigma.size)), x_hat)

        # Compute outputs
        _h.mult_mv(x_hat, gamma.reshape((1, gamma.size)), out)
        _h.add_mv(out, beta.reshape((1, beta.size)), out)
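Stripped of the buffer reuse and the in-place handler calls, the training-time path above is standard batch normalization. A minimal NumPy sketch, assuming the handler primitives behave as their names suggest (the exponential moving averages of mu and sigma kept for inference are omitted; note that the layer stores the negative mean and adds it, where this sketch simply subtracts):

    import numpy as np

    def batch_norm_forward_sketch(inputs, gamma, beta, epsilon):
        # batch statistics over the flattened (time * batch) axis
        mu_b = inputs.mean(axis=0)
        centered = inputs - mu_b
        sigma_b = np.sqrt((centered ** 2).mean(axis=0) + epsilon)
        x_hat = centered / sigma_b
        return gamma * x_hat + beta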
Example #6
    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs = buffers.inputs.default
        labels = buffers.inputs.labels
        if 'mask' in buffers.inputs.keys():
            mask = buffers.inputs.mask
        else:
            mask = None
        predictions = buffers.outputs.predictions
        loss = buffers.outputs.loss

        temp_dinputs = buffers.internals.temp_dinputs

        # reshape
        flat_inputs = flatten_all_but_last(inputs)
        flat_probs = flatten_all_but_last(predictions)

        # softmax
        _h.softmax_m(flat_inputs, flat_probs)

        # At this point the softmax has been computed and the CTC code begins.
        # The predictions buffer has already been filled above.
        # Here we compute the loss and save the deltas to temp_dinputs so they
        # can be reused in the backward pass.
        # All of this is done sequence by sequence and currently does not
        # parallelize.
        for sequence in range(inputs.shape[1]):
            if mask is not None:
                this_mask = mask[:,sequence,0].astype(int) # TODO: astype OK?
                mask_zero_index = _h.get_final_zeros_index_v(this_mask) 
#                 these_predictions = predictions[this_mask,sequence,:]
                these_predictions = predictions[0:mask_zero_index,sequence,:]
            else:
                these_predictions = predictions[:,sequence,:]

            these_uncut_labels = labels[:,sequence,0].astype(np.int64)

            final_zero_index = _h.get_final_zeros_index_v(these_uncut_labels)
            these_cut_labels = these_uncut_labels[0:final_zero_index]

            these_deltas = _h.allocate(these_predictions.shape)
            this_error = _h.calculate_ctc(these_predictions, these_cut_labels,
                                          these_deltas)
            _h.mult_st(-1, these_deltas, these_deltas) # fold "minus one" into calculate_ctc?

            # TODO: annoyingly, assigning a plain Python float does not work on
            # the GPU, so wrap the error in an array with the loss dtype
            loss[sequence, 0] = np.array(this_error, dtype=loss.dtype)

            if mask is not None:
#                 temp_dinputs[this_mask.astype(bool),sequence,:] = these_deltas
                temp_dinputs[0:mask_zero_index,sequence,:] = these_deltas
            else:
                temp_dinputs[:,sequence,:] = these_deltas
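Judging from the call sites above, get_final_zeros_index_v returns the index at which the trailing zero padding of a vector starts, so that labels[0:index] strips the padding. A NumPy sketch of that assumed behaviour (not the handler's documented contract):

    import numpy as np

    def final_zeros_index_sketch(vec):
        # index where the run of trailing zeros begins
        # (0 for an all-zero vector, len(vec) if it does not end in zeros)
        nonzero = np.flatnonzero(vec)
        return int(nonzero[-1]) + 1 if nonzero.size else 0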
Example #7
    def conv2d_forward_batch(self, inputs, params, bias, outputs,
                             padding, stride):
        num_filters = params.shape[0]
        num_images, input_rows, input_cols, num_input_maps = inputs.shape
        kernel_shape = params.shape[1:]
        num_output_pixels = outputs.shape[1] * outputs.shape[2]
        num_kernel_params = np.prod(kernel_shape)
        out_shape = (num_output_pixels, num_filters)
        num_cuda_kernels = num_output_pixels * num_input_maps

        for i in range(num_images):
            col = self.zeros((num_output_pixels, num_kernel_params))
            _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                              np.int32(input_rows), np.int32(input_cols),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(outputs.shape[2]),
                              np.int32(num_input_maps),
                              col.gpudata,
                              block=(NUM_CUDA_THREADS, 1, 1),
                              grid=(get_blocks(num_cuda_kernels), 1))

            reshaped_params = params.reshape(num_filters, num_kernel_params)
            culinalg.dot(col, reshaped_params, transb='T',
                         out=outputs[i].reshape(out_shape))

        flat_outputs = flatten_all_but_last(outputs)
        self.add_mv(flat_outputs, bias, flat_outputs)
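The loop above is the classic im2col formulation: each image is unfolded into a (num_output_pixels, num_kernel_params) matrix so that the whole convolution reduces to a single matrix product with the reshaped filters. A NumPy sketch for one NHWC image (the exact patch ordering produced by _im2col_fp32_impl may differ; this only illustrates the idea, and the function name is made up):

    import numpy as np

    def conv2d_single_image_sketch(image, params, padding, stride):
        # image: (rows, cols, channels); params: (num_filters, kh, kw, channels)
        num_filters, kh, kw, channels = params.shape
        padded = np.pad(image, ((padding, padding), (padding, padding), (0, 0)),
                        mode='constant')
        out_rows = (padded.shape[0] - kh) // stride[0] + 1
        out_cols = (padded.shape[1] - kw) // stride[1] + 1
        # im2col: one flattened receptive field per output pixel
        col = np.empty((out_rows * out_cols, kh * kw * channels),
                       dtype=image.dtype)
        for r in range(out_rows):
            for c in range(out_cols):
                patch = padded[r * stride[0]:r * stride[0] + kh,
                               c * stride[1]:c * stride[1] + kw, :]
                col[r * out_cols + c] = patch.ravel()
        # the convolution becomes a single GEMM against the reshaped filters
        out = col.dot(params.reshape(num_filters, -1).T)
        return out.reshape(out_rows, out_cols, num_filters)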
Example #8
    def conv2d_forward_batch(self, inputs, params, bias, outputs,
                             padding, stride):
        num_filters = params.shape[0]
        num_images, input_rows, input_cols, num_input_maps = inputs.shape
        kernel_shape = params.shape[1:]
        num_output_pixels = outputs.shape[1] * outputs.shape[2]
        num_kernel_params = np.prod(kernel_shape)
        out_shape = (num_output_pixels, num_filters)
        num_cuda_kernels = num_output_pixels * num_input_maps

        for i in range(num_images):
            col = self.zeros((num_output_pixels, num_kernel_params))
            _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                              np.int32(input_rows), np.int32(input_cols),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(outputs.shape[2]),
                              np.int32(num_input_maps),
                              col.gpudata,
                              block=(NUM_CUDA_THREADS, 1, 1),
                              grid=(get_blocks(num_cuda_kernels), 1))

            reshaped_params = params.reshape(num_filters, num_kernel_params)
            culinalg.dot(col, reshaped_params, transb='T',
                         out=outputs[i].reshape(out_shape))

        flat_outputs = flatten_all_but_last(outputs)
        self.add_mv(flat_outputs, bias, flat_outputs)
Example #9
    def backward_pass(self, buffers):
        _h = self.handler
        sigma_b, centered, x_hat = buffers.internals
        gamma = buffers.parameters.gamma
        dgamma = buffers.gradients.gamma
        dbeta = buffers.gradients.beta
        # Note: we flatten time for all buffers, so we skip the flat_ prefix
        x_hat = flatten_all_but_last(x_hat)
        outdeltas = flatten_all_but_last(buffers.output_deltas.default)
        indeltas = flatten_all_but_last(buffers.input_deltas.default)
        m = outdeltas.shape[0]

        big_tmp = _h.allocate(x_hat.shape)     # big
        small_tmp = _h.allocate(gamma.shape)  # small

        # ------------- Gradients ---------------
        # Calculate dgamma
        tmp = big_tmp
        dgamma_tmp = small_tmp
        _h.mult_tt(outdeltas, x_hat, tmp)
        _h.sum_t(tmp, axis=0, out=dgamma_tmp)
        _h.add_tt(dgamma_tmp, dgamma, dgamma)

        _h.mult_st(1 / m, dgamma_tmp, dgamma_tmp)
        term1 = big_tmp
        _h.mult_mv(x_hat, dgamma_tmp.reshape((1, gamma.size)), term1)

        # Calculate dbeta
        dbeta_tmp = small_tmp
        _h.sum_t(outdeltas, axis=0, out=dbeta_tmp)
        _h.add_tt(dbeta_tmp, dbeta, dbeta)
        _h.mult_st(1 / m, dbeta_tmp, dbeta_tmp)

        # ------------- Deltas ---------------
        term2 = big_tmp
        term3 = big_tmp
        _h.subtract_tt(outdeltas, term1, term2)
        _h.subtract_mv(term2, dbeta_tmp.reshape((1, dbeta.size)), term3)

        # get normalization factor (gamma / sigma_b)
        coeff = small_tmp
        _h.divide_tt(gamma, sigma_b, coeff)

        term4 = big_tmp
        _h.mult_mv(term3, coeff.reshape((1, coeff.size)), term4)
        _h.add_tt(term4, indeltas, indeltas)
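Written out, the chain of handler calls above is the usual batch-normalization backward pass. A NumPy sketch with the same grouping of terms (the accumulation into pre-existing gradient and delta buffers is left out):

    import numpy as np

    def batch_norm_backward_sketch(outdeltas, x_hat, gamma, sigma_b):
        m = outdeltas.shape[0]
        dgamma = (outdeltas * x_hat).sum(axis=0)
        dbeta = outdeltas.sum(axis=0)
        # input deltas:
        # (gamma / sigma_b) * (dy - x_hat * mean(dy * x_hat) - mean(dy))
        indeltas = (gamma / sigma_b) * (outdeltas
                                        - x_hat * (dgamma / m)
                                        - dbeta / m)
        return indeltas, dgamma, dbeta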
Example #11
    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        targets = buffers.inputs.targets
        probs = buffers.outputs.probabilities

        dinputs = buffers.input_deltas.default
        dloss = buffers.output_deltas.loss
        t_bin = buffers.internals.t_bin

        # reshape
        flat_probs = flatten_all_but_last(probs)
        flat_targets = flatten_all_but_last(targets)
        flat_t_bin = flatten_all_but_last(t_bin)
        flat_dloss = flatten_all_but_last(dloss)
        flat_dinputs = flatten_all_but_last(dinputs)

        # derivative of multinomial cross-entropy error wrt softmax:
        # y - t
        _h.binarize_v(flat_targets, flat_t_bin)
        _h.mult_st(-1, flat_t_bin, flat_t_bin)
        _h.add_tt(flat_t_bin, flat_probs, flat_t_bin)
        _h.mult_mv(flat_t_bin, flat_dloss, flat_t_bin)
        _h.add_tt(flat_t_bin, flat_dinputs, flat_dinputs)
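Since the targets are class indices, binarize_v effectively one-hot encodes them, so the calls above compute (y - t), scale it by the incoming loss delta, and accumulate the result into the input deltas. A NumPy sketch (names are illustrative; flat_dloss is assumed to have a single column so it broadcasts over the feature axis, mirroring mult_mv):

    import numpy as np

    def softmax_ce_backward_sketch(flat_probs, flat_targets, flat_dloss):
        # one-hot encode the integer targets (the binarize_v step)
        t_bin = np.zeros_like(flat_probs)
        rows = np.arange(flat_probs.shape[0])
        t_bin[rows, flat_targets.astype(int).ravel()] = 1.0
        # derivative of cross-entropy wrt the softmax inputs: (y - t) * dloss
        return (flat_probs - t_bin) * flat_dloss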
Example #12
    def forward_pass(self, buffers, training_pass=True):
        # prepare
        _h = self.handler
        inputs = buffers.inputs.default
        targets = buffers.inputs.targets
        predictions = buffers.outputs.predictions
        loss = buffers.outputs.loss

        # reshape
        flat_inputs = flatten_all_but_last(inputs)
        flat_probs = flatten_all_but_last(predictions)
        flat_loss = flatten_all_but_last(loss)
        flat_targets = flatten_all_but_last(targets)

        # softmax
        _h.softmax_m(flat_inputs, flat_probs)

        # the multinomial cross entropy error is given by
        # - sum over i: p_i * ln(y_i)
        _h.copy_to(flat_probs, flat_loss)
        _h.clip_t(flat_loss, 1e-6, 1.0, flat_loss)
        _h.log_t(flat_loss, flat_loss)
        _h.mult_tt(flat_loss, flat_targets, flat_loss)
        _h.mult_st(-1, loss, loss)
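Unlike Example #1, the targets here are full distributions rather than indices, so the loss is computed element-wise as -t * ln(clip(y)). A short NumPy equivalent of the handler calls above (illustrative only):

    import numpy as np

    def cross_entropy_elementwise_sketch(flat_probs, flat_targets):
        # element-wise -t * ln(clip(y)); summing over the feature axis
        # gives the usual per-sample cross-entropy
        return -flat_targets * np.log(np.clip(flat_probs, 1e-6, 1.0))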
Example #13
    def backward_pass(self, buffers):
        # prepare
        _h = self.handler
        targets = buffers.inputs.targets
        probs = buffers.outputs.predictions

        dinputs = buffers.input_deltas.default
        dloss = buffers.output_deltas.loss
        t_bin = buffers.internals.t_bin

        # reshape
        flat_probs = flatten_all_but_last(probs)
        flat_targets = flatten_all_but_last(targets)
        flat_t_bin = flatten_all_but_last(t_bin)
        flat_dloss = flatten_all_but_last(dloss)
        flat_dinputs = flatten_all_but_last(dinputs)

        # derivative of multinomial cross-entropy error wrt softmax:
        # y - t
        _h.binarize_v(flat_targets, flat_t_bin)
        _h.mult_st(-1, flat_t_bin, flat_t_bin)
        _h.add_tt(flat_t_bin, flat_probs, flat_t_bin)
        _h.mult_mv(flat_t_bin, flat_dloss, flat_t_bin)
        _h.add_tt(flat_t_bin, flat_dinputs, flat_dinputs)