def lmul_T(self, x):
    """
    Run the `ImageActs` op on `x` with this object's filters.

    `x` is rearranged from `self.output_axes` order into the
    ('c', 0, 1, 'b') order used for the op call, and the op's result is
    rearranged from that order into `self.input_axes` order.

    Parameters
    ----------
    x : variable
        Must have the same dtype as `self._filters`.
        NOTE(review): presumably a 4D GPU tensor laid out according to
        `self.output_axes` -- confirm against callers.

    Returns
    -------
    rval : variable
        Result of the op, with axes ordered as in `self.input_axes`.
    """
    check_cuda(str(type(self)) + ".lmul_T")

    assert x.dtype == self._filters.dtype

    op_axes = ('c', 0, 1, 'b')

    # Bring the incoming variable into the axis order used for the op call.
    source_axes = self.output_axes
    if tuple(source_axes) != op_axes:
        permutation = [source_axes.index(ax) for ax in op_axes]
        x = x.dimshuffle(*permutation)
    x = gpu_contiguous(x)

    image_acts = ImageActs(pad=self.pad,
                           partial_sum=self.partial_sum,
                           stride=self.kernel_stride[0])
    rval = image_acts(x, self._filters, output_shape=self.input_shape)

    # Format the output based on the input space
    target_axes = self.input_axes
    assert len(target_axes) == 4
    if tuple(target_axes) != op_axes:
        rval = rval.dimshuffle(*[op_axes.index(ax) for ax in target_axes])

    return rval
def grad(self, inputs, dout):
    """
    Build the gradient expressions for the op's two inputs.

    Parameters
    ----------
    inputs : sequence of two variables
        `(images, filters)`; the type name of each must contain 'Cuda'.
    dout : sequence of one variable
        Gradient with respect to the op's output. It is passed through
        `gpu_contiguous` before use.

    Returns
    -------
    d_images, d_filters : variables
        `d_images` comes from `ImageActs` applied to the output gradient
        and the filters; `d_filters` is the first output of `WeightActs`
        applied to the images and the output gradient.

    Raises
    ------
    TypeError
        If the images, the filters or the (contiguous) output gradient
        do not look like Cuda variables.
    """
    images, filters = inputs

    # The checks are string based: only the type *name* is inspected.
    for variable, message in ((images, "inputs must be cuda"),
                              (filters, "filters must be cuda")):
        if 'Cuda' not in str(type(variable)):
            raise TypeError(message)

    (out_grad,) = dout
    out_grad = gpu_contiguous(out_grad)
    if 'Cuda' not in str(type(out_grad)):
        raise TypeError("output gradients must be cuda")

    # Axes 1 and 2 are handed to the gradient ops as the target shapes.
    image_shape = images.shape[1:3]
    filter_shape = filters.shape[1:3]

    image_grad_op = ImageActs(self.pad, self.partial_sum, self.stride)
    d_images = image_grad_op(out_grad, filters, image_shape)

    weight_grad_op = WeightActs(self.pad, self.partial_sum, self.stride)
    d_filters = weight_grad_op(images, out_grad, filter_shape)[0]

    return d_images, d_filters
def __init__(self, input_layer, mirror_layer, nonlinearity=None):
    """
    Only the valid border mode is supported.

    n_filters should be a multiple of 16

    The layer copies its hyperparameters from `mirror_layer` but creates
    its own `W` and `b` through `layers.shared_single`, then calls
    `self.reset_params()`.

    Parameters
    ----------
    input_layer : layer object
        Layer feeding this one. Its `get_output_shape()` must return 4
        entries and its `data_order` must equal `layers.data_order.type2`.
    mirror_layer : layer object
        Layer whose settings (`n_channels`, `n_filters`, `filter_size`,
        `weights_std`, `init_bias_value`, `stride`, `dropout`,
        `partial_sum`, `pad`, `untie_biases`, `filter_shape`) are copied.
    nonlinearity : object, optional
        Used as `self.nonlinearity` when truthy; otherwise the mirror
        layer's nonlinearity is used.
    """
    self.mirror_layer = mirror_layer
    self.input_layer = input_layer
    self.input_shape = self.input_layer.get_output_shape()

    self.nonlinearity = (nonlinearity if nonlinearity
                         else mirror_layer.nonlinearity)

    # Hyperparameters are taken over unchanged from the mirrored layer.
    self.n_channels = mirror_layer.n_channels
    self.n_filters = mirror_layer.n_filters
    self.filter_size = mirror_layer.filter_size
    self.weights_std = mirror_layer.weights_std
    self.init_bias_value = mirror_layer.init_bias_value
    self.stride = mirror_layer.stride
    self.dropout = mirror_layer.dropout
    self.partial_sum = mirror_layer.partial_sum
    self.pad = mirror_layer.pad
    self.untie_biases = mirror_layer.untie_biases

    self.mb_size = self.input_layer.mb_size

    self.filter_shape = mirror_layer.filter_shape

    self.trainable = False

    # NOTE(review): the argument of shared_single looks like the number
    # of dimensions of the created variable -- confirm in `layers`.
    self.W = layers.shared_single(4)

    if self.untie_biases:
        self.b = layers.shared_single(3)
    else:
        self.b = layers.shared_single(1)

    self.params = [self.W, self.b]
    self.bias_params = [self.b]

    self.data_order = layers.data_order.type2

    assert (len(self.input_layer.get_output_shape()) == 4), \
        'Input must have 4 dimensions.'

    assert (self.input_layer.data_order == self.data_order), \
        'Input data order does not match this layer\'s data order.'

    self.reset_params()

    self.image_acts_op = ImageActs(stride=self.stride,
                                   partial_sum=self.partial_sum,
                                   pad=self.pad)
def __init__(self, input_layer, mirror_layer, nonlinearity=None):
    """
    Only the valid border mode is supported.

    n_filters should be a multiple of 16

    The layer copies its hyperparameters from `mirror_layer` and reuses
    the mirror layer's `W` and `b` objects directly instead of creating
    new ones.

    Parameters
    ----------
    input_layer : layer object
        Layer feeding this one. Its `get_output_shape()` must return 4
        entries and its `data_order` must equal `layers.data_order.type2`.
    mirror_layer : layer object
        Layer whose settings and whose `W` / `b` are taken over.
    nonlinearity : object, optional
        Used as `self.nonlinearity` when truthy; otherwise the mirror
        layer's nonlinearity is used.
    """
    self.mirror_layer = mirror_layer
    self.input_layer = input_layer
    self.input_shape = self.input_layer.get_output_shape()

    self.nonlinearity = (nonlinearity if nonlinearity
                         else mirror_layer.nonlinearity)

    # Hyperparameters are taken over unchanged from the mirrored layer.
    self.n_channels = mirror_layer.n_channels
    self.n_filters = mirror_layer.n_filters
    self.filter_size = mirror_layer.filter_size
    self.weights_std = mirror_layer.weights_std
    self.init_bias_value = mirror_layer.init_bias_value
    self.stride = mirror_layer.stride
    self.dropout = mirror_layer.dropout
    self.partial_sum = mirror_layer.partial_sum
    self.pad = mirror_layer.pad
    # if untie_biases == True, each position in the output map has its own
    # bias (as opposed to having the same bias everywhere for a filter)
    self.untie_biases = mirror_layer.untie_biases

    self.mb_size = self.input_layer.mb_size

    self.filter_shape = mirror_layer.filter_shape

    self.trainable = False

    # The very same objects as in the mirror layer, not copies.
    self.W = mirror_layer.W
    self.b = mirror_layer.b

    # No parameters are exposed here even though W and b are set.
    # NOTE(review): presumably because they already belong to the mirror
    # layer's params -- confirm. `bias_params` still lists `b`.
    self.params = []
    self.bias_params = [self.b]

    self.data_order = layers.data_order.type2

    assert (len(self.input_layer.get_output_shape()) == 4), \
        'Input must have 4 dimensions.'

    assert (self.input_layer.data_order == self.data_order), \
        'Input data order does not match this layer\'s data order.'

    self.image_acts_op = ImageActs(stride=self.stride,
                                   partial_sum=self.partial_sum,
                                   pad=self.pad)
def grad(self, inputs, dout):
    """
    Build the gradient expressions for the op's two inputs.

    Parameters
    ----------
    inputs : sequence of two variables
        `(images, filters)`; the type name of each must contain 'Cuda'.
    dout : sequence of one variable
        Gradient with respect to the op's output; made contiguous with
        `gpu_contiguous` before it is used.

    Returns
    -------
    d_images, d_filters : variables
        `ImageActs(pad, partial_sum)` applied to the output gradient and
        the filters, and the first output of
        `WeightActs(pad, partial_sum)` applied to the images and the
        output gradient.

    Raises
    ------
    TypeError
        If any of the three variables does not look like a Cuda variable.
    """
    def require_cuda(variable, message):
        # Only the type *name* is inspected, as a string.
        if 'Cuda' not in str(type(variable)):
            raise TypeError(message)

    images, filters = inputs
    require_cuda(images, "inputs must be cuda")
    require_cuda(filters, "filters must be cuda")

    (out_grad,) = dout
    out_grad = gpu_contiguous(out_grad)
    require_cuda(out_grad, "output gradients must be cuda")

    image_grad_op = ImageActs(self.pad, self.partial_sum)
    d_images = image_grad_op(out_grad, filters)

    weight_grad_op = WeightActs(self.pad, self.partial_sum)
    d_filters = weight_grad_op(images, out_grad)[0]

    return d_images, d_filters
def test_match_full_conv():
    """
    Compare ImageActs (no padding) against theano's conv2d in full mode.

    Fails with an AssertionError when the largest absolute difference
    between the two outputs exceeds 2.4e-6 or is NaN; diagnostic values
    are printed before failing.
    """
    # Tests that running ImageActs with no padding is the same as running
    # theano's conv2D in full mode after flipping the kernel and transposing
    # the output and input channels
    # In other words, if convolution computes H=XK, we now compute
    # R=HK^T

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(
        rng.uniform(-1., 1., (num_filters,
                              rows - filter_rows + 1,
                              cols - filter_cols + 1,
                              batch_size)).astype('float32'),
        name='hidacts')

    filters = shared(
        rng.uniform(-1., 1., (channels, filter_rows, filter_cols,
                              num_filters)).astype('float32'),
        name='filters')

    gpu_images = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    # The requested output shape is the image size declared above
    # (previously spelled as the literal (6, 7)).
    output = ImageActs()(gpu_images, gpu_filters,
                         as_tensor_variable((rows, cols)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [output, output_conv2d])

    output, output_conv2d = f()

    warnings.warn("""test_match_full_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. 
Probably theano CPU 64 bit is OK but it's worth checking the others.""")

    max_err = np.abs(output - output_conv2d).max()
    # `not (x <= tol)` instead of `x > tol`: a NaN error compares False
    # either way, and must count as a failure rather than a pass.
    if not max_err <= 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        # Single-argument print calls produce the same text on Python 2
        # and Python 3 ('%s %s' mirrors the space the print statement
        # inserted between its items).
        if output.shape != output_conv2d.shape:
            print('%s %s' % ('cuda-convnet shape: ', output.shape))
            print('%s %s' % ('theano shape: ', output_conv2d.shape))
            raise AssertionError('output shapes differ')
        err = np.abs(output - output_conv2d)
        print('%s %s' % ('absolute error range: ', (err.min(), err.max())))
        print('%s %s' % ('mean absolute error: ', err.mean()))
        print('%s %s' % ('cuda-convnet value range: ',
                         (output.min(), output.max())))
        print('%s %s' % ('theano value range: ',
                         (output_conv2d.min(), output_conv2d.max())))
        # Explicit raise: a bare `assert False` disappears under `-O`.
        raise AssertionError(
            'ImageActs and conv2d outputs differ, max abs error: %s'
            % max_err)
def test_match_full_conv_grad():
    """
    Compare the gradients of ImageActs with those of theano's conv2d.

    Both outputs are projected onto the same random tensor and summed;
    the gradients of the two scalars with respect to the hidden
    activations and the filters must agree to within 2.9e-6 and 1e-6
    respectively. The offending difference is carried by the
    AssertionError.
    """
    # Tests that the gradient of ImageActs with no padding is the same as the
    # gradient of theano's conv2D in full mode after flipping the kernel and
    # transposing the output and input channels

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(
        rng.uniform(-1., 1., (num_filters,
                              rows - filter_rows + 1,
                              cols - filter_cols + 1,
                              batch_size)).astype('float32'),
        name='hidacts')

    filters = shared(
        rng.uniform(-1., 1., (channels, filter_rows, filter_cols,
                              num_filters)).astype('float32'),
        name='filters')

    gpu_images = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    # The requested output shape is the image size declared above
    # (previously spelled as the literal (6, 7)).
    output = ImageActs()(gpu_images, gpu_filters,
                         as_tensor_variable((rows, cols)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    theano_rng = MRG_RandomStreams(5 * 10 * 2013)

    # One random tensor is shared by both projections so the two scalar
    # costs are directly comparable.
    projection = theano_rng.normal(size=output_conv2d.shape,
                                   dtype=output_conv2d.dtype)

    projected = (output * projection).sum()
    projected_conv_2d = (output_conv2d * projection).sum()

    grads = (T.grad(projected, [hid_acts, filters]) +
             T.grad(projected_conv_2d, [hid_acts, filters]))

    f = function([], grads)

    gi, gf, gi_th, gf_th = f()

    assert gi.shape == gi_th.shape

    # `not (diff <= tol)` also rejects NaN, and raising explicitly keeps
    # the check alive under `python -O` and reports the difference.
    diff = np.abs(gi - gi_th).max()
    if not diff <= 2.9e-6:
        raise AssertionError(diff)

    diff = np.abs(gf - gf_th).max()
    if not diff <= 1e-6:
        raise AssertionError(diff)
def test_match_full_conv():
    """
    Compare ImageActs (no padding) against theano's conv2d in full mode.

    Fails with an AssertionError when the largest absolute difference
    between the two outputs exceeds 2.4e-6 or is NaN; diagnostic values
    are printed before failing.
    """
    # Tests that running ImageActs with no padding is the same as running
    # theano's conv2D in full mode after flipping the kernel and transposing
    # the output and input channels
    # In other words, if convolution computes H=XK, we now compute
    # R=HK^T

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts_shape = (num_filters,
                      rows - filter_rows + 1,
                      cols - filter_cols + 1,
                      batch_size)
    hid_acts = shared(
        rng.uniform(-1., 1., hid_acts_shape).astype('float32'),
        name='hidacts')

    filters_shape = (channels, filter_rows, filter_cols, num_filters)
    filters = shared(
        rng.uniform(-1., 1., filters_shape).astype('float32'),
        name='filters')

    gpu_images = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    # The requested output shape is the image size declared above
    # (previously spelled as the literal (6, 7)).
    output = ImageActs()(gpu_images, gpu_filters,
                         as_tensor_variable((rows, cols)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [output, output_conv2d])

    output, output_conv2d = f()

    warnings.warn(
        """test_match_full_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. 
Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )

    max_err = np.abs(output - output_conv2d).max()
    # A NaN error makes `max_err > tol` False, which used to let the test
    # pass; `not (max_err <= tol)` treats NaN as a failure.
    within_tolerance = max_err <= 2.4e-6
    if not within_tolerance:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        # Single-argument print calls behave identically on Python 2 and
        # Python 3; '%s %s' reproduces the separator space that the
        # print statement put between its items.
        if output.shape != output_conv2d.shape:
            print('%s %s' % ('cuda-convnet shape: ', output.shape))
            print('%s %s' % ('theano shape: ', output_conv2d.shape))
            raise AssertionError('output shapes differ')
        err = np.abs(output - output_conv2d)
        print('%s %s' % ('absolute error range: ', (err.min(), err.max())))
        print('%s %s' % ('mean absolute error: ', err.mean()))
        print('%s %s' % ('cuda-convnet value range: ',
                         (output.min(), output.max())))
        print('%s %s' % ('theano value range: ',
                         (output_conv2d.min(), output_conv2d.max())))
        # Explicit raise: a bare `assert False` is removed under `-O`.
        raise AssertionError(
            'ImageActs and conv2d outputs differ, max abs error: %s'
            % max_err)
def _report_comparison(is_correct, cc_fun, ca_fun):
    """Print the correctness flag, then time both callables and print the
    average durations and their ratio."""
    print(' correct: ' + str(is_correct))
    duration_cc = avg_running_time(cc_fun)
    duration_ca = avg_running_time(ca_fun)
    print(' avg. duration: cuda_convnet: %.4f ca: %.4f'
          % (duration_cc, duration_ca))
    # Ratio of the cuda_convnet time to the `ca` time.
    print(' speedup: %.2f' % (duration_cc / duration_ca))


def benchmark(n_imgs, n_channels, img_shape, n_filters, filter_shape, pad):
    """
    Compare the cuda-convnet theano ops with the `ca` convolution routines.

    For forward propagation, back-propagation to the images and
    back-propagation to the filters, both implementations are run on the
    same random data; whether their results agree (via `allclose`) and
    their average running times (via `avg_running_time`) are printed.

    Parameters
    ----------
    n_imgs : int
        Number of images in the batch.
    n_channels : int
        Number of input channels.
    img_shape : tuple of two ints
        Image height and width.
    n_filters : int
        Number of filters.
    filter_shape : tuple of two ints
        Filter height and width.
    pad : int
        Padding, applied to both spatial dimensions.

    Returns
    -------
    None
        Results are only printed.
    """
    print('\nn_imgs: %i, n_channels: %i, img_shape: (%i, %i), '
          % ((n_imgs, n_channels) + img_shape)
          + 'n_filters: %i, filter_shape: (%i, %i), pad: %i'
          % ((n_filters, ) + filter_shape + (pad, )))

    # Setup arrays
    padding = (pad, pad)
    strides = (1, 1)
    img_h, img_w = img_shape
    filter_h, filter_w = filter_shape
    convout_h = img_h + 2 * pad - filter_h + 1
    convout_w = img_w + 2 * pad - filter_w + 1

    # The same data is kept in two layouts: bc01 / fc01 for `ca`, and the
    # transposed c01b / c01f for the cuda-convnet ops.
    imgs_bc01 = np.random.randn(n_imgs, n_channels, img_h, img_w)
    imgs_c01b = np.transpose(imgs_bc01, (1, 2, 3, 0))
    filters_fc01 = np.random.randn(n_filters, n_channels, filter_h, filter_w)
    filters_c01f = np.transpose(filters_fc01, (1, 2, 3, 0))
    convout_bc01 = np.random.randn(n_imgs, n_filters, convout_h, convout_w)
    convout_c01b = np.transpose(convout_bc01, (1, 2, 3, 0))

    # Only the layouts the theano ops consume are turned into shared
    # variables; the bc01/fc01 shared copies were never used.
    floatX = theano.config.floatX
    imgs_c01b_t = theano.shared(imgs_c01b.astype(floatX))
    filters_c01f_t = theano.shared(filters_c01f.astype(floatX))
    convout_c01b_t = theano.shared(convout_c01b.astype(floatX))
    imgs_bc01_ca = ca.array(imgs_bc01)
    filters_fc01_ca = ca.array(filters_fc01)
    convout_bc01_ca = ca.array(convout_bc01)

    # Forward propagation
    print('fprop')
    convout_cc_op = FilterActs(stride=1, partial_sum=4, pad=pad)
    convout_cc_expr = convout_cc_op(imgs_c01b_t, filters_c01f_t)
    convout_cc_fun = theano.function([], convout_cc_expr)
    convout_cc = convout_cc_fun()
    # Back to batch-first layout for the comparison.
    convout_cc = np.transpose(convout_cc, (3, 0, 1, 2))

    def convout_ca_fun():
        convout = ca.nnet.conv_bc01(imgs_bc01_ca, filters_fc01_ca, padding,
                                    strides)
        return convout
    convout_ca = np.array(convout_ca_fun())
    _report_comparison(allclose(convout_ca, convout_cc),
                       convout_cc_fun, convout_ca_fun)
    del convout_cc_op
    del convout_cc_expr
    del convout_cc_fun

    # Back propagation, imgs
    print('bprop_imgs')
    dimgs_cc_op = ImageActs(stride=1, partial_sum=1, pad=pad)
    dimgs_cc_expr = dimgs_cc_op(convout_c01b_t, filters_c01f_t)
    dimgs_cc_fun = theano.function([], dimgs_cc_expr)
    dimgs_cc = dimgs_cc_fun()
    dimgs_cc = np.transpose(dimgs_cc, (3, 0, 1, 2))

    def dimgs_ca_fun():
        return ca.nnet.conv_bc01_bprop_imgs(filters_fc01_ca, convout_bc01_ca,
                                            img_shape, padding, strides)
    dimgs_ca = np.array(dimgs_ca_fun())
    _report_comparison(allclose(dimgs_ca, dimgs_cc),
                       dimgs_cc_fun, dimgs_ca_fun)
    del dimgs_cc_op
    del dimgs_cc_expr
    del dimgs_cc_fun

    # Back propagation, filters
    # Header printed before the work, like the two sections above.
    print('bprop_filters')
    dfilters_cc_op = WeightActs(stride=1, partial_sum=1, pad=pad)
    dfilters_cc_expr = dfilters_cc_op(imgs_c01b_t, convout_c01b_t,
                                      T.as_tensor_variable(filter_shape))
    dfilters_cc_fun = theano.function([], dfilters_cc_expr)
    # Only the first element of the function's result is compared.
    dfilters_cc = dfilters_cc_fun()[0]
    dfilters_cc = np.transpose(dfilters_cc, (3, 0, 1, 2))

    def dfilters_ca_fun():
        return ca.nnet.conv_bc01_bprop_filters(imgs_bc01_ca, convout_bc01_ca,
                                               filter_shape, padding, strides)
    dfilters_ca = np.array(dfilters_ca_fun())
    _report_comparison(allclose(dfilters_ca, dfilters_cc),
                       dfilters_cc_fun, dfilters_ca_fun)