def cpu_var_to_gpu_var(x):
    """Create a new GPU variable shaped like ``x`` and its host-side view.

    Parameters
    ----------
    x : variable
        Only ``x.broadcastable`` is read; it becomes the broadcast pattern
        of the new ``CudaNdarrayType``.

    Returns
    -------
    tuple
        ``(gpu_var, cpu_var)`` where ``gpu_var`` is a fresh, unnamed
        ``CudaNdarrayVariable`` and ``cpu_var`` is
        ``cuda.host_from_gpu(gpu_var)``.
    """
    # Imported at call time, exactly as the original did.
    from theano.sandbox import cuda

    gpu_type = cuda.CudaNdarrayType(broadcastable=x.broadcastable)
    # The original built 'gpu_<x.name>' and then immediately overwrote it
    # with None; only the effective value (None) is kept here.
    gpu_var = cuda.CudaNdarrayVariable(type=gpu_type, name=None)
    cpu_var = cuda.host_from_gpu(gpu_var)
    return gpu_var, cpu_var
def cpu_var_to_gpu_var(x):
    """Create a new GPU variable shaped like ``x`` and its host-side view.

    Parameters
    ----------
    x : variable
        Only ``x.broadcastable`` is read; it becomes the broadcast pattern
        of the new ``CudaNdarrayType``.

    Returns
    -------
    tuple
        ``(gpu_var, cpu_var)`` where ``gpu_var`` is a fresh, unnamed
        ``CudaNdarrayVariable`` and ``cpu_var`` is
        ``cuda.host_from_gpu(gpu_var)``.
    """
    # Imported at call time, exactly as the original did.
    from theano.sandbox import cuda

    gpu_type = cuda.CudaNdarrayType(broadcastable=x.broadcastable)
    # The original built 'gpu_<x.name>' and then immediately overwrote it
    # with None; only the effective value (None) is kept here.
    gpu_var = cuda.CudaNdarrayVariable(type=gpu_type, name=None)
    cpu_var = cuda.host_from_gpu(gpu_var)
    return gpu_var, cpu_var
def test_weight_acts_strided():
    """Check WeightActs against WeightActs_python for every stride.

    For each ``partial_sum`` in ``[0, 1, 4]``, each image/filter shape pair
    and each stride from 1 up to ``filters.shape[1]``, the GPU op result is
    compared with the Python reference using an absolute tolerance of
    ``3.4e-5``.

    Unlike the previous version, which only printed diagnostics, a mismatch
    above the tolerance now fails the test.
    """
    rng = np.random.RandomState([2012, 10, 9])

    # Each entry of shape_list is [img_shape, filter_shape] with
    #   img_shape    = (channels, rows, cols, batch_size)
    #   filter_shape = (channels, filter_rows, filter_cols, num_filters)
    shape_list = [[(1, 7, 8, 5), (1, 2, 2, 16)],
                  [(3, 7, 8, 5), (3, 3, 3, 16)],
                  [(16, 11, 11, 4), (16, 4, 4, 16)],
                  [(3, 20, 20, 3), (3, 5, 5, 16)],
                  [(3, 21, 21, 3), (3, 6, 6, 16)],
                  ]

    for partial_sum in [0, 1, 4]:
        print("partial_sum: %d" % (partial_sum))
        for test_idx, (img_shape, filter_shape) in enumerate(shape_list):
            images = rng.uniform(-1., 1., img_shape).astype('float32')
            filters = rng.uniform(-1., 1., filter_shape).astype('float32')
            gpu_images = float32_shared_constructor(images, name='images')
            print("test case %d..." % (test_idx + 1))

            for stride in range(1, filters.shape[1] + 1):
                output_python = FilterActs_python(images, filters, stride)
                _, h_rows, h_cols, _ = output_python.shape
                if partial_sum == 4:
                    if (h_rows * h_cols) % partial_sum != 0:
                        # Stop trying larger strides for this shape pair.
                        print("skip test case %d, stride %d when partial_sum is equal to %d" % (test_idx + 1, stride, partial_sum))
                        break

                hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
                gpu_hidacts = float32_shared_constructor(hidacts, name='hidacts')

                weights_grad_python = WeightActs_python(
                    images, hidacts, filters.shape[1], filters.shape[2], stride)

                weights_grad = WeightActs(partial_sum=partial_sum, stride=stride)(
                    gpu_images,
                    gpu_hidacts,
                    as_tensor_variable((filters.shape[1], filters.shape[2]))
                )[0]
                weights_grad = host_from_gpu(weights_grad)
                f = function([], weights_grad)
                weights_grad_val = f()

                warnings.warn("""test_weight_acts_strided success criterion is not very strict.""")

                if np.abs(weights_grad_val - weights_grad_python).max() > 3.4e-5:
                    assert type(weights_grad_val) == type(weights_grad_python)
                    assert weights_grad_val.dtype == weights_grad_python.dtype
                    if weights_grad_val.shape != weights_grad_python.shape:
                        print('cuda-convnet shape: ', weights_grad_val.shape)
                        print('python conv shape: ', weights_grad_python.shape)
                        assert False
                    err = np.abs(weights_grad_val - weights_grad_python)
                    print('stride %d' % stride)
                    print('absolute error range: ', (err.min(), err.max()))
                    print('mean absolute error: ', err.mean())
                    print('cuda-convnet value range: ', (weights_grad_val.min(), weights_grad_val.max()))
                    print('python conv value range: ', (weights_grad_python.min(), weights_grad_python.max()))
                    # Previously the mismatch was reported but never failed.
                    assert False, "WeightActs differs from the Python reference"
def insert_gpu_weight_acts(node):
    """
    Replace a ``WeightActs`` node by ``GpuWeightActs`` wrapped in transfers.

    Parameters
    ----------
    node : Apply node
        Candidate node; only nodes whose op is a ``WeightActs`` instance
        are rewritten.

    Returns
    -------
    list or None
        A one-element list holding ``host_from_gpu`` of the GPU op output
        when ``any_from_gpu(images, hidacts)`` or
        ``any_gpu_client(*node.outputs)`` is true; otherwise ``None``.
    """
    if not isinstance(node.op, WeightActs):
        return None

    images, hidacts, frows, fcols = node.inputs
    if not (any_from_gpu(images, hidacts) or any_gpu_client(*node.outputs)):
        return None

    # partial_sum is fixed to 1 here; only module_stride is taken from
    # the op being replaced.
    gpu_op = GpuWeightActs(module_stride=node.op.module_stride,
                           partial_sum=1)
    on_gpu = gpu_op(gpu_from_host(images), gpu_from_host(hidacts),
                    frows, fcols)
    return [host_from_gpu(on_gpu)]
def local_to_gpu(node):
    """
    op(host_from_gpu()) -> host_from_gpu(op)

    gpu_from_host(op) -> op(gpu_from_host)

    ``op``, ``to_gpu`` and ``cuda`` come from the enclosing scope;
    ``to_gpu`` lists the input positions that are wrapped in
    ``cuda.gpu_from_host``. Returns a one-element replacement list, or
    ``False`` when neither pattern matches.
    """

    def _comes_from_gpu(var):
        # Same truthiness as the original inline test.
        return var.owner and isinstance(var.owner.op, cuda.HostFromGpu)

    def _with_gpu_inputs(inputs):
        # Copy the inputs and move the positions listed in to_gpu.
        moved = list(inputs)
        for pos in to_gpu:
            moved[pos] = cuda.gpu_from_host(moved[pos])
        return moved

    if isinstance(node.op, op):
        # op(host_from_gpu()) -> host_from_gpu(op)
        # Move the op as soon as one of the relevant inputs is on the GPU.
        if any(_comes_from_gpu(node.inputs[pos]) for pos in to_gpu):
            gpu_inputs = _with_gpu_inputs(node.inputs)
            return [cuda.host_from_gpu(op()(*gpu_inputs))]

    if node.op == cuda.gpu_from_host:
        # gpu_from_host(op) -> op(gpu_from_host)
        producer = node.inputs[0].owner
        if producer and isinstance(producer.op, op):
            gpu_inputs = _with_gpu_inputs(producer.inputs)
            return [op()(*gpu_inputs)]

    return False
def test_weight_acts_strided():
    """Check WeightActs against WeightActs_python for every stride.

    For each ``partial_sum`` in ``[0, 1, 4]``, each image/filter shape pair
    and each stride from 1 up to ``filters.shape[1]``, the GPU op result is
    compared with the Python reference using an absolute tolerance of
    ``3.4e-5``.

    Unlike the previous version, which only printed diagnostics, a mismatch
    above the tolerance now fails the test. Print statements were turned
    into ``print()`` calls so the block also parses on Python 3.
    """
    rng = np.random.RandomState([2012, 10, 9])

    # Each entry of shape_list is [img_shape, filter_shape] with
    #   img_shape    = (channels, rows, cols, batch_size)
    #   filter_shape = (channels, filter_rows, filter_cols, num_filters)
    shape_list = [[(1, 7, 8, 5), (1, 2, 2, 16)],
                  [(3, 7, 8, 5), (3, 3, 3, 16)],
                  [(16, 11, 11, 4), (16, 4, 4, 16)],
                  [(3, 20, 20, 3), (3, 5, 5, 16)],
                  [(3, 21, 21, 3), (3, 6, 6, 16)],
                  ]

    for partial_sum in [0, 1, 4]:
        print("partial_sum: %d" % (partial_sum))
        for test_idx, (img_shape, filter_shape) in enumerate(shape_list):
            images = rng.uniform(-1., 1., img_shape).astype('float32')
            filters = rng.uniform(-1., 1., filter_shape).astype('float32')
            gpu_images = float32_shared_constructor(images, name='images')
            print("test case %d..." % (test_idx + 1))

            for stride in range(1, filters.shape[1] + 1):
                output_python = FilterActs_python(images, filters, stride)
                _, h_rows, h_cols, _ = output_python.shape
                if partial_sum == 4:
                    if (h_rows * h_cols) % partial_sum != 0:
                        # Stop trying larger strides for this shape pair.
                        print("skip test case %d, stride %d when partial_sum is equal to %d" % (test_idx + 1, stride, partial_sum))
                        break

                hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
                gpu_hidacts = float32_shared_constructor(hidacts, name='hidacts')

                weights_grad_python = WeightActs_python(
                    images, hidacts, filters.shape[1], filters.shape[2], stride)

                weights_grad = WeightActs(partial_sum=partial_sum, stride=stride)(
                    gpu_images,
                    gpu_hidacts,
                    as_tensor_variable((filters.shape[1], filters.shape[2]))
                )[0]
                weights_grad = host_from_gpu(weights_grad)
                f = function([], weights_grad)
                weights_grad_val = f()

                warnings.warn("""test_weight_acts_strided success criterion is not very strict.""")

                if np.abs(weights_grad_val - weights_grad_python).max() > 3.4e-5:
                    assert type(weights_grad_val) == type(weights_grad_python)
                    assert weights_grad_val.dtype == weights_grad_python.dtype
                    if weights_grad_val.shape != weights_grad_python.shape:
                        print('cuda-convnet shape: ', weights_grad_val.shape)
                        print('python conv shape: ', weights_grad_python.shape)
                        assert False
                    err = np.abs(weights_grad_val - weights_grad_python)
                    print('stride %d' % stride)
                    print('absolute error range: ', (err.min(), err.max()))
                    print('mean absolute error: ', err.mean())
                    print('cuda-convnet value range: ', (weights_grad_val.min(), weights_grad_val.max()))
                    print('python conv value range: ', (weights_grad_python.min(), weights_grad_python.max()))
                    # Previously the mismatch was reported but never failed.
                    assert False, "WeightActs differs from the Python reference"
def lmul(self, x):
    """
    dot(x, A)

    aka, do convolution with input image x

    Parameters
    ----------
    x : variable
        5-D input (asserted) whose axes are described by
        ``self.input_axes``. If the string form of its Python type does
        not contain ``"Cuda"`` it is passed through ``gpu_from_host``
        first and the result is brought back with ``host_from_gpu``.

    Returns
    -------
    rval : variable
        Result with axes ordered according to ``self.output_axes``.
    """
    import logging

    log = logging.getLogger(__name__)

    check_cuda(str(type(self)) + ".lmul")

    # TODO Why is it CPU??
    # Former stdout debug output, now routed to the debug log.
    log.debug("lmul input type: %s", type(x))
    cpu = "Cuda" not in str(type(x))

    if cpu:
        x = gpu_from_host(x)

    assert x.ndim == 5
    x_axes = self.input_axes
    assert len(x_axes) == 5

    # Axis order used for the FilterActs call below.
    op_axes = ("c", 0, 1, "t", "b")

    if tuple(x_axes) != op_axes:
        log.debug("lmul: dimshuffling input axes %s to %s", x_axes, op_axes)
        x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

    # Merge the last two signal dimensions so the input becomes 4-D.
    _x_4d_shape = (
        self.signal_shape[0],
        self.signal_shape[1],
        self.signal_shape[2],
        self.signal_shape[3] * self.signal_shape[4],
    )
    x = x.reshape(_x_4d_shape)
    x = gpu_contiguous(x)

    rval = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])(x, self._filters)

    if cpu:
        rval = host_from_gpu(rval)

    rval = rval.reshape(
        (
            self.filter_shape[3],
            self.filter_shape[4],
            rval.shape[1],
            rval.shape[2],
            self.signal_shape[3],
            self.signal_shape[4],
        )
    )
    rval = diagonal_subtensor(rval, 4, 0).sum(axis=0)

    # Format the output based on the output space
    rval_axes = self.output_axes
    assert len(rval_axes) == 5

    if tuple(rval_axes) != op_axes:
        rval = rval.dimshuffle(*[op_axes.index(axis) for axis in rval_axes])

    return rval
def local_gpu_togpu(node):
    """Move ops that expose ``make_gpu_node`` next to GPU transfers.

    Two patterns are handled:

    * ``gpu_from_host(op(...))``: when the producing op has
      ``make_gpu_node``, its inputs are wrapped in ``gpu_from_host`` and
      the GPU node is returned. A ``TypeError`` while wrapping aborts the
      rewrite.
    * ``op(host_from_gpu(...), ...)``: when every input comes from
      ``host_from_gpu``, the GPU node is built on the untransferred inputs
      and wrapped in ``host_from_gpu``.

    Returns a one-element list on success, ``False`` otherwise.
    """
    current_op = node.op

    if current_op == gpu_from_host:
        producer = node.inputs[0].owner
        if producer and hasattr(producer.op, 'make_gpu_node'):
            try:
                moved = [gpu_from_host(inp) for inp in producer.inputs]
            except TypeError:
                return False
            return [producer.op.make_gpu_node(*moved)]
        return False

    if hasattr(current_op, 'make_gpu_node'):
        from_gpu_flags = [inp.owner and inp.owner.op == host_from_gpu
                          for inp in node.inputs]
        if all(from_gpu_flags):
            raw_inputs = [inp.owner.inputs[0] for inp in node.inputs]
            return [host_from_gpu(current_op.make_gpu_node(*raw_inputs))]

    return False
def local_gpu_conv_transp3d(node):
    """Replace a ``ConvTransp3D`` node with ``gpu_conv_transpd``.

    The rewrite fires when at least one input is produced by a
    ``HostFromGpu`` op and every output has dtype ``'float32'``. It then
    returns ``[host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]``;
    in every other case it returns ``None``.
    """
    if not isinstance(node.op, ConvTransp3D):
        return None

    some_input_from_gpu = numpy.any(
        [inp.owner and isinstance(inp.owner.op, HostFromGpu)
         for inp in node.inputs])
    if not some_input_from_gpu:
        return None

    all_float32 = numpy.all(
        [out.type.dtype == 'float32' for out in node.outputs])
    if not all_float32:
        return None

    W, b, d, H, RShape = node.inputs
    gpu_result = gpu_conv_transpd(W, b, d, H, RShape)
    return [host_from_gpu(gpu_result)]
def test_match_valid_conv():
    """Check FilterActs (no padding) against theano's conv2d in valid mode.

    Random images and filters are run through ``FilterActs`` and through
    ``conv2d`` with flipped filters; the test fails when the maximum
    absolute difference exceeds ``2.4e-6``, after printing diagnostics.
    Print statements were turned into ``print()`` calls so the block also
    parses on Python 3.
    """
    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
        name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
        name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    # conv2d works on (batch, channel, row, col) data, hence the shuffles;
    # the filters are also reversed along both spatial axes.
    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [output, output_conv2d])
    output, output_conv2d = f()

    warnings.warn(
        "test_match_valid_conv success criterion is not very strict. "
        "Can we verify that this is OK? One possibility is that theano "
        "is numerically unstable and Alex's code is better. Probably "
        "theano CPU 64 bit is OK but it's worth checking the others."
    )

    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print('cuda-convnet shape: ', output.shape)
            print('theano shape: ', output_conv2d.shape)
            assert False
        err = np.abs(output - output_conv2d)
        print('absolute error range: ', (err.min(), err.max()))
        print('mean absolute error: ', err.mean())
        print('cuda-convnet value range: ', (output.min(), output.max()))
        print('theano value range: ', (output_conv2d.min(), output_conv2d.max()))
        assert False
def local_to_gpu(node):
    """
    op(host_from_gpu()) -> host_from_gpu(op)

    gpu_from_host(op) -> op(gpu_from_host)

    ``op``, ``to_gpu`` and ``cuda`` come from the enclosing scope;
    ``to_gpu`` lists the input positions that are wrapped in
    ``cuda.gpu_from_host``. Returns a one-element replacement list, or
    ``False`` when neither pattern matches.
    """

    def _comes_from_gpu(var):
        # Same truthiness as the original inline test.
        return var.owner and isinstance(var.owner.op, cuda.HostFromGpu)

    def _with_gpu_inputs(inputs):
        # Copy the inputs and move the positions listed in to_gpu.
        moved = list(inputs)
        for pos in to_gpu:
            moved[pos] = cuda.gpu_from_host(moved[pos])
        return moved

    if isinstance(node.op, op):
        # op(host_from_gpu()) -> host_from_gpu(op)
        # Move the op as soon as one of the relevant inputs is on the GPU.
        if any(_comes_from_gpu(node.inputs[pos]) for pos in to_gpu):
            gpu_inputs = _with_gpu_inputs(node.inputs)
            return [cuda.host_from_gpu(op()(*gpu_inputs))]

    if node.op == cuda.gpu_from_host:
        # gpu_from_host(op) -> op(gpu_from_host)
        producer = node.inputs[0].owner
        if producer and isinstance(producer.op, op):
            gpu_inputs = _with_gpu_inputs(producer.inputs)
            return [op()(*gpu_inputs)]

    return False
def optimize(node):
    """Swap a ``CpuOpCls`` op for the matching ``GpuOpCls`` op.

    ``CpuOpCls``, ``GpuOpCls`` and ``cuda`` come from the enclosing scope.
    The GPU op is constructed from the CPU op's ``__props__`` names paired
    with the values returned by ``_props()``.

    Returns a one-element replacement list, or ``False`` when no pattern
    matches.
    """

    def _gpu_counterpart(cpu_op):
        # Rebuild the op on the GPU side with identical properties.
        props = dict(zip(cpu_op.__props__, cpu_op._props()))
        return GpuOpCls(**props)

    if isinstance(node.op, cuda.GpuFromHost):
        # gpu_from_host(cpu_op) -> gpu_op(gpu_from_host)
        producer = node.inputs[0].owner
        if producer and isinstance(producer.op, CpuOpCls):
            gpu_op = _gpu_counterpart(producer.op)
            return [gpu_op(*producer.inputs)]

    if isinstance(node.op, CpuOpCls):
        # cpu_op(host_from_gpu) -> host_from_gpu(gpu_op)
        fed_from_gpu = any(
            var.owner and isinstance(var.owner.op, cuda.HostFromGpu)
            for var in node.inputs)
        if fed_from_gpu:
            gpu_op = _gpu_counterpart(node.op)
            return [cuda.host_from_gpu(gpu_op(*node.inputs))]

    return False
def local_gpu_Contiguous(node):
    """Rewrite ``Contiguous(host_from_gpu(v))`` as
    ``host_from_gpu(gpu_contiguous(v))``.

    Returns the one-element replacement list when the single input of a
    ``Contiguous`` node is produced by ``host_from_gpu``; ``None`` in all
    other cases.
    """
    if not isinstance(node.op, Contiguous):
        return None

    # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
    from theano.sandbox.cuda import host_from_gpu

    (inp,) = node.inputs
    producer = inp.owner
    if producer and producer.op == host_from_gpu:
        from theano.sandbox.cuda.basic_ops import gpu_contiguous

        on_gpu = gpu_contiguous(producer.inputs[0])
        return [host_from_gpu(on_gpu)]
    return None
def test_attention_time_gauss():
    """Build and run the AttentionTimeGauss custom op once and print the result.

    The op returned by ``get_attention`` is called on symbolic inputs, its
    outputs are moved to the host, and the compiled function is evaluated
    on random float32 data. Print statements were turned into ``print()``
    calls so the block also parses on Python 3.

    NOTE(review): the test ends with an unconditional ``assert False``,
    presumably so the test runner shows the printed output -- confirm
    whether it should remain.
    """
    n_T = 4
    n_batch = 2
    n_inp_dim = 3
    n_cells = 5
    n_B = 5

    custom_op = get_attention(RecurrentTransform.AttentionTimeGauss,
                              n_out=n_cells, n_batches=n_batch,
                              n_input_t=n_B, n_input_dim=n_inp_dim)
    att = custom_op.recurrent_transform

    Z_val = numpy.random.ranf((n_T, n_batch, 4 * n_cells)).astype('float32')
    W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    W_att_quadr_val = numpy.eye(n_B).astype('float32')
    W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    B_val = numpy.random.ranf((n_B, n_batch, n_cells)).astype('float32')
    c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    i_val = numpy.ones((n_T, n_batch), dtype='int8')

    Z = T.ftensor3('Z')
    B = T.ftensor3('B')  # base
    W_re = T.fmatrix('W_re')
    W_att_quadr = T.fmatrix("W_att_quadr")
    W_att_in = T.fmatrix('W_att_in')
    c = T.fmatrix('c')  # initial state
    y0 = T.fmatrix('y0')  # initial activation
    i = T.matrix('i', dtype='int8')
    t0 = T.fvector('t0')

    custom_vars = att.get_sorted_custom_vars()
    initial_state_vars = att.get_sorted_state_vars_initial()
    custom_op_inputs = [Z, c, y0, i, W_re] + custom_vars + initial_state_vars
    print("input args num:", len(custom_op_inputs))
    print("input args:", custom_op_inputs)

    custom_op_outputs = custom_op(*custom_op_inputs)
    print("output args num:", len(custom_op_outputs))
    custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs]

    f = theano.function(inputs=[Z, c, y0, i, W_re], outputs=custom_op_outputs)
    res = f(Z_val, c_val, y0_val, i_val, W_re_val)

    # res: (output) Y, (gates and cell state) H, (final cell state) d,
    # followed by the state vars sequences
    (Y, H, d), state_var_seqs = res[:3], res[3:]

    print(res)
    assert False
def local_gpu_TorchWrapper(node):
    """Replace a ``TorchWrapperOp`` node by ``GpuTorchWrapperOp``.

    The rewrite fires when at least one input is produced by
    ``host_from_gpu``. The GPU op receives the same ``__props__`` values
    as the original op; inputs coming from ``host_from_gpu`` are replaced
    by their untransferred source, and the result is wrapped in
    ``host_from_gpu``. Returns ``None`` when nothing is rewritten.
    """
    if not isinstance(node.op, TorchWrapperOp):
        return None

    from theano.sandbox.cuda import host_from_gpu, gpu_from_host

    def _is_transferred(var):
        # Same truthiness as the original inline expression.
        return var.owner and var.owner.op == host_from_gpu

    if not any([_is_transferred(var) for var in node.inputs]):
        return None

    props = {key: getattr(node.op, key) for key in node.op.__props__}
    gpu_op = GpuTorchWrapperOp(**props)
    gpu_args = [var.owner.inputs[0] if _is_transferred(var) else var
                for var in node.inputs]
    return [host_from_gpu(gpu_op(*gpu_args))]
def test_grad():
    """Check the FilterActs gradient w.r.t. images against conv2d's gradient.

    The gradient of the summed output with respect to ``images`` is
    computed through ``FilterActs`` and through theano's ``conv2d``; the
    test fails when the maximum absolute difference exceeds ``7.7e-6``,
    after printing diagnostics. Print statements were turned into
    ``print()`` calls, and the warning now names this test instead of
    ``test_match_valid_conv``.
    """
    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(
        rng.uniform(-1.0, 1.0, (channels, rows, cols, batch_size)).astype("float32"),
        name="images",
    )
    filters = shared(
        rng.uniform(-1.0, 1.0, (channels, filter_rows, filter_cols, num_filters)).astype("float32"),
        name="filters",
    )

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)
    # XXX: use verify_grad
    output_grad = grad(output.sum(), images)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode="valid")
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)
    # XXX: use verify_grad
    output_conv2d_grad = grad(output_conv2d.sum(), images)

    f = function([], [output_grad, output_conv2d_grad])
    output_grad, output_conv2d_grad = f()

    warnings.warn(
        "test_grad success criterion is not very strict. "
        "Can we verify that this is OK? One possibility is that theano "
        "is numerically unstable and Alex's code is better. Probably "
        "theano CPU 64 bit is OK but it's worth checking the others."
    )

    if np.abs(output_grad - output_conv2d_grad).max() > 7.7e-6:
        assert type(output_grad) == type(output_conv2d_grad)
        assert output_grad.dtype == output_conv2d_grad.dtype
        if output_grad.shape != output_conv2d_grad.shape:
            print("cuda-convnet shape: ", output_grad.shape)
            print("theano shape: ", output_conv2d_grad.shape)
            assert False
        err = np.abs(output_grad - output_conv2d_grad)
        print("absolute error range: ", (err.min(), err.max()))
        print("mean absolute error: ", err.mean())
        print("cuda-convnet value range: ", (output_grad.min(), output_grad.max()))
        print("theano value range: ", (output_conv2d_grad.min(), output_conv2d_grad.max()))
        assert False
def test_match_valid_conv():
    """Check FilterActs (no padding) against theano's conv2d in valid mode.

    If compiling the comparison function raises, the test is reported as
    a ``KnownFailureTest``. Otherwise it fails when the maximum absolute
    difference exceeds ``2.4e-6``, after printing diagnostics.

    The former bare ``except:`` is narrowed to ``except Exception`` so
    that ``KeyboardInterrupt`` and ``SystemExit`` are no longer turned
    into a known failure. Print statements were turned into ``print()``
    calls.
    """
    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(
        rng.uniform(-1., 1., (channels, rows, cols, batch_size)).astype('float32'),
        name='images')
    filters = shared(
        rng.uniform(-1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
        name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    try:
        f = function([], [output, output_conv2d])
    except Exception:
        raise KnownFailureTest("cuda-convnet code depends on an unmerged theano feature.")

    output, output_conv2d = f()

    warnings.warn("test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?")

    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print('cuda-convnet shape: ', output.shape)
            print('theano shape: ', output_conv2d.shape)
            assert False
        err = np.abs(output - output_conv2d)
        print('absolute error range: ', (err.min(), err.max()))
        print('mean absolute error: ', err.mean())
        print('cuda-convnet value range: ', (output.min(), output.max()))
        print('theano value range: ', (output_conv2d.min(), output_conv2d.max()))
        assert False
def test_match_valid_conv_strided():
    """Check strided FilterActs against theano's conv2d with ``subsample``.

    ``FilterActs(stride=3)`` is compared with ``conv2d`` in valid mode
    using ``subsample=(stride, stride)``; the test fails when the maximum
    absolute difference exceeds ``2.4e-6``, after printing diagnostics.
    Print statements were turned into ``print()`` calls, and the warning
    now names this test instead of ``test_match_valid_conv``.
    """
    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 5
    rows = 9
    cols = 9
    channels = 3
    filter_rows = 3
    filter_cols = filter_rows
    stride = 3
    num_filters = 16

    images = shared(
        rng.uniform(-1., 1., (channels, rows, cols, batch_size)).astype('float32'),
        name='images')
    filters = shared(
        rng.uniform(-1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
        name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs(stride=stride)(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01,
                           border_mode='valid', subsample=(stride, stride))
    output_conv2d_orig = output_conv2d.dimshuffle(1, 2, 3, 0)
    # conv2d already subsamples, so no extra slicing is applied here.
    output_conv2d = output_conv2d_orig  # [:, ::stride, ::stride, :]

    f = function([], [output, output_conv2d, output_conv2d_orig])
    output, output_conv2d, output_conv2d_orig = f()

    warnings.warn(
        "test_match_valid_conv_strided success criterion is not very strict. "
        "Can we verify that this is OK? One possibility is that theano "
        "is numerically unstable and Alex's code is better. Probably "
        "theano CPU 64 bit is OK but it's worth checking the others."
    )

    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print('cuda-convnet shape: ', output.shape)
            print('theano shape: ', output_conv2d.shape)
            assert False
        err = np.abs(output - output_conv2d)
        print('absolute error range: ', (err.min(), err.max()))
        print('mean absolute error: ', err.mean())
        print('cuda-convnet value range: ', (output.min(), output.max()))
        print('theano value range: ', (output_conv2d.min(), output_conv2d.max()))
        assert False
def insert_gpu_filter_acts(node):
    """Replace a ``FilterActs`` node by ``GpuFilterActs`` wrapped in transfers.

    Returns a one-element list holding ``host_from_gpu`` of the GPU op
    output when ``any_from_gpu(images, filters)`` or
    ``any_gpu_client(*node.outputs)`` is true; otherwise ``None``.
    """
    if not isinstance(node.op, FilterActs):
        return None

    images, filters = node.inputs
    if not (any_from_gpu(images, filters) or any_gpu_client(*node.outputs)):
        return None

    # partial_sum is fixed to 1 here; only module_stride is taken from
    # the op being replaced.
    gpu_op = GpuFilterActs(module_stride=node.op.module_stride,
                           partial_sum=1)
    on_gpu = gpu_op(gpu_from_host(images), gpu_from_host(filters))
    return [host_from_gpu(on_gpu)]
def local_gpu_togpu_breakpoint(node):
    """Move a ``Breakpoint`` node to the GPU when its first input is there.

    When the first input is produced by ``host_from_gpu``, every input
    that comes from ``host_from_gpu`` is replaced by its untransferred
    source, ``node.op.make_gpu_node`` is called on them and the result is
    wrapped in ``host_from_gpu``. Returns ``False`` otherwise.
    """

    def _is_transferred(var):
        # Same truthiness as the original inline expression.
        return var.owner and var.owner.op == host_from_gpu

    if not isinstance(node.op, Breakpoint):
        return False

    if not _is_transferred(node.inputs[0]):
        return False

    gpu_inputs = [var.owner.inputs[0] if _is_transferred(var) else var
                  for var in node.inputs]
    gpu_node = node.op.make_gpu_node(*gpu_inputs)
    return [host_from_gpu(gpu_node)]
def test_attention_time_gauss():
    """Build and run the AttentionTimeGauss custom op once and print the result.

    The op returned by ``get_attention`` is called on symbolic inputs, its
    outputs are moved to the host, and the compiled function is evaluated
    on random float32 data. The test always ends in ``assert False``.
    """
    n_T, n_batch, n_inp_dim, n_cells, n_B = 4, 2, 3, 5, 5

    custom_op = get_attention(
        RecurrentTransform.AttentionTimeGauss,
        n_out=n_cells,
        n_batches=n_batch,
        n_input_t=n_B,
        n_input_dim=n_inp_dim,
    )
    att = custom_op.recurrent_transform

    def random_f32(*shape):
        # Uniform random float32 array of the given shape.
        return numpy.random.ranf(shape).astype('float32')

    # Concrete values (drawn in the same order as before).
    Z_val = random_f32(n_T, n_batch, 4 * n_cells)
    W_re_val = random_f32(n_cells, 4 * n_cells)
    W_att_quadr_val = numpy.eye(n_B).astype('float32')
    W_att_in_val = random_f32(n_cells, 4 * n_cells)
    B_val = random_f32(n_B, n_batch, n_cells)
    c_val = random_f32(n_batch, n_cells)
    y0_val = random_f32(n_batch, n_cells)
    i_val = numpy.ones((n_T, n_batch), dtype='int8')

    # Symbolic inputs.
    Z = T.ftensor3('Z')
    B = T.ftensor3('B')  # base
    W_re = T.fmatrix('W_re')
    W_att_quadr = T.fmatrix("W_att_quadr")
    W_att_in = T.fmatrix('W_att_in')
    c = T.fmatrix('c')  # initial state
    y0 = T.fmatrix('y0')  # initial activation
    i = T.matrix('i', dtype='int8')
    t0 = T.fvector('t0')

    explicit_inputs = [Z, c, y0, i, W_re]
    custom_vars = att.get_sorted_custom_vars()
    initial_state_vars = att.get_sorted_state_vars_initial()
    custom_op_inputs = explicit_inputs + custom_vars + initial_state_vars
    print("input args num:", len(custom_op_inputs))
    print("input args:", custom_op_inputs)

    custom_op_outputs = custom_op(*custom_op_inputs)
    print("output args num:", len(custom_op_outputs))
    host_outputs = []
    for gpu_out in custom_op_outputs:
        host_outputs.append(cuda.host_from_gpu(gpu_out))
    custom_op_outputs = host_outputs

    f = theano.function(inputs=[Z, c, y0, i, W_re], outputs=custom_op_outputs)
    res = f(Z_val, c_val, y0_val, i_val, W_re_val)

    # res: (output) Y, (gates and cell state) H, (final cell state) d,
    # followed by the state vars sequences
    head, state_var_seqs = res[:3], res[3:]
    Y, H, d = head

    print(res)
    assert False
def local_gpu_conv_grad3d(node):
    """Replace a ``ConvGrad3D`` node with ``gpu_conv_grad3d``.

    The rewrite fires when at least one input is produced by a
    ``HostFromGpu`` op and every output has dtype ``'float32'``. ``V`` and
    ``dCdH`` are passed through ``as_cuda_ndarray_variable``; ``d`` and
    ``WShape`` are forwarded unchanged. Returns ``None`` when nothing is
    rewritten.
    """
    if not isinstance(node.op, ConvGrad3D):
        return None

    some_input_from_gpu = numpy.any(
        [inp.owner and isinstance(inp.owner.op, HostFromGpu)
         for inp in node.inputs])
    if not some_input_from_gpu:
        return None

    all_float32 = numpy.all(
        [out.type.dtype == 'float32' for out in node.outputs])
    if not all_float32:
        return None

    V, d, WShape, dCdH = node.inputs
    gpu_result = gpu_conv_grad3d(as_cuda_ndarray_variable(V),
                                 d,
                                 WShape,
                                 as_cuda_ndarray_variable(dCdH))
    return [host_from_gpu(gpu_result)]
def test_viewop_gpu():
    """Check that ``ViewOp`` applied to a GPU variable returns the input data.

    Skipped (``SkipTest``) when ``cuda.cuda_available`` is false.
    """
    from theano.sandbox import cuda

    # Truthiness test instead of the former ``== False`` comparison.
    if not cuda.cuda_available:
        raise SkipTest('Optional package cuda disabled')

    _x = theano.tensor.fvector('x')
    x = cuda.gpu_from_host(_x)
    _out = theano.compile.ViewOp()(x)
    out = cuda.host_from_gpu(_out)
    f = theano.function([x], out, mode=mode_with_gpu)

    data = numpy.array([1, 2, 3], dtype='float32')
    assert numpy.allclose(f(data), data)
def insert_gpu_filter_acts(node):
    """Replace a ``FilterActs`` node by ``GpuFilterActs`` wrapped in transfers.

    Returns a one-element list holding ``host_from_gpu`` of the GPU op
    output when ``any_from_gpu(images, filters)`` or
    ``any_gpu_client(*node.outputs)`` is true; otherwise ``None``.
    """
    if not isinstance(node.op, FilterActs):
        return None

    images, filters = node.inputs
    if not (any_from_gpu(images, filters) or any_gpu_client(*node.outputs)):
        return None

    # partial_sum is fixed to 1 here; only module_stride is taken from
    # the op being replaced.
    gpu_op = GpuFilterActs(module_stride=node.op.module_stride,
                           partial_sum=1)
    on_gpu = gpu_op(gpu_from_host(images), gpu_from_host(filters))
    return [host_from_gpu(on_gpu)]
def local_gpu_NativeOp(node):
    """Replace a ``NativeOp`` node by ``GpuNativeOp``.

    The rewrite fires when at least one input is produced by
    ``host_from_gpu``. The GPU op receives the same ``__props__`` values
    as the original op; inputs coming from ``host_from_gpu`` are replaced
    by their untransferred source. The GPU op result is passed through
    ``make_var_tuple`` and each element is wrapped in ``host_from_gpu``.
    Returns ``None`` when nothing is rewritten.
    """
    if not isinstance(node.op, NativeOp):
        return None

    # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
    from theano.sandbox.cuda import host_from_gpu, gpu_from_host, as_cuda_ndarray_variable

    def _is_transferred(var):
        # Same truthiness as the original inline expression.
        return var.owner and var.owner.op == host_from_gpu

    if not any([_is_transferred(var) for var in node.inputs]):
        return None

    props = {key: getattr(node.op, key) for key in node.op.__props__}
    gpu_op = GpuNativeOp(**props)
    gpu_args = [var.owner.inputs[0] if _is_transferred(var) else var
                for var in node.inputs]

    from TheanoUtil import make_var_tuple

    gpu_outputs = make_var_tuple(gpu_op(*gpu_args))
    return [host_from_gpu(gpu_out) for gpu_out in gpu_outputs]
def test_viewop_gpu():
    """Check that ``ViewOp`` applied to a GPU variable returns the input data.

    Skipped (``SkipTest``) when ``cuda.cuda_available`` is false.
    """
    from theano.sandbox import cuda

    # Truthiness test instead of the former ``== False`` comparison.
    if not cuda.cuda_available:
        raise SkipTest("Optional package cuda disabled")

    _x = theano.tensor.fvector("x")
    x = cuda.gpu_from_host(_x)
    _out = theano.compile.ViewOp()(x)
    out = cuda.host_from_gpu(_out)
    f = theano.function([x], out, mode=mode_with_gpu)

    data = numpy.array([1, 2, 3], dtype="float32")
    assert numpy.allclose(f(data), data)
def lmul(self, x):
    """
    dot(x, A)

    aka, do convolution with input image x

    Parameters
    ----------
    x : variable
        5-D input (asserted) whose axes are described by
        ``self.input_axes``. If the string form of its Python type does
        not contain ``'Cuda'`` it is passed through ``gpu_from_host``
        first and the result is brought back with ``host_from_gpu``.

    Returns
    -------
    rval : variable
        Result with axes ordered according to ``self.output_axes``.
    """
    import logging

    log = logging.getLogger(__name__)

    check_cuda(str(type(self)) + ".lmul")

    # TODO Why is it CPU??
    # Former stdout debug output, now routed to the debug log.
    log.debug("lmul input type: %s", type(x))
    cpu = 'Cuda' not in str(type(x))

    if cpu:
        x = gpu_from_host(x)

    assert x.ndim == 5
    x_axes = self.input_axes
    assert len(x_axes) == 5

    # Axis order used for the FilterActs call below.
    op_axes = ('c', 0, 1, 't', 'b')

    if tuple(x_axes) != op_axes:
        log.debug("lmul: dimshuffling input axes %s to %s", x_axes, op_axes)
        x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

    # Merge the last two signal dimensions so the input becomes 4-D.
    _x_4d_shape = (self.signal_shape[0],
                   self.signal_shape[1],
                   self.signal_shape[2],
                   self.signal_shape[3] * self.signal_shape[4])
    x = x.reshape(_x_4d_shape)
    x = gpu_contiguous(x)

    rval = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])(x, self._filters)

    if cpu:
        rval = host_from_gpu(rval)

    rval = rval.reshape(
        (self.filter_shape[3],
         self.filter_shape[4],
         rval.shape[1],
         rval.shape[2],
         self.signal_shape[3],
         self.signal_shape[4]))
    rval = diagonal_subtensor(rval, 4, 0).sum(axis=0)

    # Format the output based on the output space
    rval_axes = self.output_axes
    assert len(rval_axes) == 5

    if tuple(rval_axes) != op_axes:
        rval = rval.dimshuffle(
            *[op_axes.index(axis) for axis in rval_axes])

    return rval
def insert_gpu_weight_acts(node):
    """Replace a ``WeightActs`` node by ``GpuWeightActs`` wrapped in transfers.

    Returns a one-element list holding ``host_from_gpu`` of the GPU op
    output when ``any_from_gpu(images, hidacts)`` or
    ``any_gpu_client(*node.outputs)`` is true; otherwise ``None``.
    """
    if not isinstance(node.op, WeightActs):
        return None

    images, hidacts, frows, fcols = node.inputs
    if not (any_from_gpu(images, hidacts) or any_gpu_client(*node.outputs)):
        return None

    # partial_sum is fixed to 1 here; only module_stride is taken from
    # the op being replaced.
    gpu_op = GpuWeightActs(module_stride=node.op.module_stride,
                           partial_sum=1)
    on_gpu = gpu_op(gpu_from_host(images), gpu_from_host(hidacts),
                    frows, fcols)
    return [host_from_gpu(on_gpu)]
def insert_gpu_img_acts(node):
    """Replace an ``ImgActs`` node by ``GpuImgActs`` wrapped in transfers.

    Returns a one-element list holding ``host_from_gpu`` of the GPU op
    output when ``any_from_gpu(filters, hidacts)`` or
    ``any_gpu_client(*node.outputs)`` is true; otherwise ``None``.
    """
    if not isinstance(node.op, ImgActs):
        return None

    filters, hidacts, irows, icols = node.inputs
    if not (any_from_gpu(filters, hidacts) or any_gpu_client(*node.outputs)):
        return None

    # partial_sum is fixed to 1 here; only module_stride is taken from
    # the op being replaced.
    gpu_op = GpuImgActs(module_stride=node.op.module_stride,
                        partial_sum=1)
    on_gpu = gpu_op(gpu_from_host(filters), gpu_from_host(hidacts),
                    irows, icols)
    return [host_from_gpu(on_gpu)]
def local_gpu_alloc_diagonal(node):
    """Rewrite ``AllocDiag(host_from_gpu(diag))`` to run on the GPU.

    When the node is an ``AllocDiag`` on a ``TensorType`` input produced
    by ``cuda.HostFromGpu``, a zero matrix of size
    ``diag.shape[0] x diag.shape[0]`` is allocated, moved to the GPU,
    passed with ``diag`` to ``IncDiagonalSubtensor`` and the result is
    wrapped in ``cuda.host_from_gpu``. Returns ``False`` in every other
    case.
    """
    matches = (isinstance(node.op, AllocDiag) and
               isinstance(node.inputs[0].type, theano.tensor.TensorType))
    if not matches:
        return False

    producer = node.inputs[0].owner
    if not (producer and isinstance(producer.op, cuda.HostFromGpu)):
        return False

    diag = producer.inputs[0]
    zero = numpy.asarray(0, dtype=diag.dtype)
    square_zeros = tensor.alloc(zero, diag.shape[0], diag.shape[0])
    y = cuda.gpu_from_host(square_zeros)
    y = theano.tensor.nnet.conv3d2d.IncDiagonalSubtensor()(y, 0, 1, diag)
    return [cuda.host_from_gpu(y)]
def test_image_acts_strided():
    """Check ImageActs against ImageActs_python for every stride.

    For each image/filter shape pair and each stride from 1 up to
    ``filters.shape[1]``, the GPU op result is compared with the Python
    reference using an absolute tolerance of ``2.1e-5``.

    Unlike the previous version, which only printed diagnostics, a mismatch
    above the tolerance now fails the test. The unused ``gpu_images``
    shared variable is no longer created.
    """
    rng = np.random.RandomState([2012, 10, 9])

    # Each entry of shape_list is [img_shape, filter_shape] with
    #   img_shape    = (channels, rows, cols, batch_size)
    #   filter_shape = (channels, filter_rows, filter_cols, num_filters)
    shape_list = [[(1, 7, 8, 5), (1, 2, 2, 16)],
                  [(3, 7, 8, 5), (3, 3, 3, 16)],
                  [(16, 11, 11, 4), (16, 4, 4, 16)],
                  [(3, 20, 20, 3), (3, 5, 5, 16)],
                  [(3, 21, 21, 3), (3, 6, 6, 16)],
                  ]

    for test_idx, (img_shape, filter_shape) in enumerate(shape_list):
        images = rng.uniform(-1., 1., img_shape).astype('float32')
        filters = rng.uniform(-1., 1., filter_shape).astype('float32')
        gpu_filters = float32_shared_constructor(filters, name='filters')
        print("test case %d..." % (test_idx + 1))

        for stride in range(1, filters.shape[1] + 1):
            # The forward pass is only needed for the shape of hidacts.
            output_python = FilterActs_python(images, filters, stride)
            hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
            gpu_hidacts = float32_shared_constructor(hidacts, name='hidacts')

            Img_output_python = ImageActs_python(
                filters, hidacts, stride, (images.shape[1], images.shape[2]))

            Img_output = ImageActs(stride=stride)(
                gpu_hidacts, gpu_filters,
                as_tensor_variable((images.shape[1], images.shape[2])))
            Img_output = host_from_gpu(Img_output)
            f = function([], Img_output)
            Img_output_val = f()

            warnings.warn("""test_image_acts_strided success criterion is not very strict.""")

            if np.abs(Img_output_val - Img_output_python).max() > 2.1e-5:
                assert type(Img_output_val) == type(Img_output_python)
                assert Img_output_val.dtype == Img_output_python.dtype
                if Img_output_val.shape != Img_output_python.shape:
                    print('cuda-convnet shape: ', Img_output_val.shape)
                    print('python conv shape: ', Img_output_python.shape)
                    assert False
                err = np.abs(Img_output_val - Img_output_python)
                print('stride %d' % stride)
                print('absolute error range: ', (err.min(), err.max()))
                print('mean absolute error: ', err.mean())
                print('cuda-convnet value range: ', (Img_output_val.min(), Img_output_val.max()))
                print('python conv value range: ', (Img_output_python.min(), Img_output_python.max()))
                # Previously the mismatch was reported but never failed.
                assert False, "ImageActs differs from the Python reference"
def test_filter_acts_strided():
    """Check FilterActs against FilterActs_python for every stride.

    For each image/filter shape pair and each stride from 1 up to
    ``filters.shape[1]``, the GPU op result is compared with the Python
    reference using an absolute tolerance of ``8.6e-6``.

    Unlike the previous version, which only printed diagnostics, a mismatch
    above the tolerance now fails the test.
    """
    rng = np.random.RandomState([2012, 10, 9])

    # Each entry of shape_list is [img_shape, filter_shape] with
    #   img_shape    = (channels, rows, cols, batch_size)
    #   filter_shape = (channels, filter_rows, filter_cols, num_filters)
    shape_list = [
        [(1, 7, 8, 5), (1, 2, 2, 16)],
        [(3, 7, 8, 5), (3, 3, 3, 16)],
        [(16, 11, 11, 4), (16, 4, 4, 16)],
        [(3, 20, 20, 3), (3, 5, 5, 16)],
        [(3, 21, 21, 3), (3, 6, 6, 16)],
    ]

    for test_idx, (img_shape, filter_shape) in enumerate(shape_list):
        images = rng.uniform(-1., 1., img_shape).astype('float32')
        filters = rng.uniform(-1., 1., filter_shape).astype('float32')
        gpu_images = float32_shared_constructor(images, name='images')
        gpu_filters = float32_shared_constructor(filters, name='filters')
        print("test case %d..." % (test_idx + 1))

        for stride in range(1, filters.shape[1] + 1):
            output = FilterActs(stride=stride)(gpu_images, gpu_filters)
            output = host_from_gpu(output)
            f = function([], output)
            output_val = f()

            output_python = FilterActs_python(images, filters, stride)

            if np.abs(output_val - output_python).max() > 8.6e-6:
                assert type(output_val) == type(output_python)
                assert output_val.dtype == output_python.dtype
                if output_val.shape != output_python.shape:
                    print('cuda-convnet shape: ', output_val.shape)
                    print('python conv shape: ', output_python.shape)
                    assert False
                err = np.abs(output_val - output_python)
                print('stride %d' % stride)
                print('absolute error range: ', (err.min(), err.max()))
                print('mean absolute error: ', err.mean())
                print('cuda-convnet value range: ', (output_val.min(), output_val.max()))
                print('python conv value range: ', (output_python.min(), output_python.max()))
                # Previously the mismatch was reported but never failed.
                assert False, "FilterActs differs from the Python reference"
def test_image_acts_strided(): # Tests that running FilterActs with all possible strides rng = np.random.RandomState([2012,10,9]) #Each list in shape_list : #[img_shape,filter_shape] #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)] shape_list = [[(1, 7, 8, 5), (1, 2, 2, 16)], [(3, 7, 8, 5), (3, 3, 3, 16)], [(16, 11, 11, 4), (16, 4, 4, 16)], [(3, 20, 20, 3), (3, 5, 5, 16)], [(3, 21, 21, 3), (3, 6, 6, 16)], ] for test_idx in xrange(len(shape_list)): images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32') filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32') gpu_images = float32_shared_constructor(images,name='images') gpu_filters = float32_shared_constructor(filters,name='filters') print "test case %d..."%(test_idx+1) for ii in xrange(filters.shape[1]): stride = ii + 1 output_python = FilterActs_python(images,filters,stride) hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32') gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts') Img_output_python = ImageActs_python(filters,hidacts,stride,(images.shape[1], images.shape[2])) Img_output = ImageActs(stride=stride)(gpu_hidacts, gpu_filters, as_tensor_variable((images.shape[1], images.shape[2]))) Img_output = host_from_gpu(Img_output) f = function([], Img_output) Img_output_val = f() warnings.warn("""test_image_acts_strided success criterion is not very strict.""") if np.abs(Img_output_val - Img_output_python).max() > 2.1e-5: assert type(Img_output_val) == type(Img_output_python) assert Img_output_val.dtype == Img_output_python.dtype if Img_output_val.shape != Img_output_python.shape: print 'cuda-convnet shape: ',Img_output_val.shape print 'python conv shape: ',Img_output_python.shape assert False err = np.abs(Img_output_val - Img_output_python) print 'stride %d'%stride print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', 
(Img_output_val.min(), Img_output_val.max()) print 'python conv value range: ', (Img_output_python.min(), Img_output_python.max())
def lmul(self, x):
    """
    dot(x, A), i.e. convolve the input image `x` with this object's filters.

    `x` is moved to the GPU when its type name does not contain 'Cuda',
    reordered from `self.input_axes` to the ('c', 0, 1, 'b') layout that
    FilterActs is fed, convolved, and finally returned in
    `self.output_axes` order (copied back to the host if `x` came from
    there).
    """
    check_cuda(str(type(self)) + ".lmul")

    came_from_host = 'Cuda' not in str(type(x))
    gpu_x = gpu_from_host(x) if came_from_host else x

    # FilterActs is given: channel, topo dim 0, topo dim 1, batch index.
    assert gpu_x.ndim == 4
    in_axes = self.input_axes
    assert len(in_axes) == 4

    op_axes = ('c', 0, 1, 'b')
    if tuple(in_axes) != op_axes:
        permutation = [in_axes.index(axis) for axis in op_axes]
        gpu_x = gpu_x.dimshuffle(*permutation)
    gpu_x = gpu_contiguous(gpu_x)

    # Objects unpickled from old files may lack this attribute.
    if not hasattr(self, 'kernel_stride'):
        self.kernel_stride = (1, 1)

    conv_op = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])
    result = conv_op(gpu_x, self._filters)

    # Put the result into the layout requested by the output space.
    out_axes = self.output_axes
    assert len(out_axes) == 4
    if came_from_host:
        result = host_from_gpu(result)
    if tuple(out_axes) != op_axes:
        permutation = [op_axes.index(axis) for axis in out_axes]
        result = result.dimshuffle(*permutation)
    return result
def lmul(self, x):
    """
    dot(x, A), i.e. convolve the input image `x` with this object's filters.

    Parameters
    ----------
    x : 4-D variable
        Input whose axes are ordered as described by `self.input_axes`.
        If its type name does not contain 'Cuda' it is transferred to the
        GPU first and the result is transferred back to the host.

    Returns
    -------
    rval : 4-D variable
        Output of FilterActs, with axes ordered as `self.output_axes`.
    """
    check_cuda(str(type(self)) + ".lmul")

    cpu = 'Cuda' not in str(type(x))
    if cpu:
        x = gpu_from_host(x)

    # x must be formatted as channel, topo dim 0, topo dim 1, batch_index
    # for use with FilterActs
    assert x.ndim == 4
    x_axes = self.input_axes
    assert len(x_axes) == 4

    op_axes = ('c', 0, 1, 'b')
    if tuple(x_axes) != op_axes:
        # For every axis the op expects, look up where it sits in the
        # input.  Iterating over x_axes here would yield the identity
        # permutation and leave the input in the wrong layout.
        x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

    x = gpu_contiguous(x)

    # Patch old pickle files.
    if not hasattr(self, 'kernel_stride'):
        self.kernel_stride = (1, 1)

    rval = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])(
        x, self._filters
    )

    # Format the output based on the output space
    rval_axes = self.output_axes
    assert len(rval_axes) == 4

    if cpu:
        rval = host_from_gpu(rval)
    if tuple(rval_axes) != op_axes:
        rval = rval.dimshuffle(*[op_axes.index(axis) for axis in rval_axes])

    return rval
def _local_gpu_native_op(node):
    """
    Replace a TheanoNativeOp node by its GPU counterpart where possible.

    If `node` applies a TheanoNativeOp and at least one of its inputs is
    the direct result of `host_from_gpu`, a TheanoGpuNativeOp is built with
    the same props, applied to the GPU-side inputs (other inputs are passed
    through unchanged), and its outputs are returned wrapped in
    `host_from_gpu`.  In every other case the function returns None.

    :param node: the apply node to inspect
    :return: list of replacement outputs, or None
    """
    if not isinstance(node.op, TheanoNativeOp):
        return None
    # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
    # noinspection PyUnresolvedReferences,PyPackageRequirements
    from theano.sandbox.cuda import host_from_gpu, gpu_from_host, as_cuda_ndarray_variable

    def _is_host_copy(var):
        # True-ish when `var` was produced by a host_from_gpu transfer.
        return var.owner and var.owner.op == host_from_gpu

    inputs = node.inputs
    if not any([_is_host_copy(var) for var in inputs]):
        return None

    props = {key: getattr(node.op, key) for key in node.op.__props__}
    gpu_op = TheanoGpuNativeOp(**props)
    # Skip the transfer and use the variable that already lives on the GPU.
    gpu_side = []
    for var in inputs:
        gpu_side.append(var.owner.inputs[0] if _is_host_copy(var) else var)

    from returnn.theano.util import make_var_tuple
    # noinspection PyCallingNonCallable
    gpu_outputs = make_var_tuple(gpu_op(*gpu_side))
    return [host_from_gpu(out) for out in gpu_outputs]
def grab_lr(v):
    """
    Recover the variable behind a value broadcast along four new axes.

    * GpuDimShuffle with new_order ("x", "x", "x", "x"): its input, moved
      to the host with `host_from_gpu`.
    * DimShuffle with the same new_order: its input.
    * GpuFromHost: the search continues on its input.
    * A Constant that is broadcastable along all four axes: the constant
      dimshuffled down to zero dimensions.

    Anything else yields None.
    """
    node = v.owner
    if node is None:
        if isinstance(v, Constant) and v.broadcastable == (True, True, True, True):
            return v.dimshuffle(())
        return None

    all_new_axes = ("x", "x", "x", "x")
    if isinstance(node.op, GpuDimShuffle) and node.op.new_order == all_new_axes:
        return host_from_gpu(node.inputs[0])
    if isinstance(node.op, DimShuffle) and node.op.new_order == all_new_axes:
        return node.inputs[0]
    if isinstance(node.op, GpuFromHost):
        return grab_lr(node.inputs[0])
    return None
def insert_gpu_img_acts(node):
    """
    Swap an ImgActs node for GpuImgActs when the GPU is already involved.

    The replacement happens when `any_from_gpu(filters, hidacts)` or
    `any_gpu_client(*node.outputs)` is true.  The GPU op keeps the node's
    `module_stride` and uses `partial_sum=1`.  The filters and hidden
    activations are transferred with `gpu_from_host`; `irows` and `icols`
    are passed as they are.  The result is returned on the host, as a
    one-element list.  Returns None when no replacement is made.
    """
    if not isinstance(node.op, ImgActs):
        return None

    filters, hidacts, irows, icols = node.inputs
    gpu_involved = (any_from_gpu(filters, hidacts)
                    or any_gpu_client(*node.outputs))
    if not gpu_involved:
        return None

    gpu_op = GpuImgActs(module_stride=node.op.module_stride, partial_sum=1)
    on_gpu = gpu_op(gpu_from_host(filters), gpu_from_host(hidacts),
                    irows, icols)
    return [host_from_gpu(on_gpu)]
def local_gpu_togpu(node):
    """
    Move ops that provide `make_gpu_node` onto the GPU.

    Two patterns are handled:

    * ``gpu_from_host(op(inputs...))`` where `op` has `make_gpu_node`:
      the inputs are transferred with `gpu_from_host` and the GPU node is
      returned directly.
    * ``op(host_from_gpu(a), host_from_gpu(b), ...)`` where `op` has
      `make_gpu_node` and *every* input is a `host_from_gpu` result: the
      GPU node is built on the GPU-side variables and its result is
      returned wrapped in `host_from_gpu`.

    Returns a one-element list with the replacement, or False when the
    node is left alone (including when an input cannot be transferred and
    `gpu_from_host` raises TypeError).
    """
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and \
           hasattr(host_input.owner.op, 'make_gpu_node'):
            try:
                # Build the list eagerly.  A lazy map() (Python 3) would
                # postpone the conversions until the call below, outside
                # this try block, so the TypeError would escape.
                gpu_inputs = [gpu_from_host(inp)
                              for inp in host_input.owner.inputs]
            except TypeError:
                return False
            return [host_input.owner.op.make_gpu_node(*gpu_inputs)]
    elif hasattr(node.op, 'make_gpu_node') and \
            all([x.owner and x.owner.op == host_from_gpu
                 for x in node.inputs]):
        # Every input already lives on the GPU; use those variables.
        gpu_inputs = [x.owner.inputs[0] for x in node.inputs]
        return [host_from_gpu(node.op.make_gpu_node(*gpu_inputs))]
    return False
def grab_lr(v):
    """
    Look through transfers for the source of a four-axis broadcast value.

    GpuFromHost nodes are skipped.  The first other node decides the
    result: a GpuDimShuffle with new_order ('x', 'x', 'x', 'x') gives its
    input copied to the host, a DimShuffle with that new_order gives its
    input, and any other op gives None.  A variable without an owner
    gives its zero-dimensional dimshuffle if it is a Constant
    broadcastable along all four axes, otherwise None.
    """
    broadcast_order = ('x', 'x', 'x', 'x')
    current = v
    while True:
        producer = current.owner
        if producer is None:
            is_broadcast_const = (
                isinstance(current, Constant)
                and current.broadcastable == (True, True, True, True))
            return current.dimshuffle(()) if is_broadcast_const else None

        op = producer.op
        if isinstance(op, GpuDimShuffle) and op.new_order == broadcast_order:
            return host_from_gpu(producer.inputs[0])
        elif isinstance(op, DimShuffle) and op.new_order == broadcast_order:
            return producer.inputs[0]
        elif isinstance(op, GpuFromHost):
            # Step over the transfer and inspect what was transferred.
            current = producer.inputs[0]
        else:
            return None
def test_filter_acts_strided(): # Tests that FilterActs with all possible strides rng = np.random.RandomState([2012,10,9]) #Each list in shape_list : #[img_shape,filter_shape] #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)] shape_list = [[(1, 7, 8, 5), (1, 2, 2, 16)], [(3, 7, 8, 5), (3, 3, 3, 16)], [(16, 11, 11, 4), (16, 4, 4, 16)], [(3, 20, 20, 3), (3, 5, 5, 16)], [(3, 21, 21, 3), (3, 6, 6, 16)], ] for test_idx in xrange(len(shape_list)): images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32') filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32') gpu_images = float32_shared_constructor(images,name='images') gpu_filters = float32_shared_constructor(filters,name='filters') print "test case %d..."%(test_idx+1) for ii in xrange(filters.shape[1]): stride = ii + 1 output = FilterActs(stride=stride)(gpu_images, gpu_filters) output = host_from_gpu(output) f = function([], output) output_val = f() output_python = FilterActs_python(images,filters,stride) if np.abs(output_val - output_python).max() > 8.6e-6: assert type(output_val) == type(output_python) assert output_val.dtype == output_python.dtype if output_val.shape != output_python.shape: print 'cuda-convnet shape: ',output_val.shape print 'python conv shape: ',output_python.shape assert False err = np.abs(output_val - output_python) print 'stride %d'%stride print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output_val.min(), output_val.max()) print 'python conv value range: ', (output_python.min(), output_python.max())
def make_bwd_fun(recurrent_transform):
    """
    Compile the backward-step function for `recurrent_transform`.

    One step is built symbolically via `recurrent_transform.step(y_p)`.
    Given the gradient w.r.t. the step output (`Dz_re`) and w.r.t. each new
    state variable, the gradients w.r.t. `y_p`, the custom variables and the
    previous state variables are derived with `T.grad(known_grads=...)`.
    The compiled function returns nothing; it writes the results into shared
    variables.  Custom-variable gradients are accumulated, the others are
    overwritten.

    NOTE(review): `debug_function_hook` is not defined in this block; it is
    presumably provided by an enclosing scope -- confirm.

    :param recurrent_transform: object providing y_p, step(), tt, layer,
        force_gpu, get_sorted_custom_vars() and get_sorted_state_vars()
    :return: (bwd_fun, custom_reset_fn, out_Dy_p,
        out_custom_grads + out_state_var_prev_grads)
    """
    rt = recurrent_transform
    y_p = rt.y_p
    z_re, state_updates = rt.step(y_p)
    custom_vars = rt.get_sorted_custom_vars()
    state_vars_prev = rt.get_sorted_state_vars()
    Dz_re = rt.tt.fmatrix("Dz_re")

    # One symbolic gradient input per new state variable.
    state_var_new_grads = {state_updates[prev]: prev.type("D_" + prev.name)
                           for prev in state_vars_prev}
    state_var_new_grads_list = [state_var_new_grads[state_updates[prev]]
                                for prev in state_vars_prev]

    known_grads = {z_re: Dz_re}
    known_grads.update(state_var_new_grads)
    if rt.force_gpu:
        # We need the symbolic host representation.
        # See HostFromGpu.grad(). It expects that the output_grads are on
        # the host, i.e. from type T.TensorType.  When this is taken out of
        # known_grads, it will fail because they are all CudaNdarrayType.
        # This should anyway be optimized all away and fully taken to the
        # GPU in the final function.
        for key, grad_var in known_grads.items():
            known_grads[key] = theano_cuda.host_from_gpu(grad_var)

    n_custom = len(custom_vars)
    all_wrt = [y_p] + custom_vars + state_vars_prev
    all_grads = T.grad(None, all_wrt, known_grads=OrderedDict(known_grads),
                       disconnected_inputs="ignore")
    assert len(all_grads) == 1 + len(custom_vars) + len(state_vars_prev)
    Dy_p = all_grads[0]
    custom_grads = all_grads[1:n_custom + 1]
    state_var_prev_grads = all_grads[n_custom + 1:]

    def _zero_shared(var):
        # Placeholder shared variable with var's number of dimensions.
        return rt.layer.shared(
            value=numpy.zeros([1] * var.ndim, dtype="float32"),
            name="out_D_" + var.name)

    out_Dy_p = rt.layer.shared(value=numpy.zeros((1, 1), dtype="float32"),
                               name="out_Dy_p")
    out_custom_grads = [_zero_shared(var) for var in custom_vars]
    out_state_var_prev_grads = [_zero_shared(var) for var in state_vars_prev]

    updates = [(out_Dy_p, Dy_p)]
    # we accumulate the custom input grads
    updates.extend((out, out + grad)
                   for out, grad in zip(out_custom_grads, custom_grads))
    updates.extend((out, grad)
                   for out, grad in zip(out_state_var_prev_grads,
                                        state_var_prev_grads))

    bwd_inputs = ([y_p] + custom_vars + state_vars_prev + [Dz_re]
                  + state_var_new_grads_list)
    bwd_fun = theano.function(inputs=bwd_inputs, outputs=[], updates=updates,
                              on_unused_input="ignore")

    # Before we can accumulate the custom input grads, we need to
    # initialize them with 0.
    custom_reset_updates = [(out, T.zeros_like(var))
                            for out, var in zip(out_custom_grads, custom_vars)]
    custom_reset_fn = theano.function(inputs=custom_vars, outputs=None,
                                      updates=custom_reset_updates)

    if debug_function_hook:
        bwd_fun = debug_make_theano_function_wrapper(
            bwd_fun, "att_%i_bwd" % id(recurrent_transform),
            debug_function_hook, [])
    return (bwd_fun, custom_reset_fn, out_Dy_p,
            out_custom_grads + out_state_var_prev_grads)
def lmul(self, x):
    """
    dot(x, A), i.e. convolve the input image `x` with this object's filters.

    The input is moved to the GPU if its type name does not contain 'Cuda',
    reordered from `self.input_axes` to ('c', 0, 1, 'b'), passed through
    FilterActs, reordered to `self.output_axes`, and copied back to the
    host when the input came from there.
    """
    came_from_host = 'Cuda' not in str(type(x))
    gpu_x = gpu_from_host(x) if came_from_host else x

    # FilterActs is given: channel, topo dim 0, topo dim 1, batch index.
    assert gpu_x.ndim == 4
    in_axes = self.input_axes
    assert len(in_axes) == 4

    op_axes = ('c', 0, 1, 'b')
    if tuple(in_axes) != op_axes:
        to_op_order = [in_axes.index(axis) for axis in op_axes]
        gpu_x = gpu_x.dimshuffle(*to_op_order)
    gpu_x = gpu_contiguous(gpu_x)

    conv_op = FilterActs(self.pad, self.partial_sum)
    result = conv_op(gpu_x, self._filters)

    # Put the result into the layout requested by the output space.
    out_axes = self.output_axes
    assert len(out_axes) == 4
    if tuple(out_axes) != op_axes:
        to_out_order = [op_axes.index(axis) for axis in out_axes]
        result = result.dimshuffle(*to_out_order)

    return host_from_gpu(result) if came_from_host else result
def lmul(self, x):
    """
    dot(x, A): apply FilterActs with this object's filters to `x`.

    `x` must be 4-D with axes ordered as `self.input_axes`; the result has
    axes ordered as `self.output_axes`.  A host input (type name without
    'Cuda') is transferred to the GPU and the result transferred back.
    """
    def _reorder(var, src_axes, dst_axes):
        # Dimshuffle var from src_axes order to dst_axes order if they differ.
        if tuple(src_axes) == tuple(dst_axes):
            return var
        return var.dimshuffle(*[src_axes.index(axis) for axis in dst_axes])

    is_host_input = 'Cuda' not in str(type(x))
    if is_host_input:
        x = gpu_from_host(x)

    # Layout fed to FilterActs: channel, topo dim 0, topo dim 1, batch index.
    op_axes = ('c', 0, 1, 'b')

    assert x.ndim == 4
    source_axes = self.input_axes
    assert len(source_axes) == 4
    x = gpu_contiguous(_reorder(x, source_axes, op_axes))

    out = FilterActs(self.pad, self.partial_sum)(x, self._filters)

    # Format the output based on the output space.
    target_axes = self.output_axes
    assert len(target_axes) == 4
    out = _reorder(out, op_axes, target_axes)

    if is_host_input:
        out = host_from_gpu(out)
    return out
def test_match_full_conv_grad():
    """
    Tests that the gradient of ImageActs with no padding is the same as the
    gradient of theano's conv2D in full mode after flipping the kernel and
    transposing the output and input channels.

    Both outputs are projected onto the same random tensor and the
    gradients of the two scalars with respect to the hidden activations
    and the filters are compared.

    Raises
    ------
    AssertionError
        If the gradient shapes differ, or if either gradient differs by
        more than its tolerance (the maximum absolute difference is the
        exception argument).
    """
    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(rng.uniform(-1., 1., (num_filters,
                                            rows - filter_rows + 1,
                                            cols - filter_cols + 1,
                                            batch_size)
                                  ).astype('float32'), name='hidacts')
    filters = shared(rng.uniform(-1., 1., (channels, filter_rows,
                                           filter_cols, num_filters)
                                 ).astype('float32'), name='filters')

    gpu_hid_acts = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    # The requested image shape is (rows, cols), i.e. (6, 7); use the named
    # values so it stays in sync with the hidden-activation shape above.
    output = ImageActs()(gpu_hid_acts, gpu_filters,
                         as_tensor_variable((rows, cols)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    theano_rng = MRG_RandomStreams(5 * 10 * 2013)
    projection = theano_rng.normal(size=output_conv2d.shape,
                                   dtype=output_conv2d.dtype)

    projected = (output * projection).sum()
    projected_conv_2d = (output_conv2d * projection).sum()

    grads = (T.grad(projected, [hid_acts, filters]) +
             T.grad(projected_conv_2d, [hid_acts, filters]))

    f = function([], grads)
    gi, gf, gi_th, gf_th = f()

    assert gi.shape == gi_th.shape
    diff = np.abs(gi - gi_th).max()
    if diff > 2.9e-6:
        # Report the offending value, like the filter check below.
        raise AssertionError(diff)
    diff = np.abs(gf - gf_th).max()
    if diff > 1e-6:
        raise AssertionError(diff)
def test_match_full_conv(): # Tests that running ImageActs with no padding is the same as running # theano's conv2D in full mode after flipping the kernel and tranposing # the output and input channels # In other words, if convolution computes H=XK, we now compute # R=HK^T rng = np.random.RandomState([2013, 1, 29]) batch_size = 2 rows = 6 cols = 7 channels = 3 filter_rows = 5 filter_cols = filter_rows num_filters = 16 hid_acts = shared(rng.uniform(-1., 1., (num_filters, rows - filter_rows + 1, cols - filter_cols + 1, batch_size) ).astype('float32'), name='hidacts') filters = shared(rng.uniform(-1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(hid_acts) gpu_filters = gpu_from_host(filters) output = ImageActs()(gpu_images, gpu_filters, as_tensor_variable((6, 7))) output = host_from_gpu(output) images_bc01 = hid_acts.dimshuffle(3,0,1,2) filters_bc01 = filters.dimshuffle(3,0,1,2) # need to tranpose the kernel stack to do imgActs rather than filterActs filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3) # In order to do the transpose operation, we must flip the kernels # But in theano's conv2d, the kernels get flipped anyway # so in this case, we do not flip the kernel output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full') output_conv2d = output_conv2d.dimshuffle(1,2,3,0) f = function([], [output, output_conv2d]) output, output_conv2d = f() warnings.warn("""test_match_full_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. 
Probably theano CPU 64 bit is OK but it's worth checking the others.""") if np.abs(output - output_conv2d).max() > 2.4e-6: assert type(output) == type(output_conv2d) assert output.dtype == output_conv2d.dtype if output.shape != output_conv2d.shape: print 'cuda-convnet shape: ',output.shape print 'theano shape: ',output_conv2d.shape assert False err = np.abs(output - output_conv2d) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output.min(), output.max()) print 'theano value range: ', (output_conv2d.min(), output_conv2d.max()) assert False
def test_match_valid_conv(): # Tests that running FilterActs with no padding is the same as running # theano's conv2D in valid mode rng = np.random.RandomState([2012, 10, 9]) batch_size = 5 rows = 10 cols = 9 channels = 3 filter_rows = 4 filter_cols = filter_rows num_filters = 16 images = shared(rng.uniform( -1., 1., (channels, rows, cols, batch_size)).astype('float32'), name='images') filters = shared(rng.uniform( -1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs()(gpu_images, gpu_filters) output = host_from_gpu(output) images_bc01 = images.dimshuffle(3, 0, 1, 2) filters_bc01 = filters.dimshuffle(3, 0, 1, 2) filters_bc01 = filters_bc01[:, :, ::-1, ::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid') output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0) try: f = function([], [output, output_conv2d]) except: raise KnownFailureTest( "cuda-convnet code depends on an unmerged theano feature.") output, output_conv2d = f() warnings.warn( "test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?" ) if np.abs(output - output_conv2d).max() > 2.4e-6: assert type(output) == type(output_conv2d) assert output.dtype == output_conv2d.dtype if output.shape != output_conv2d.shape: print 'cuda-convnet shape: ', output.shape print 'theano shape: ', output_conv2d.shape assert False err = np.abs(output - output_conv2d) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output.min(), output.max()) print 'theano value range: ', (output_conv2d.min(), output_conv2d.max()) assert False
def test_match_full_conv_grad():
    """
    Tests that the gradient of ImageActs with no padding is the same as the
    gradient of theano's conv2D in full mode after flipping the kernel and
    transposing the output and input channels.

    The two outputs are weighted by one shared random tensor and summed;
    the gradients of those sums with respect to the hidden activations and
    the filters must agree.

    Raises
    ------
    AssertionError
        On a shape mismatch, or carrying the maximum absolute difference
        when a gradient pair differs by more than its tolerance.
    """
    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(rng.uniform(
        -1., 1., (num_filters, rows - filter_rows + 1,
                  cols - filter_cols + 1, batch_size)).astype('float32'),
        name='hidacts')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
        name='filters')

    gpu_hid_acts = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    # (rows, cols) equals the previously hard-coded (6, 7).
    output = ImageActs()(gpu_hid_acts, gpu_filters,
                         as_tensor_variable((rows, cols)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    theano_rng = MRG_RandomStreams(5 * 10 * 2013)
    weights = theano_rng.normal(size=output_conv2d.shape,
                                dtype=output_conv2d.dtype)

    projected = (output * weights).sum()
    projected_conv_2d = (output_conv2d * weights).sum()

    wrt = [hid_acts, filters]
    grads = T.grad(projected, wrt) + T.grad(projected_conv_2d, wrt)

    f = function([], grads)
    gi, gf, gi_th, gf_th = f()

    assert gi.shape == gi_th.shape
    # Both checks report the offending difference instead of a bare
    # "assert False".
    for ours, theirs, tolerance in ((gi, gi_th, 2.9e-6), (gf, gf_th, 1e-6)):
        diff = np.abs(ours - theirs).max()
        if diff > tolerance:
            raise AssertionError(diff)
def test_match_full_conv(): # Tests that running ImageActs with no padding is the same as running # theano's conv2D in full mode after flipping the kernel and tranposing # the output and input channels # In other words, if convolution computes H=XK, we now compute # R=HK^T rng = np.random.RandomState([2013, 1, 29]) batch_size = 2 rows = 6 cols = 7 channels = 3 filter_rows = 5 filter_cols = filter_rows num_filters = 16 hid_acts = shared(rng.uniform( -1., 1., (num_filters, rows - filter_rows + 1, cols - filter_cols + 1, batch_size)).astype('float32'), name='hidacts') filters = shared(rng.uniform( -1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(hid_acts) gpu_filters = gpu_from_host(filters) output = ImageActs()(gpu_images, gpu_filters, as_tensor_variable((6, 7))) output = host_from_gpu(output) images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2) filters_bc01 = filters.dimshuffle(3, 0, 1, 2) # need to tranpose the kernel stack to do imgActs rather than filterActs filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3) # In order to do the transpose operation, we must flip the kernels # But in theano's conv2d, the kernels get flipped anyway # so in this case, we do not flip the kernel output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full') output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0) f = function([], [output, output_conv2d]) output, output_conv2d = f() warnings.warn( """test_match_full_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. 
Probably theano CPU 64 bit is OK but it's worth checking the others.""" ) if np.abs(output - output_conv2d).max() > 2.4e-6: assert type(output) == type(output_conv2d) assert output.dtype == output_conv2d.dtype if output.shape != output_conv2d.shape: print 'cuda-convnet shape: ', output.shape print 'theano shape: ', output_conv2d.shape assert False err = np.abs(output - output_conv2d) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output.min(), output.max()) print 'theano value range: ', (output_conv2d.min(), output_conv2d.max()) assert False
def test_match_grad_valid_conv():
    """
    Tests that WeightActs is the gradient of FilterActs with respect to the
    weights.

    For each `partial_sum` in (0, 1, 4), the FilterActs output is compared
    with theano's conv2d in valid mode, and the WeightActs result is
    compared with the conv2d weight gradient.  Both costs use the same
    random coefficients.

    Raises
    ------
    AssertionError
        If the outputs differ by more than 8e-6 or the weight gradients
        differ by more than 8.6e-6 (diagnostics are printed first), or on
        a type, dtype or shape mismatch.
    """
    for partial_sum in [0, 1, 4]:
        # Re-seeded on every iteration so each partial_sum sees the same data.
        rng = np.random.RandomState([2012, 10, 9])

        batch_size = 3
        rows = 7
        cols = 9
        channels = 8
        filter_rows = 4
        filter_cols = filter_rows
        num_filters = 16

        images = shared(rng.uniform(-1., 1., (channels, rows, cols,
                                              batch_size)).astype('float32'),
                        name='images')
        filters = rng.uniform(-1., 1.,
                              (channels, filter_rows,
                               filter_cols, num_filters)).astype('float32')
        filters = shared(filters, name='filters')

        gpu_images = gpu_from_host(images)
        gpu_filters = gpu_from_host(filters)

        output = FilterActs(partial_sum=partial_sum)(gpu_images, gpu_filters)
        output = host_from_gpu(output)

        images_bc01 = images.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
        # The reference conv2d is given spatially reversed filters.
        filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

        output_conv2d = conv2d(images_bc01, filters_bc01,
                               border_mode='valid')
        output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

        theano_rng = MRG_RandomStreams(2013 + 1 + 31)
        coeffs = theano_rng.normal(avg=0., std=1.,
                                   size=output_conv2d.shape, dtype='float32')

        cost_conv2d = (coeffs * output_conv2d).sum()
        weights_grad_conv2d = T.grad(cost_conv2d, filters)

        cost = (coeffs * output).sum()
        hid_acts_grad = T.grad(cost, output)

        # (filter_rows, filter_cols) equals the previously hard-coded (4, 4).
        weights_grad = WeightActs(partial_sum=partial_sum)(
            gpu_images,
            gpu_from_host(hid_acts_grad),
            as_tensor_variable((filter_rows, filter_cols))
        )[0]
        weights_grad = host_from_gpu(weights_grad)

        f = function([], [output, output_conv2d, weights_grad,
                          weights_grad_conv2d])
        output, output_conv2d, weights_grad, weights_grad_conv2d = f()

        if np.abs(output - output_conv2d).max() > 8e-6:
            assert type(output) == type(output_conv2d)
            assert output.dtype == output_conv2d.dtype
            if output.shape != output_conv2d.shape:
                print('cuda-convnet shape: ', output.shape)
                print('theano shape: ', output_conv2d.shape)
                assert False
            err = np.abs(output - output_conv2d)
            print('absolute error range: ', (err.min(), err.max()))
            print('mean absolute error: ', err.mean())
            print('cuda-convnet value range: ',
                  (output.min(), output.max()))
            print('theano value range: ',
                  (output_conv2d.min(), output_conv2d.max()))
            assert False

        warnings.warn(
            "test_match_grad_valid_conv success criterion is not very strict."
            " Can we verify that this is OK? One possibility is that theano"
            " is numerically unstable and Alex's code is better. Probably"
            " theano CPU 64 bit is OK but it's worth checking the others.")

        if np.abs(weights_grad - weights_grad_conv2d).max() > 8.6e-6:
            if type(weights_grad) != type(weights_grad_conv2d):
                # Report the type, as the message says, not the whole array.
                raise AssertionError("weights_grad is of type " +
                                     str(type(weights_grad)))
            assert weights_grad.dtype == weights_grad_conv2d.dtype
            if weights_grad.shape != weights_grad_conv2d.shape:
                print('cuda-convnet shape: ', weights_grad.shape)
                print('theano shape: ', weights_grad_conv2d.shape)
                assert False
            err = np.abs(weights_grad - weights_grad_conv2d)
            print('absolute error range: ', (err.min(), err.max()))
            print('mean absolute error: ', err.mean())
            print('cuda-convnet value range: ',
                  (weights_grad.min(), weights_grad.max()))
            print('theano value range: ',
                  (weights_grad_conv2d.min(), weights_grad_conv2d.max()))
            assert False
def test_grad(): rng = np.random.RandomState([2012, 10, 9]) batch_size = 5 rows = 10 cols = 9 channels = 3 filter_rows = 4 filter_cols = filter_rows num_filters = 16 images = shared(rng.uniform( -1., 1., (channels, rows, cols, batch_size)).astype('float32'), name='images') filters = shared(rng.uniform( -1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs()(gpu_images, gpu_filters) output = host_from_gpu(output) # Proper random projection, like verify_grad does. cost_weights = rng.normal(size=(num_filters, rows - filter_rows + 1, cols - filter_cols + 1, batch_size)) cost = (constant(cost_weights) * output).sum() images_bc01 = images.dimshuffle(3, 0, 1, 2) filters_bc01 = filters.dimshuffle(3, 0, 1, 2) filters_bc01 = filters_bc01[:, :, ::-1, ::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid') output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0) # XXX: use verify_grad images_grad, filters_grad = grad(cost.sum(), [images, filters]) reference_cost = (constant(cost_weights) * output_conv2d).sum() images_conv2d_grad, filters_conv2d_grad = grad(reference_cost, [images, filters]) f = function( [], [images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad]) images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad = f() warnings.warn( """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. 
Probably theano CPU 64 bit is OK but it's worth checking the others.""" ) # XXX: Refactor if np.abs(images_grad - images_conv2d_grad).max() > 1.15e-5: print "=== IMAGES GRADIENT ===" assert type(images_grad) == type(images_conv2d_grad) assert images_grad.dtype == images_conv2d_grad.dtype if images_grad.shape != images_conv2d_grad.shape: print 'cuda-convnet shape: ', images_grad.shape print 'theano shape: ', images_conv2d_grad.shape assert False err = np.abs(images_grad - images_conv2d_grad) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (images_grad.min(), images_grad.max()) print 'theano value range: ', (images_conv2d_grad.min(), images_conv2d_grad.max()) assert False if np.abs(filters_grad - filters_conv2d_grad).max() > 1.15e-5: print "=== FILTERS GRADIENT ===" assert type(filters_grad) == type(filters_conv2d_grad) assert filters_grad.dtype == filters_conv2d_grad.dtype if filters_grad.shape != filters_conv2d_grad.shape: print 'cuda-convnet shape: ', filters_grad.shape print 'theano shape: ', filters_conv2d_grad.shape assert False err = np.abs(filters_grad - filters_conv2d_grad) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (filters_grad.min(), filters_grad.max()) print 'theano value range: ', (filters_conv2d_grad.min(), filters_conv2d_grad.max()) assert False
def test_grad_strided(): rng = np.random.RandomState([2012, 10, 9]) batch_size = 5 rows = 9 cols = 9 channels = 3 filter_rows = 3 filter_cols = filter_rows num_filters = 16 stride = 3 images = shared(rng.uniform( -1., 1., (channels, rows, cols, batch_size)).astype('float32'), name='images') filters = shared(rng.uniform( -1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs(stride=stride)(gpu_images, gpu_filters) output = host_from_gpu(output) images_bc01 = images.dimshuffle(3, 0, 1, 2) filters_bc01 = filters.dimshuffle(3, 0, 1, 2) filters_bc01 = filters_bc01[:, :, ::-1, ::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid', subsample=(stride, stride)) output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0) checker = function([], [output, output_conv2d]) output_numpy, output_conv2d_numpy = checker() if output_numpy.shape != output_conv2d_numpy.shape: raise AssertionError( "theano and cuda convnet follow different conventions for this input size, so we can't test cuda convnet by matching it against theano for these inputs" ) # Proper random projection, like verify_grad does. theano_rng = MRG_RandomStreams(2013 * 5 * 4) cost_weights = theano_rng.normal(size=output_conv2d.shape, dtype=output_conv2d.dtype) cost = (cost_weights * output).sum() # XXX: use verify_grad images_grad, filters_grad = grad(cost, [images, filters]) reference_cost = (cost_weights * output_conv2d).sum() images_conv2d_grad, filters_conv2d_grad = grad(reference_cost, [images, filters]) f = function( [], [images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad]) images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad = f() warnings.warn( """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. 
Probably theano CPU 64 bit is OK but it's worth checking the others.""" ) # XXX: Refactor if np.abs(images_grad - images_conv2d_grad).max() > 1.15e-5: print "=== IMAGES GRADIENT ===" assert type(images_grad) == type(images_conv2d_grad) assert images_grad.dtype == images_conv2d_grad.dtype if images_grad.shape != images_conv2d_grad.shape: print 'cuda-convnet shape: ', images_grad.shape print 'theano shape: ', images_conv2d_grad.shape assert False err = np.abs(images_grad - images_conv2d_grad) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (images_grad.min(), images_grad.max()) print 'theano value range: ', (images_conv2d_grad.min(), images_conv2d_grad.max()) assert False if np.abs(filters_grad - filters_conv2d_grad).max() > 1e-5: print "=== FILTERS GRADIENT ===" assert type(filters_grad) == type(filters_conv2d_grad) assert filters_grad.dtype == filters_conv2d_grad.dtype if filters_grad.shape != filters_conv2d_grad.shape: print 'cuda-convnet shape: ', filters_grad.shape print 'theano shape: ', filters_conv2d_grad.shape assert False err = np.abs(filters_grad - filters_conv2d_grad) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (filters_grad.min(), filters_grad.max()) print 'theano value range: ', (filters_conv2d_grad.min(), filters_conv2d_grad.max()) assert False