def __call__(self, persid):
    """Load the array designated by a persistent id, caching it by name.

    ``persid`` has the form ``"<array_type>.<name>"`` where ``name`` is the
    member of ``self.zip_file`` holding the serialized array.

    Returns the cached object when ``name`` was loaded before. Otherwise
    returns the array read from the archive: a ``numpy.ndarray``, or for
    ``array_type == "gpuarray"`` a pygpu array created in the context whose
    name is stored in the archive. When
    ``config.experimental.unpickle_gpu_on_cpu`` is set, the numpy array is
    returned instead and a warning is issued.

    Raises ``ImportError`` when a GPU array is requested but ``pygpu`` is
    falsy, and ``ValueError`` when ``persid`` contains no ``"."``.
    """
    from theano.gpuarray.type import get_context
    from theano.gpuarray import pygpu

    # Split on the first dot only, so that array names which themselves
    # contain dots do not break the two-way unpacking.
    array_type, name = persid.split(".", 1)

    if name in self.cache:
        return self.cache[name]

    if array_type == "gpuarray":
        with self.zip_file.open(name) as f:
            # NOTE(review): pickle.load executes arbitrary code; only open
            # archives that come from a trusted source.
            ctx_name = pickle.load(f)
            array = np.lib.format.read_array(f)
        if config.experimental.unpickle_gpu_on_cpu:
            # directly return numpy array
            warnings.warn(
                "config.experimental.unpickle_gpu_on_cpu is set "
                "to True. Unpickling GpuArray as numpy.ndarray"
            )
            ret = array
        elif pygpu:
            ret = pygpu.array(array, context=get_context(ctx_name))
        else:
            raise ImportError("pygpu not found. Cannot unpickle GpuArray")
    else:
        with self.zip_file.open(name) as f:
            ret = np.lib.format.read_array(f)

    self.cache[name] = ret
    return ret
def __call__(self, persid):
    """Load the array designated by a persistent id, caching it by name.

    ``persid`` has the form ``'<array_type>.<name>'`` where ``name`` is the
    member of ``self.zip_file`` holding the serialized array.

    Returns the cached object when ``name`` was loaded before. Otherwise
    returns the array read from the archive: a ``numpy.ndarray``, or for
    ``array_type == 'gpuarray'`` a pygpu array created in the context whose
    name is stored in the archive. When
    ``config.experimental.unpickle_gpu_on_cpu`` is set, the numpy array is
    returned instead and a warning is issued.

    Raises ``ImportError`` when a GPU array is requested but ``pygpu`` is
    falsy, and ``ValueError`` when ``persid`` contains no ``'.'``.
    """
    from theano.gpuarray.type import get_context
    from theano.gpuarray import pygpu

    # Split on the first dot only, so that array names which themselves
    # contain dots do not break the two-way unpacking.
    array_type, name = persid.split('.', 1)

    if name in self.cache:
        return self.cache[name]

    if array_type == 'gpuarray':
        with self.zip_file.open(name) as f:
            # NOTE(review): pickle.load executes arbitrary code; only open
            # archives that come from a trusted source.
            ctx_name = pickle.load(f)
            array = np.lib.format.read_array(f)
        if config.experimental.unpickle_gpu_on_cpu:
            # directly return numpy array
            warnings.warn("config.experimental.unpickle_gpu_on_cpu is set "
                          "to True. Unpickling GpuArray as numpy.ndarray")
            ret = array
        elif pygpu:
            ret = pygpu.array(array, context=get_context(ctx_name))
        else:
            raise ImportError("pygpu not found. Cannot unpickle GpuArray")
    else:
        with self.zip_file.open(name) as f:
            ret = np.lib.format.read_array(f)

    self.cache[name] = ret
    return ret
def perform(self, node, inputs, outputs):
    """Factorize the square matrix ``inputs[0]`` with cusolver's ``Spotrf``.

    The factor is stored in ``outputs[0][0]``. It is computed in ``A``
    itself when ``self.inplace`` is true and in a copy otherwise; the
    triangle that does not hold the factor is zeroed with ``tril``/``triu``.

    Raises ``ValueError`` if ``A`` is not square and ``LinAlgError`` if
    cusolver reports a positive ``dev_info``.
    """
    context = inputs[0][0].context

    # Input matrix.
    A = inputs[0]
    n_rows, n = A.shape
    if n_rows != n:
        raise ValueError('A must be a square matrix')
    lda = max(1, n)

    # cusolver operates on F ordered matrices, but A is expected to be
    # symmetric so the ordering does not matter. Copy A unless inplace.
    L = A if self.inplace else pygpu.array(A, copy=True)

    # Only one triangle of the output holds the factorization. If L is
    # C ordered (the Theano default) the cusolver fill mode is swapped
    # instead of transposing the data.
    if self.lower:
        fill_mode = 0
    else:
        fill_mode = 1
    if L.flags['C_CONTIGUOUS']:
        fill_mode = 1 - fill_mode

    L_ptr = L.gpudata

    with context:
        work_size = cusolver.cusolverDnSpotrf_bufferSize(
            context.cusolver_handle, fill_mode, n, L_ptr, lda)
        work = pygpu.zeros(work_size, dtype='float32', context=context)
        info = pygpu.zeros((1,), dtype='int32', context=context)

        cusolver.cusolverDnSpotrf(
            context.cusolver_handle, fill_mode, n, L_ptr, lda,
            work.gpudata, work_size, info.gpudata)

        if np.asarray(info)[0] > 0:
            raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

    # cusolver leaves the elements outside the considered triangle
    # unchanged, so they have to be zeroed explicitly.
    zero_other_triangle = tril if self.lower else triu
    zero_other_triangle(L)

    outputs[0][0] = L
def perform(self, node, inputs, outputs):
    """Factorize the square matrix ``inputs[0]`` with cusolver's ``Spotrf``.

    The factor is stored in ``outputs[0][0]``. It is computed in ``A``
    itself when ``self.inplace`` is true and in a copy otherwise; the
    triangle that does not hold the factor is zeroed with ``tril``/``triu``.

    Raises ``ValueError`` if ``A`` is not square and ``LinAlgError`` if
    cusolver reports a positive ``dev_info``.
    """
    context = inputs[0][0].context

    # Input matrix.
    A = inputs[0]
    n_rows, n = A.shape
    if n_rows != n:
        raise ValueError('A must be a square matrix')
    lda = max(1, n)

    # cusolver operates on F ordered matrices, but A is expected to be
    # symmetric so the ordering does not matter. Copy A unless inplace.
    L = A if self.inplace else pygpu.array(A, copy=True)

    # Only one triangle of the output holds the factorization. If L is
    # C ordered (the Theano default) the cusolver fill mode is swapped
    # instead of transposing the data.
    if self.lower:
        fill_mode = 0
    else:
        fill_mode = 1
    if L.flags['C_CONTIGUOUS']:
        fill_mode = 1 - fill_mode

    L_ptr = L.gpudata

    with context:
        work_size = cusolver.cusolverDnSpotrf_bufferSize(
            context.cusolver_handle, fill_mode, n, L_ptr, lda)
        work = pygpu.zeros(work_size, dtype='float32', context=context)
        info = pygpu.zeros((1,), dtype='int32', context=context)

        cusolver.cusolverDnSpotrf(
            context.cusolver_handle, fill_mode, n, L_ptr, lda,
            work.gpudata, work_size, info.gpudata)

        if np.asarray(info)[0] > 0:
            raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

    # cusolver leaves the elements outside the considered triangle
    # unchanged, so they have to be zeroed explicitly.
    zero_other_triangle = tril if self.lower else triu
    zero_other_triangle(L)

    outputs[0][0] = L
def transfer_not_contiguous(shp, dtype):
    """Round-trip a reversed (non-contiguous) host array through the GPU.

    Checks values, shape, strides and dtype after host -> GPU -> host.

    NOTE(review): ``dtype`` is accepted but never applied; the data always
    has the dtype produced by ``numpy.random.rand``.
    """
    host = (numpy.random.rand(*shp) * 10)[::-1]
    device = pygpu.array(host, context=ctx)
    back = numpy.asarray(device)

    assert numpy.allclose(back, host)
    assert host.shape == device.shape == back.shape
    # the result array (back) is C contiguous, so only its first stride
    # differs in sign from the reversed source.
    assert (host.strides == device.strides ==
            (-back.strides[0],) + back.strides[1:])
    assert host.dtype == device.dtype == back.dtype
    assert back.flags.c_contiguous
def transfer_not_contiguous(shp, dtype):
    """Round-trip a reversed (non-contiguous) host array through the GPU.

    Checks values, shape, strides and dtype after host -> GPU -> host.

    NOTE(review): ``dtype`` is accepted but never applied; the data always
    has the dtype produced by ``numpy.random.rand``.
    """
    host = (numpy.random.rand(*shp) * 10)[::-1]
    device = pygpu.array(host, context=ctx)
    back = numpy.asarray(device)

    assert numpy.allclose(back, host)
    assert host.shape == device.shape == back.shape
    # the result array (back) is C contiguous, so only its first stride
    # differs in sign from the reversed source.
    assert (host.strides == device.strides ==
            (-back.strides[0],) + back.strides[1:])
    assert host.dtype == device.dtype == back.dtype
    assert back.flags.c_contiguous
def transfer_fortran(shp, dtype):
    """Round-trip a Fortran-ordered host array through the GPU.

    Checks that shape, dtype, F-contiguity, strides and values survive
    host -> GPU -> host.

    NOTE(review): ``dtype`` is accepted but never applied; the data always
    has the dtype produced by ``numpy.random.rand``.
    """
    c_ordered = numpy.random.rand(*shp) * 10
    host = numpy.asfortranarray(c_ordered)
    if len(shp) > 1:
        # With more than one dimension the F layout must really differ.
        assert host.strides != c_ordered.strides

    device = pygpu.array(host, context=ctx)
    back = numpy.asarray(device)

    assert host.shape == device.shape == back.shape
    assert host.dtype == device.dtype == back.dtype
    assert host.flags.f_contiguous
    assert back.flags.f_contiguous
    assert host.strides == device.strides == back.strides
    assert numpy.allclose(back, host)
def perform(self, node, inputs, outputs):
    """LU-factorize the square matrix ``inputs[0]`` with cusolver's ``Sgetrf``.

    Stores the factorized matrix in ``outputs[0][0]`` and the pivot vector
    in ``outputs[1][0]``. When ``self.check_output`` is true, a positive
    ``dev_info`` raises ``LinAlgError``. A non-square ``A`` raises
    ``ValueError``.
    """
    context = inputs[0][0].context

    # Input matrix.
    A = inputs[0]
    n_rows, n = A.shape
    if n_rows != n:
        raise ValueError('A must be a square matrix')
    lda = max(1, n)

    # cusolver operates on F ordered matrices: either make an F ordered
    # copy, or (inplace) view a C ordered A through its transpose.
    if self.inplace:
        LU = A.T if A.flags['C_CONTIGUOUS'] else A
    else:
        LU = pygpu.array(A, copy=True, order='F')

    LU_ptr = LU.gpudata

    with context:
        work_size = cusolver.cusolverDnSgetrf_bufferSize(
            context.cusolver_handle, n, n, LU_ptr, lda)
        work = pygpu.zeros(work_size, dtype='float32', context=context)
        pivots = pygpu.zeros(n, dtype='int32', context=context)
        info = pygpu.zeros((1,), dtype='int32', context=context)

        cusolver.cusolverDnSgetrf(
            context.cusolver_handle, n, n, LU_ptr, lda,
            work.gpudata, pivots.gpudata, info.gpudata)

        if self.check_output and np.asarray(info)[0] > 0:
            raise LinAlgError('LU decomposition failed')

        outputs[1][0] = pivots

    outputs[0][0] = LU
def perform(self, node, inputs, outputs):
    """Solve a triangular system with cublas ``trsv``/``trsm``.

    ``inputs[0]`` is the matrix ``A`` and ``inputs[1]`` the right hand side
    ``b`` (vector or matrix). The solution is written to ``outputs[0][0]``.

    Raises ``ValueError`` for an invalid ``self.trans``, a non-square ``A``,
    misaligned operands or a dtype other than float32/float64.
    """
    ctx = node.inputs[0].type.context

    solution = outputs[0]   # Solution set
    A, b = inputs[0], inputs[1]   # Matrix and right hand side

    assert len(A.shape) == 2
    assert len(b.shape) in [1, 2]

    # The difference between C order and fortran order is handled
    # implicitly by flipping the trans and lower flags.
    lower = not self.lower
    if self.trans in ['T', 'C']:
        trans = 'N'
        l, n = A.shape
    elif self.trans == 'N':
        trans = 'T'
        n, l = A.shape
    else:
        raise ValueError('Invalid value for trans')

    if b.ndim != 2:
        (k,) = b.shape
        m = 1
    else:
        k, m = b.shape

    if l != n:
        raise ValueError('A must be a square matrix')
    if n != k:
        raise ValueError('A and b must be aligned.')

    lda, ldb = max(1, n), max(1, k)

    # solution overwrites right hand side on exit, so work on a copy
    b = pygpu.array(b, copy=True, order='F')
    A_ptr, b_ptr = A.gpudata, b.gpudata

    alpha = 1.0                      # unit scalar used for multiplication
    side = 'l'                       # matrix A is on the left of B
    uplo = 'l' if lower else 'u'     # which part of A is stored
    diag = 'n'                       # diagonal of A may not be unity

    if A.dtype == 'float32':
        trsv, trsm = cublas.cublasStrsv, cublas.cublasStrsm
    elif A.dtype == 'float64':
        trsv, trsm = cublas.cublasDtrsv, cublas.cublasDtrsm
    else:
        raise ValueError("Unsupported dtype")

    with ctx:
        if b.ndim == 1:
            # matrix vector solve
            trsv(ctx.cublas_handle, uplo, trans, diag, n,
                 A_ptr, lda, b_ptr, 1)
        else:
            trsm(ctx.cublas_handle, side, uplo, trans, diag, n, m,
                 alpha, A_ptr, lda, b_ptr, ldb)

    solution[0] = b
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8, nb_iter=0,
                    ones=False, compile_kshp=None):
    """Run one convolution on CPU and GPU and report whether they agree.

    This function is the core of several of the big unit-test drivers,
    but it can also be used very directly on its own to test a specific
    kind of convolution.

    See `test_example` (above) for an example of how to use this directly.

    :param kshape: (4d) The shape of the kernel at run time.
    :param compile_kshp: (2d) hardcode the shape of the kernel in the
        generated code. This is supposed to be faster, but we need to
        check that we raise an error if the input has the wrong shape.
    :param rtol: relative tolerance passed to ``numpy.allclose``.
    :param atol: only used for the "nb close" diagnostic count.
    :param nb_iter: number of extra GPU runs that must reproduce the first
        GPU result exactly.
    :param print_: pass ``False`` to suppress the array dump on failure.
    :returns: True if shapes match and values are close, False otherwise
        (including when ``NotImplementedError`` is raised).
    """
    def emit(*parts, **kwargs):
        # Version-agnostic replacement for the Python 2 print statement:
        # space-joined str() of the parts, written to sys.stdout.
        sys.stdout.write(' '.join(str(p) for p in parts) +
                         kwargs.get('end', '\n'))

    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)

    img = pygpu.array(npy_img)
    kern = pygpu.array(npy_kern)

    # we take the stride after the transfer as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    # Pre-bind so the failure report below never hits an unbound name
    # when NotImplementedError interrupts the computation.
    cpuval = gpuval = None
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        for _ in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            emit("ERROR: shape mismatch", end=' ')
            emit(gpuval.shape, cpuval.shape)
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError as e:
        emit('_params_allgood Failed allclose', e)
        rval = False

    if t2 is not None:
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t2 - t1)
        if verbose > 0:
            emit('%15s' % str(ishape), '%15s' % str(kshape), end=' ')
            emit('%12.5f %7.2f %7.2f %7.1f' % (
                approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)))

    if not rval:
        emit(('test_' + mode + ' id=' + str(id) +
              ' FAILED for ishape, kshape, mode, subsample,' +
              ' img_stride, kern_stride, version', ishape,
              kshape, mode, subsample, img_stride, kern_stride,
              version))
        if (cpuval is None or gpuval is None or
                cpuval.shape != gpuval.shape):
            # No element-wise comparison is possible: one side was never
            # computed or the shapes differ.
            return rval
        diff = cpuval - gpuval
        diffabs = numpy.absolute(diff)
        pr_diff = diffabs / numpy.absolute(cpuval)
        nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
        emit("max absolute diff:", (diffabs.max(), "avg abs diff:",
                                    numpy.average(diffabs)))
        emit("median abs diff:", (numpy.median(diffabs), "nb close:",
                                  nb_close, "/", diff.size))
        emit("max relatif diff:", (pr_diff.max(), "avg rel diff:",
                                   numpy.average(pr_diff)))
        # `!=` (not `is not`) is kept on purpose: 0 also disables the dump.
        if print_ != False:  # noqa: E712
            if npy_img.shape[0] > 5:
                emit("img", npy_img[0])
                emit("kern", npy_kern[0])
                emit("gpu", gpuval[0][0])
                emit("cpu", cpuval[0][0])
                emit("diff", diff[0][0])
            else:
                emit("img", npy_img)
                emit("kern", npy_kern)
                emit("gpu", gpuval)
                emit("cpu", cpuval)
                emit("diff", diff)

    return rval
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8, nb_iter=0,
                    ones=False, compile_kshp=None):
    """Run one convolution on CPU and GPU and compare the results.

    This function is the core of several of the big unit-test drivers,
    but it can also be used very directly on its own to test a specific
    kind of convolution.

    See `test_example` (above) for an example of how to use this directly.

    :param kshape: (4d) The shape of the kernel at run time.
    :param compile_kshp: (2d) hardcode the shape of the kernel in the
        generated code. This is supposed to be faster, but we need to
        check that we raise an error if the input has the wrong shape.
    :param rtol: relative tolerance passed to ``numpy.allclose``.
    :param nb_iter: number of extra GPU runs that must reproduce the first
        GPU result exactly.

    NOTE(review): as visible here the function ends after the comparison
    without a ``return`` statement, so callers receive ``None``.
    """
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)

    img = pygpu.array(npy_img)
    kern = pygpu.array(npy_kern)

    # we take the stride after the transfer as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            # sys.stdout.write is used instead of the Python 2 print
            # statement so the block parses on Python 2 and Python 3.
            sys.stdout.write("ERROR: shape mismatch %s %s\n"
                             % (gpuval.shape, cpuval.shape))
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError as e:
        sys.stdout.write('_params_allgood Failed allclose %s\n' % (e,))
        rval = False
def perform(self, node, inputs, outputs):
    """Solve a linear system with cusolver and store it in ``outputs[0][0]``.

    ``inputs[0]`` is the matrix ``A`` and ``inputs[1]`` the 2-d right hand
    side ``b``. With ``self.A_structure == 'symmetric'`` the
    ``potrf``/``potrs`` pair is used, otherwise ``getrf``/``getrs``.

    Raises ``ValueError`` for an invalid ``self.trans``, a non-square ``A``,
    misaligned operands or a dtype other than float32/float64.
    """
    context = inputs[0][0].context

    z = outputs[0]          # receives the solution
    A = inputs[0]           # Matrix.
    b = inputs[1]           # Solution vectors.

    assert len(A.shape) == 2
    assert len(b.shape) == 2

    if self.trans in ['T', 'C']:
        trans = 1
        l, n = A.shape
    elif self.trans == 'N':
        trans = 0
        n, l = A.shape
    else:
        raise ValueError('Invalid value for trans')
    k, m = b.shape

    if l != n:
        raise ValueError('A must be a square matrix')
    if n != k:
        raise ValueError('A and b must be aligned.')

    lda, ldb = max(1, n), max(1, k)

    # We copy A and b as cusolver operates inplace
    b = pygpu.array(b, copy=True, order='F')
    if not self.inplace:
        A = pygpu.array(A, copy=True)
    A_ptr, b_ptr = A.gpudata, b.gpudata

    # cusolver expects a F ordered matrix, but A is not explicitly
    # converted between C and F order, instead we switch the
    # "transpose" flag.
    if A.flags['C_CONTIGUOUS']:
        trans = 1 - trans

    if A.dtype == 'float32':
        potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
        potrf = cusolver.cusolverDnSpotrf
        potrs = cusolverDnSpotrs
        getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
        getrf = cusolver.cusolverDnSgetrf
        getrs = cusolver.cusolverDnSgetrs
    elif A.dtype == 'float64':
        potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
        potrf = cusolver.cusolverDnDpotrf
        potrs = cusolverDnDpotrs
        getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
        getrf = cusolver.cusolverDnDgetrf
        getrs = cusolver.cusolverDnDgetrs
    else:
        raise ValueError("Unsupported dtype")

    if self.A_structure == 'symmetric':
        with context:
            work_size = potrf_bufferSize(
                context.cusolver_handle, 0, n, A_ptr, lda)

        work = pygpu.zeros(work_size, dtype=A.dtype, context=context)
        dev_info = pygpu.zeros((1,), dtype='int32', context=context)
        work_ptr, info_ptr = work.gpudata, dev_info.gpudata

        with context:
            potrf(context.cusolver_handle, 0, n, A_ptr, lda,
                  work_ptr, work_size, info_ptr)
            self.check_dev_info(dev_info)
            potrs(context.cusolver_handle, 0, n, m, A_ptr, lda,
                  b_ptr, ldb, info_ptr)
    else:
        # general case for A
        with context:
            work_size = getrf_bufferSize(
                context.cusolver_handle, n, n, A_ptr, lda)

        work = pygpu.zeros(work_size, dtype=A.dtype, context=context)
        pivots = pygpu.zeros(n, dtype='int32', context=context)
        dev_info = pygpu.zeros((1,), dtype='int32', context=context)
        work_ptr = work.gpudata
        pivots_ptr = pivots.gpudata
        info_ptr = dev_info.gpudata

        with context:
            getrf(context.cusolver_handle, n, n, A_ptr, lda,
                  work_ptr, pivots_ptr, info_ptr)
            self.check_dev_info(dev_info)
            getrs(context.cusolver_handle, trans, n, m, A_ptr, lda,
                  pivots_ptr, b_ptr, ldb, info_ptr)

    z[0] = b
def run_noncontiguous_triu(self):
    """Call ``triu`` on a GPU array that is neither C nor F contiguous."""
    host = numpy.random.rand(5, 5)
    # Reversing the first axis yields a non-contiguous view.
    device = pygpu.array(host, context=context)[::-1]
    assert device.flags.c_contiguous is False
    assert device.flags.f_contiguous is False
    triu(device)
def filter_inplace(self, data, old_data, strict=False, allow_downcast=None):
    """Validate/convert ``data`` for this type, reusing ``old_data`` if possible.

    Returns a ``GpuArray``. Host data is written into ``old_data`` when
    that buffer exists, has the same shape and is contiguous; otherwise a
    new GPU array is created in ``self.context``.

    Raises ``TypeError`` when ``data`` does not satisfy the type (wrong
    class/typecode/context in strict mode, precision loss, wrong number
    of dimensions, or a non-unit broadcastable dimension).
    """
    GpuArray = gpuarray.GpuArray

    if isinstance(data, GpuArray) and data.typecode == self.typecode:
        # Already a GPU array of the right typecode: skip every
        # conversion branch below.
        pass
    elif strict:
        if not isinstance(data, GpuArray):
            raise TypeError("%s expected a GpuArray object." % self,
                            data, type(data))
        if self.typecode != data.typecode:
            raise TypeError("%s expected typecode %d (dtype %s), "
                            "got %d (dtype %s)." %
                            (self, self.typecode, self.dtype,
                             data.typecode, str(data.dtype)))
        if self.context != data.context:
            raise TypeError("data context does not match type context")
        # fallthrough to ndim check
    elif (allow_downcast or
          (allow_downcast is None and
           type(data) == float and
           self.dtype == config.floatX)):
        min_dims = len(self.broadcastable)
        if isinstance(data, GpuArray):
            data = gpuarray.array(data, dtype=self.typecode, copy=False,
                                  ndmin=min_dims, context=self.context)
        else:
            data = np.array(data, dtype=self.dtype, copy=False,
                            ndmin=min_dims)
    else:
        if not hasattr(data, 'dtype'):
            converted_data = theano._asarray(data, self.dtype)
            # We use the `values_eq` static function from TensorType
            # to handle NaN values.
            if TensorType.values_eq(np.asarray(data),
                                    converted_data,
                                    force_same_dtype=False):
                data = converted_data

        up_dtype = scalar.upcast(self.dtype, data.dtype)
        if up_dtype != self.dtype:
            raise TypeError("%s cannot store a value of dtype %s "
                            "without risking loss of precision." %
                            (self, data.dtype))
        if isinstance(data, GpuArray):
            data = gpuarray.array(data, dtype=self.dtype, copy=False)
        else:
            data = np.array(data, dtype=self.dtype, copy=False)

    if self.ndim != data.ndim:
        raise TypeError("Wrong number of dimensions: expected %s, "
                        "got %s with shape %s." % (self.ndim, data.ndim,
                                                   data.shape), data)

    shp = data.shape
    if any(bcast and shp[axis] != 1
           for axis, bcast in enumerate(self.broadcastable)):
        raise TypeError("Non-unit value on shape on a broadcastable"
                        " dimension.", shp, self.broadcastable)

    if isinstance(data, GpuArray):
        return data

    # write() only works if the destination is contiguous.
    can_reuse = (old_data is not None and
                 old_data.shape == data.shape and
                 (old_data.flags['C_CONTIGUOUS'] or
                  old_data.flags['F_CONTIGUOUS']))
    if can_reuse:
        old_data.write(data)
        return old_data
    return pygpu.array(data, context=self.context)
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8, nb_iter=0,
                    ones=False, compile_kshp=None):
    """Run one convolution on CPU and GPU and report whether they agree.

    This function is the core of several of the big unit-test drivers,
    but it can also be used very directly on its own to test a specific
    kind of convolution.

    See `test_example` (above) for an example of how to use this directly.

    :param kshape: (4d) The shape of the kernel at run time.
    :param compile_kshp: (2d) hardcode the shape of the kernel in the
        generated code. This is supposed to be faster, but we need to
        check that we raise an error if the input has the wrong shape.
    :param rtol: relative tolerance passed to ``numpy.allclose``.
    :param atol: only used for the "nb close" diagnostic count.
    :param nb_iter: number of extra GPU runs that must reproduce the first
        GPU result exactly.
    :param print_: pass ``False`` to suppress the array dump on failure.
    :returns: True if shapes match and values are close, False otherwise
        (including when ``NotImplementedError`` is raised).
    """
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(
            theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape),
                            dtype='float32') + 1)

    img = pygpu.array(npy_img)
    kern = pygpu.array(npy_kern)

    # we take the stride after the transfer as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    # Pre-bind so the failure report below never hits an unbound name
    # when NotImplementedError interrupts the computation.
    cpuval = gpuval = None
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        # `_` avoids rebinding the symbolic input `i` defined above.
        for _ in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print("ERROR: shape mismatch", end=' ', file=sys.stdout)
            print(gpuval.shape, cpuval.shape, file=sys.stdout)
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError as e:
        print('_params_allgood Failed allclose', e, file=sys.stdout)
        rval = False

    if t2 is not None:
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t2 - t1)
        if verbose > 0:
            print('%15s' % str(ishape), '%15s' % str(kshape),
                  end=' ', file=sys.stdout)
            print('%12.5f %7.2f %7.2f %7.1f' % (
                approx_fp, cpu_mflops, gpu_mflops,
                (t1 - t0) / (t2 - t1)), file=sys.stdout)

    if not rval:
        print(('test_' + mode + ' id=' + str(id) +
               ' FAILED for ishape, kshape, mode, subsample,' +
               ' img_stride, kern_stride, version', ishape,
               kshape, mode, subsample, img_stride, kern_stride,
               version), file=sys.stdout)
        if (cpuval is None or gpuval is None or
                cpuval.shape != gpuval.shape):
            # No element-wise comparison is possible: one side was never
            # computed or the shapes differ.
            return rval
        diff = cpuval - gpuval
        diffabs = numpy.absolute(diff)
        pr_diff = diffabs / numpy.absolute(cpuval)
        nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
        print("max absolute diff:", (diffabs.max(), "avg abs diff:",
                                     numpy.average(diffabs)))
        print("median abs diff:", (numpy.median(diffabs), "nb close:",
                                   nb_close, "/", diff.size))
        print("max relatif diff:", (pr_diff.max(), "avg rel diff:",
                                    numpy.average(pr_diff)))
        # `!=` (not `is not`) is kept on purpose: 0 also disables the dump.
        if print_ != False:  # noqa: E712
            if npy_img.shape[0] > 5:
                print("img", npy_img[0])
                print("kern", npy_kern[0])
                print("gpu", gpuval[0][0])
                print("cpu", cpuval[0][0])
                print("diff", diff[0][0])
            else:
                print("img", npy_img)
                print("kern", npy_kern)
                print("gpu", gpuval)
                print("cpu", cpuval)
                print("diff", diff)

    return rval
def random_array(dtype):
    """Return a length-10 random array of the given ``dtype``.

    Booleans are drawn from {0, 1}, integers from [0, 10) and floats
    uniformly from [-4, 4). Any other dtype raises ``ValueError``.
    """
    dtype = np.dtype(dtype)
    if dtype == bool:
        return np.random.randint(low=0, high=2, size=10).astype(bool)
    # np.issubdtype replaces np.issubsctype, which was removed in
    # NumPy 2.0; np.integer covers both signed and unsigned integers.
    if np.issubdtype(dtype, np.integer):
        return np.random.randint(low=0, high=10, size=10).astype(dtype)
    if np.issubdtype(dtype, np.floating):
        return np.random.uniform(low=-4, high=4, size=10).astype(dtype)
    raise ValueError('unable to handle dtype {}'.format(dtype))


# NOTE(review): the random inputs are immediately overwritten by the fixed
# special-value inputs below; confirm which of the two is intended.
x_npy = random_array(dtype)
x_npy = np.array([0, 1, -1, np.inf, -np.inf, np.nan], dtype=dtype)
x_pygpu = pygpu.array(x_npy, dtype=dtype)
x_pygpu = pygpu.array([0, 1, -1, np.inf, -np.inf, np.nan], dtype=dtype)

# Evaluate the same ufunc with numpy and with pygpu and print both results.
ufunc_npy = getattr(np, ufunc)
ufunc_pygpu = getattr(pygpu.ufuncs, ufunc)
res_npy = ufunc_npy(x_npy)
res_pygpu = ufunc_pygpu(x_pygpu)

print('=== testing ufunc {} for dtype {} ==='.format(ufunc, dtype))
print('x =', x_npy)
print('npy: {}(x) ='.format(ufunc))
print(res_npy)
print('pygpu: {}(x) ='.format(ufunc))
print(res_pygpu)
def random_array(dtype):
    """Return a length-10 random array of the given ``dtype``.

    Booleans are drawn from {0, 1}, integers from [0, 10) and floats
    uniformly from [-4, 4). Any other dtype raises ``ValueError``.
    """
    dtype = np.dtype(dtype)
    if dtype == bool:
        return np.random.randint(low=0, high=2, size=10).astype(bool)
    # np.issubdtype replaces np.issubsctype, which was removed in
    # NumPy 2.0; np.integer covers both signed and unsigned integers.
    if np.issubdtype(dtype, np.integer):
        return np.random.randint(low=0, high=10, size=10).astype(dtype)
    if np.issubdtype(dtype, np.floating):
        return np.random.uniform(low=-4, high=4, size=10).astype(dtype)
    raise ValueError('unable to handle dtype {}'.format(dtype))


x_npy = random_array(dtype)
x_pygpu = pygpu.array(x_npy, dtype=dtype)

# Reduce with the same ufunc in numpy and in pygpu and print both results.
ufunc_npy = getattr(np, ufunc)
ufunc_pygpu = getattr(pygpu.ufuncs, ufunc)
res_npy = ufunc_npy.reduce(x_npy, axis=axis, keepdims=keepdims)
res_pygpu = ufunc_pygpu.reduce(x_pygpu, axis=axis, keepdims=keepdims)

print('=== testing reduce of ufunc {} for dtype {} ==='
      ''.format(ufunc, dtype))
print('x =', x_npy)
print('npy: {}.reduce(x) ='.format(ufunc))
print(res_npy)
print('pygpu: {}.reduce(x) ='.format(ufunc))
print(res_pygpu)
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8, nb_iter=0,
                    ones=False, compile_kshp=None):
    """Run one convolution on CPU and GPU and compare the results.

    This function is the core of several of the big unit-test drivers,
    but it can also be used very directly on its own to test a specific
    kind of convolution.

    See `test_example` (above) for an example of how to use this directly.

    :param kshape: (4d) The shape of the kernel at run time.
    :param compile_kshp: (2d) hardcode the shape of the kernel in the
        generated code. This is supposed to be faster, but we need to
        check that we raise an error if the input has the wrong shape.
    :param rtol: relative tolerance passed to ``numpy.allclose``.
    :param nb_iter: number of extra GPU runs that must reproduce the first
        GPU result exactly.

    NOTE(review): as visible here the function ends after the comparison
    without a ``return`` statement, so callers receive ``None``.
    """
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)

    img = pygpu.array(npy_img)
    kern = pygpu.array(npy_kern)

    # we take the stride after the transfer as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            # sys.stdout.write is used instead of the Python 2 print
            # statement so the block parses on Python 2 and Python 3.
            sys.stdout.write("ERROR: shape mismatch %s %s\n"
                             % (gpuval.shape, cpuval.shape))
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError as e:
        sys.stdout.write('_params_allgood Failed allclose %s\n' % (e,))
        rval = False
def filter_inplace(self, data, old_data, strict=False, allow_downcast=None):
    """Validate/convert ``data`` for this type, reusing ``old_data`` if possible.

    Returns a ``GpuArray``. Host data is written into ``old_data`` when
    that buffer exists, has the same shape and is contiguous; otherwise a
    new GPU array is created in ``self.context``.

    Raises ``TypeError`` when ``data`` does not satisfy the type (wrong
    class/typecode/context in strict mode, precision loss, wrong number
    of dimensions, or a non-unit broadcastable dimension).
    """
    if (isinstance(data, gpuarray.GpuArray) and
            data.typecode == self.typecode):
        # This is just to make this condition not enter the
        # following branches
        pass
    elif strict:
        if not isinstance(data, gpuarray.GpuArray):
            raise TypeError("%s expected a GpuArray object." % self,
                            data, type(data))
        if self.typecode != data.typecode:
            raise TypeError("%s expected typecode %d (dtype %s), "
                            "got %d (dtype %s)." %
                            (self, self.typecode, self.dtype,
                             data.typecode, str(data.dtype)))
        if self.context != data.context:
            raise TypeError("data context does not match type context")
        # fallthrough to ndim check
    elif (allow_downcast or
          (allow_downcast is None and
           type(data) == float and
           self.dtype == config.floatX)):
        if not isinstance(data, gpuarray.GpuArray):
            data = np.array(data, dtype=self.dtype, copy=False,
                            ndmin=len(self.broadcastable))
        else:
            data = gpuarray.array(data, dtype=self.typecode, copy=False,
                                  ndmin=len(self.broadcastable),
                                  context=self.context)
    else:
        if not hasattr(data, 'dtype'):
            converted_data = theano._asarray(data, self.dtype)
            # We use the `values_eq` static function from TensorType
            # to handle NaN values.
            if TensorType.values_eq(np.asarray(data),
                                    converted_data,
                                    force_same_dtype=False):
                data = converted_data

        up_dtype = scalar.upcast(self.dtype, data.dtype)
        if up_dtype == self.dtype:
            if not isinstance(data, gpuarray.GpuArray):
                data = np.array(data, dtype=self.dtype, copy=False)
            else:
                data = gpuarray.array(data, dtype=self.dtype, copy=False)
        else:
            raise TypeError("%s cannot store a value of dtype %s "
                            "without risking loss of precision." %
                            (self, data.dtype))

    if self.ndim != data.ndim:
        raise TypeError(
            "Wrong number of dimensions: expected %s, "
            "got %s with shape %s." % (self.ndim, data.ndim, data.shape),
            data)
    shp = data.shape
    for i, b in enumerate(self.broadcastable):
        if b and shp[i] != 1:
            raise TypeError(
                "Non-unit value on shape on a broadcastable"
                " dimension.", shp, self.broadcastable)

    if not isinstance(data, gpuarray.GpuArray):
        if old_data is not None and old_data.shape == data.shape and (
                # write() only works if the destination is contiguous;
                # otherwise fall back to allocating a new GPU array.
                old_data.flags['C_CONTIGUOUS'] or
                old_data.flags['F_CONTIGUOUS']):
            old_data.write(data)
            data = old_data
        else:
            data = pygpu.array(data, context=self.context)
    return data
def perform(self, node, inputs, outputs):
    """Solve a triangular system with cublas ``trsv``/``trsm``.

    ``inputs[0]`` is the matrix ``A`` and ``inputs[1]`` the right hand side
    ``b`` (vector or matrix). The solution is written to ``outputs[0][0]``.

    Raises ``ValueError`` for an invalid ``self.trans``, a non-square ``A``,
    misaligned operands or a dtype other than float32/float64.
    """
    ctx = node.inputs[0].type.context

    solution = outputs[0]   # Solution set
    A, b = inputs[0], inputs[1]   # Matrix and right hand side

    assert len(A.shape) == 2
    assert len(b.shape) in [1, 2]

    # The difference between C order and fortran order is handled
    # implicitly by flipping the trans and lower flags.
    lower = not self.lower
    if self.trans in ["T", "C"]:
        trans = "N"
        l, n = A.shape
    elif self.trans == "N":
        trans = "T"
        n, l = A.shape
    else:
        raise ValueError("Invalid value for trans")

    if b.ndim != 2:
        (k,) = b.shape
        m = 1
    else:
        k, m = b.shape

    if l != n:
        raise ValueError("A must be a square matrix")
    if n != k:
        raise ValueError("A and b must be aligned.")

    lda, ldb = max(1, n), max(1, k)

    # solution overwrites right hand side on exit, so work on a copy
    b = pygpu.array(b, copy=True, order="F")
    A_ptr, b_ptr = A.gpudata, b.gpudata

    alpha = 1.0                      # unit scalar used for multiplication
    side = "l"                       # matrix A is on the left of B
    uplo = "l" if lower else "u"     # which part of A is stored
    diag = "n"                       # diagonal of A may not be unity

    if A.dtype == "float32":
        trsv, trsm = cublas.cublasStrsv, cublas.cublasStrsm
    elif A.dtype == "float64":
        trsv, trsm = cublas.cublasDtrsv, cublas.cublasDtrsm
    else:
        raise ValueError("Unsupported dtype")

    with ctx:
        if b.ndim == 1:
            # matrix vector solve
            trsv(ctx.cublas_handle, uplo, trans, diag, n,
                 A_ptr, lda, b_ptr, 1)
        else:
            trsm(ctx.cublas_handle, side, uplo, trans, diag, n, m,
                 alpha, A_ptr, lda, b_ptr, ldb)

    solution[0] = b
def perform(self, node, inputs, outputs):
    """Solve a linear system on the GPU with cuSOLVER.

    `inputs[0]` is the matrix A and `inputs[1]` the 2-d right-hand side
    b.  When `self.A_structure` is "symmetric" the potrf/potrs pair is
    used, otherwise getrf/getrs.  b is copied in Fortran order, the
    solve overwrites that copy, and the copy is stored in
    `outputs[0][0]`.  A is copied as well unless `self.inplace` is set.

    Raises ValueError for an unknown `self.trans`, a non-square A,
    mismatched A/b sizes, or a dtype other than float32/float64.
    """
    gpu_ctx = inputs[0][0].context

    out_cell = outputs[0]
    A = inputs[0]
    b = inputs[1]

    assert len(A.shape) == 2
    assert len(b.shape) == 2

    requested = self.trans
    if requested in ("T", "C"):
        op_flag = 1
        l, n = A.shape
    elif requested == "N":
        op_flag = 0
        n, l = A.shape
    else:
        raise ValueError("Invalid value for trans")
    k, m = b.shape

    if l != n:
        raise ValueError("A must be a square matrix")
    if n != k:
        raise ValueError("A and b must be aligned.")

    lda = max(1, n)
    ldb = max(1, k)

    # cusolver operates in place, so b is always copied and A is copied
    # unless the op was built as inplace.
    b = pygpu.array(b, copy=True, order="F")
    if not self.inplace:
        A = pygpu.array(A, copy=True)
    A_ptr = A.gpudata
    b_ptr = b.gpudata

    # cusolver expects an F ordered matrix.  A is not converted; the
    # "transpose" flag is toggled instead.
    if A.flags["C_CONTIGUOUS"]:
        op_flag = 1 - op_flag

    # Bind the routines matching the precision of A.
    if A.dtype == "float32":
        potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
        potrf = cusolver.cusolverDnSpotrf
        potrs = cusolverDnSpotrs
        getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
        getrf = cusolver.cusolverDnSgetrf
        getrs = cusolver.cusolverDnSgetrs
    elif A.dtype == "float64":
        potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
        potrf = cusolver.cusolverDnDpotrf
        potrs = cusolverDnDpotrs
        getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
        getrf = cusolver.cusolverDnDgetrf
        getrs = cusolver.cusolverDnDgetrs
    else:
        raise ValueError("Unsupported dtype")

    if self.A_structure == "symmetric":
        with gpu_ctx:
            scratch_len = potrf_bufferSize(
                gpu_ctx.cusolver_handle, 0, n, A_ptr, lda)

        scratch = pygpu.zeros(scratch_len, dtype=A.dtype, context=gpu_ctx)
        status = pygpu.zeros((1, ), dtype="int32", context=gpu_ctx)

        scratch_ptr = scratch.gpudata
        status_ptr = status.gpudata

        with gpu_ctx:
            potrf(gpu_ctx.cusolver_handle, 0, n, A_ptr, lda,
                  scratch_ptr, scratch_len, status_ptr)
            self.check_dev_info(status)
            potrs(gpu_ctx.cusolver_handle, 0, n, m, A_ptr, lda,
                  b_ptr, ldb, status_ptr)
    else:
        # general case for A
        with gpu_ctx:
            scratch_len = getrf_bufferSize(
                gpu_ctx.cusolver_handle, n, n, A_ptr, lda)

        scratch = pygpu.zeros(scratch_len, dtype=A.dtype, context=gpu_ctx)
        pivots = pygpu.zeros(n, dtype="int32", context=gpu_ctx)
        status = pygpu.zeros((1, ), dtype="int32", context=gpu_ctx)

        scratch_ptr = scratch.gpudata
        pivots_ptr = pivots.gpudata
        status_ptr = status.gpudata

        with gpu_ctx:
            getrf(gpu_ctx.cusolver_handle, n, n, A_ptr, lda,
                  scratch_ptr, pivots_ptr, status_ptr)
            self.check_dev_info(status)
            getrs(gpu_ctx.cusolver_handle, op_flag, n, m, A_ptr, lda,
                  pivots_ptr, b_ptr, ldb, status_ptr)

    out_cell[0] = b
def perform(self, node, inputs, outputs):
    """Solve a triangular system on the GPU with cuBLAS trsv/trsm.

    `inputs[0]` is the matrix A and `inputs[1]` the right-hand side b
    (a vector or a matrix).  b is copied in Fortran order, the solve
    overwrites that copy, and the copy is stored in `outputs[0][0]`.

    The cuBLAS routine is chosen from the dtype of A (float32 or
    float64) rather than always using the single-precision one.

    Raises ValueError for an unknown `self.trans`, a non-square A,
    mismatched A/b sizes, or an unsupported dtype.
    """
    ctx = node.inputs[0].type.context

    # Solution set
    x = outputs[0]
    # Matrix.
    A = inputs[0]
    # right hand side
    b = inputs[1]

    assert (len(A.shape) == 2)
    assert (len(b.shape) in [1, 2])

    # implicitly deal with the difference between C order
    # and fortran order by flipping the trans and lower flags
    lower = not self.lower
    trans = self.trans
    if trans in ['T', 'C']:
        trans = 'N'
        l, n = A.shape
    elif trans == 'N':
        trans = 'T'
        n, l = A.shape
    else:
        raise ValueError('Invalid value for trans')

    if b.ndim == 2:
        k, m = b.shape
    else:
        k, = b.shape
        m = 1

    if l != n:
        raise ValueError('A must be a square matrix')
    if n != k:
        raise ValueError('A and b must be aligned.')

    # Select the routines matching the precision of A.  Done before the
    # copy of b so an unsupported dtype does not allocate GPU memory.
    if A.dtype == 'float32':
        trsv = cublas.cublasStrsv
        trsm = cublas.cublasStrsm
    elif A.dtype == 'float64':
        trsv = cublas.cublasDtrsv
        trsm = cublas.cublasDtrsm
    else:
        raise ValueError('Unsupported dtype')

    lda = max(1, n)
    ldb = max(1, k)

    # solution overwrites right hand side on exit
    b = pygpu.array(b, copy=True, order='F')

    A_ptr = A.gpudata
    b_ptr = b.gpudata

    # unit scalar used for multiplication
    alpha = 1.0
    # indicates matrix A is on left of B
    side = 'l'
    # set whether upper or lower part of matrix A stored
    uplo = 'l' if lower else 'u'
    # indicates elements on diagonal of matrix A may not be unity
    diag = 'n'

    with ctx:
        if b.ndim == 1:
            # matrix vector solve
            trsv(ctx.cublas_handle, uplo, trans, diag, n,
                 A_ptr, lda, b_ptr, 1)
        else:
            trsm(ctx.cublas_handle, side, uplo, trans, diag,
                 n, m, alpha, A_ptr, lda, b_ptr, ldb)

    x[0] = b
def perform(self, node, inputs, outputs):
    """Solve a linear system on the GPU with cuSOLVER.

    `inputs[0]` is the matrix A and `inputs[1]` the 2-d right-hand side
    b.  When `self.A_structure` is 'symmetric' the potrf/potrs pair is
    used, otherwise getrf/getrs.  b is copied in Fortran order, the
    solve overwrites that copy, and the copy is stored in
    `outputs[0][0]`.  A is copied as well unless `self.inplace` is set.

    The cuSOLVER routines and the workspace dtype follow the dtype of A
    (float32 or float64) rather than always being single precision.

    Raises ValueError for an unknown `self.trans`, a non-square A,
    mismatched A/b sizes, or an unsupported dtype.
    """
    context = inputs[0][0].context

    # Output storage cell.
    z = outputs[0]
    # Matrix.
    A = inputs[0]
    # Solution vectors.
    b = inputs[1]

    assert (len(A.shape) == 2)
    assert (len(b.shape) == 2)

    if self.trans in ['T', 'C']:
        trans = 1
        l, n = A.shape
        k, m = b.shape
    elif self.trans == 'N':
        trans = 0
        n, l = A.shape
        k, m = b.shape
    else:
        raise ValueError('Invalid value for trans')
    if l != n:
        raise ValueError('A must be a square matrix')
    if n != k:
        raise ValueError('A and b must be aligned.')

    # Select the routines matching the precision of A.  Done before any
    # copy so an unsupported dtype does not allocate GPU memory.
    # NOTE(review): the potrs wrappers are referenced as module-level
    # names, as the original did for cusolverDnSpotrs; confirm that
    # cusolverDnDpotrs is defined alongside it.
    if A.dtype == 'float32':
        potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
        potrf = cusolver.cusolverDnSpotrf
        potrs = cusolverDnSpotrs
        getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
        getrf = cusolver.cusolverDnSgetrf
        getrs = cusolver.cusolverDnSgetrs
    elif A.dtype == 'float64':
        potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
        potrf = cusolver.cusolverDnDpotrf
        potrs = cusolverDnDpotrs
        getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
        getrf = cusolver.cusolverDnDgetrf
        getrs = cusolver.cusolverDnDgetrs
    else:
        raise ValueError('Unsupported dtype')

    lda = max(1, n)
    ldb = max(1, k)

    # We copy A and b as cusolver operates inplace
    b = pygpu.array(b, copy=True, order='F')
    if not self.inplace:
        A = pygpu.array(A, copy=True)
    A_ptr = A.gpudata
    b_ptr = b.gpudata

    # cusolver expects a F ordered matrix, but A is not explicitly
    # converted between C and F order, instead we switch the
    # "transpose" flag.
    if A.flags['C_CONTIGUOUS']:
        trans = 1 - trans

    if self.A_structure == 'symmetric':
        with context:
            workspace_size = potrf_bufferSize(
                context.cusolver_handle, 0, n, A_ptr, lda)

        # The workspace must have the same element type as A.
        workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                context=context)

        dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

        workspace_ptr = workspace.gpudata
        dev_info_ptr = dev_info.gpudata

        with context:
            potrf(context.cusolver_handle, 0, n, A_ptr, lda,
                  workspace_ptr, workspace_size, dev_info_ptr)
            self.check_dev_info(dev_info)

            potrs(context.cusolver_handle, 0, n, m, A_ptr, lda,
                  b_ptr, ldb, dev_info_ptr)

    else:
        # general case for A
        with context:
            workspace_size = getrf_bufferSize(
                context.cusolver_handle, n, n, A_ptr, lda)

        # The workspace must have the same element type as A.
        workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                context=context)

        pivots = pygpu.zeros(n, dtype='int32', context=context)

        dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

        workspace_ptr = workspace.gpudata
        pivots_ptr = pivots.gpudata
        dev_info_ptr = dev_info.gpudata

        with context:
            getrf(context.cusolver_handle, n, n, A_ptr, lda,
                  workspace_ptr, pivots_ptr, dev_info_ptr)
            self.check_dev_info(dev_info)

            getrs(context.cusolver_handle, trans, n, m, A_ptr, lda,
                  pivots_ptr, b_ptr, ldb, dev_info_ptr)

    z[0] = b
def filter_inplace(self, data, old_data, strict=False, allow_downcast=None):
    """Validate `data` against this type and return it as a GpuArray.

    A GpuArray whose typecode already matches this type skips every
    conversion branch and goes straight to the shape checks.

    Parameters
    ----------
    data
        Value to store: a GpuArray, or anything NumPy can turn into an
        array.
    old_data
        Previously stored value, or None.  When it has the same shape as
        the converted host data and is contiguous, it is overwritten in
        place and returned instead of allocating a new array.
    strict
        If true, a value that is not already a GpuArray with the right
        typecode is rejected with TypeError.
    allow_downcast
        If true, cast to `self.dtype` unconditionally.  If None, only a
        value that is exactly a Python float is cast, and only when
        `self.dtype` equals `config.floatX`.  Otherwise the value is
        accepted only if storing it in `self.dtype` loses no precision.

    Returns
    -------
    The GpuArray holding the value.

    Raises
    ------
    TypeError
        On a wrong container type, typecode or context (strict mode), a
        lossy conversion, a wrong number of dimensions, or a non-unit
        length on a broadcastable dimension.
    """
    if isinstance(data, gpuarray.GpuArray) and data.typecode == self.typecode:
        # This is just to make this condition not enter the
        # following branches
        pass
    elif strict:
        if not isinstance(data, gpuarray.GpuArray):
            raise TypeError(f"{self} expected a GpuArray object.", data, type(data))
        if self.typecode != data.typecode:
            raise TypeError(
                f"{self} expected typecode {int(self.typecode)} (dtype {self.dtype}), "
                f"got {int(data.typecode)} (dtype {data.dtype}).")
        if self.context != data.context:
            raise TypeError("data context does not match type context")
        # fallthrough to ndim check
    elif allow_downcast or (allow_downcast is None and
                            type(data) == float and
                            self.dtype == config.floatX):
        if not isinstance(data, gpuarray.GpuArray):
            data = np.array(data, dtype=self.dtype, copy=False,
                            ndmin=len(self.broadcastable))
        else:
            data = gpuarray.array(
                data,
                dtype=self.typecode,
                copy=False,
                ndmin=len(self.broadcastable),
                context=self.context,
            )
    else:
        if not hasattr(data, "dtype"):
            converted_data = _asarray(data, self.dtype)
            host_data = np.asarray(data)
            # We use the `values_eq` static function from TensorType
            # to handle NaN values.
            if not TensorType.values_eq(host_data, converted_data,
                                        force_same_dtype=False):
                # Without this, `data` would still lack a `dtype` and the
                # upcast check below would fail with AttributeError.
                raise TypeError(
                    f"{self} cannot store a value of dtype {host_data.dtype} "
                    "without risking loss of precision.")
            data = converted_data

        up_dtype = scalar.upcast(self.dtype, data.dtype)
        if up_dtype == self.dtype:
            if not isinstance(data, gpuarray.GpuArray):
                data = np.array(data, dtype=self.dtype, copy=False)
            else:
                data = gpuarray.array(data, dtype=self.dtype, copy=False)
        else:
            raise TypeError(
                f"{self} cannot store a value of dtype {data.dtype} "
                "without risking loss of precision.")

    if self.ndim != data.ndim:
        raise TypeError(
            f"Wrong number of dimensions: expected {self.ndim}, "
            f"got {data.ndim} with shape {data.shape}.",
            data,
        )
    shp = data.shape
    for i, b in enumerate(self.broadcastable):
        if b and shp[i] != 1:
            raise TypeError(
                "Non-unit value on shape on a broadcastable"
                " dimension.",
                shp,
                self.broadcastable,
            )

    if not isinstance(data, gpuarray.GpuArray):
        if (old_data is not None and
                old_data.shape == data.shape and
                (
                    # write() only works if the destination is contiguous.
                    old_data.flags["C_CONTIGUOUS"] or
                    old_data.flags["F_CONTIGUOUS"])):
            old_data.write(data)
            data = old_data
        else:
            data = pygpu.array(data, context=self.context)
    return data
def run_noncontiguous_triu(self):
    """Call `triu` on a GPU array that is neither C nor F contiguous."""
    # Reversing the rows of a random 5x5 matrix gives the host-side
    # input for the GPU array.
    host_matrix = numpy.random.rand(5, 5)[::-1]
    gpu_matrix = pygpu.array(host_matrix, context=context)
    # Precondition of the scenario: the GPU array is contiguous in
    # neither order.
    assert (gpu_matrix.flags.c_contiguous
            is gpu_matrix.flags.f_contiguous
            is False)
    triu(gpu_matrix)