def exec_z2d(self, idata, odata):
    """Executes a double-precision complex-to-real,
    implicitly inverse, cuFFT transform plan.
    """
    err = self._lib.cufftExecZ2D(self.handle, idata, odata)
    if err:
        raise CU.error("cufftExecZ2D", err)
def exec_z2z(self, idata, odata, direction):
    """Executes a double-precision complex-to-complex cuFFT transform plan.
    """
    err = self._lib.cufftExecZ2Z(self.handle, idata, odata, direction)
    if err:
        raise CU.error("cufftExecZ2Z", err)
def exec_c2c(self, idata, odata, direction):
    """Executes a single-precision complex-to-complex cuFFT transform plan.
    """
    err = self._lib.cufftExecC2C(self.handle, idata, odata, direction)
    if err:
        raise CU.error("cufftExecC2C", err)
def __init__(self, context, rng_type=CURAND_RNG_PSEUDO_DEFAULT):
    """Constructor.

    Parameters:
        context: CUDA context handle or None to use the host generator.
        rng_type: type of the random generator.
    """
    self._context = context
    self._lib = None
    if context is not None:
        context._add_ref(self)
    initialize()
    handle = ffi.new("curandGenerator_t *")
    if context is not None:
        with context:
            err = lib.curandCreateGenerator(handle, int(rng_type))
    else:
        err = lib.curandCreateGeneratorHost(handle, int(rng_type))
    if err:
        self._handle = None
        raise CU.error(
            "curandCreateGenerator" if context is not None
            else "curandCreateGeneratorHost", err)
    self._lib = lib  # to hold the reference
    self._handle = int(handle[0])
    self._rng_type = int(rng_type)
    self._seed = 0
    self._offset = 0
    self._ordering = 0
    self._dimensions = 0
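# A minimal usage sketch (illustrative addition, not part of the original
# source): passing context=None selects the host generator, which can fill
# plain numpy arrays directly. The class name `CURAND` is an assumption
# about how the surrounding module exposes this constructor.
def _example_curand_host_uniform():
    import numpy
    rng = CURAND(None)  # host generator: no CUDA context required
    dst = numpy.zeros(1024, dtype=numpy.float32)
    rng.generate_uniform(dst)  # fills the whole array with values in (0, 1]
    return dst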
def exec_d2z(self, idata, odata):
    """Executes a double-precision real-to-complex,
    implicitly forward, cuFFT transform plan.
    """
    err = self._lib.cufftExecD2Z(self.handle, idata, odata)
    if err:
        raise CU.error("cufftExecD2Z", err)
def convolution_backward_data(
        self, alpha, filter_desc, filter_data, diff_desc, diff_data,
        conv_desc, beta, grad_desc, grad_data,
        algo=None, workspace=None, workspace_size=0):
    """Computes backpropagated error.

    Parameters:
        alpha: diff_data multiplier (numpy array with one element).
        beta: grad_data multiplier (numpy array with one element).
        filter_data: convolutional kernels.
        diff_data: error for backpropagation.
        grad_data: backpropagated error.
    """
    if self.version < 4000:
        err = self._lib.cudnnConvolutionBackwardData(
            self.handle, CU.extract_ptr(alpha), filter_desc, filter_data,
            diff_desc, diff_data, conv_desc,
            CU.extract_ptr(beta), grad_desc, grad_data)
    elif algo is None:
        err = self._lib.cudnnConvolutionBackwardData_v2(
            self.handle, CU.extract_ptr(alpha), filter_desc, filter_data,
            diff_desc, diff_data, conv_desc,
            CU.extract_ptr(beta), grad_desc, grad_data)
    else:
        err = self._lib.cudnnConvolutionBackwardData(
            self.handle, CU.extract_ptr(alpha), filter_desc, filter_data,
            diff_desc, diff_data, conv_desc,
            algo, workspace, workspace_size,
            CU.extract_ptr(beta), grad_desc, grad_data)
    if err:
        raise CU.error("cudnnConvolutionBackwardData", err)
def convolution_backward_filter(
        self, alpha, src_desc, src_data, diff_desc, diff_data,
        conv_desc, beta, grad_desc, grad_data,
        algo=None, workspace=None, workspace_size=0):
    """Computes gradient for the convolutional kernels.

    Parameters:
        alpha: src_data multiplier (numpy array with one element).
        beta: grad_data multiplier (numpy array with one element).
        src_data: input from the forward pass.
        diff_data: error for backpropagation.
        grad_data: gradient for convolutional kernels.
    """
    if self.version < 4000:
        err = self._lib.cudnnConvolutionBackwardFilter(
            self.handle, CU.extract_ptr(alpha), src_desc, src_data,
            diff_desc, diff_data, conv_desc,
            CU.extract_ptr(beta), grad_desc, grad_data)
    elif algo is None:
        err = self._lib.cudnnConvolutionBackwardFilter_v2(
            self.handle, CU.extract_ptr(alpha), src_desc, src_data,
            diff_desc, diff_data, conv_desc,
            CU.extract_ptr(beta), grad_desc, grad_data)
    else:
        err = self._lib.cudnnConvolutionBackwardFilter(
            self.handle, CU.extract_ptr(alpha), src_desc, src_data,
            diff_desc, diff_data, conv_desc,
            algo, workspace, workspace_size,
            CU.extract_ptr(beta), grad_desc, grad_data)
    if err:
        raise CU.error("cudnnConvolutionBackwardFilter", err)
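# The alpha/beta convention in the two backward calls above follows cuDNN:
# the result is grad = alpha * op(...) + beta * grad, so beta=0 overwrites
# the gradient buffer while beta=1 accumulates into it. A sketch of the
# host-side scalars (illustrative; `cudnn`, the descriptors and the device
# buffers are assumed to be set up elsewhere):
def _example_backward_filter_overwrite(cudnn, src_desc, src_data,
                                       diff_desc, diff_data, conv_desc,
                                       grad_desc, grad_data):
    import numpy
    one = numpy.ones(1, dtype=numpy.float32)    # alpha: keep the new gradient
    zero = numpy.zeros(1, dtype=numpy.float32)  # beta: overwrite grad_data
    cudnn.convolution_backward_filter(
        one, src_desc, src_data, diff_desc, diff_data, conv_desc,
        zero, grad_desc, grad_data)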
def workarea(self, value):
    """Sets workarea for plan execution.
    """
    err = self._lib.cufftSetWorkArea(self.handle, value)
    if err:
        raise CU.error("cufftSetWorkArea", err)
    self._workarea = value
def ordering(self, value):
    """Sets generator ordering.
    """
    err = self._lib.curandSetGeneratorOrdering(self.handle, int(value))
    if err:
        raise CU.error("curandSetGeneratorOrdering", err)
    self._ordering = int(value)
def offset(self, value):
    """Sets generator offset as a 64-bit integer.
    """
    err = self._lib.curandSetGeneratorOffset(self.handle, int(value))
    if err:
        raise CU.error("curandSetGeneratorOffset", err)
    self._offset = int(value)
def dimensions(self, value):
    """Sets quasirandom generator dimensions.
    """
    err = self._lib.curandSetQuasiRandomGeneratorDimensions(
        self.handle, int(value))
    if err:
        raise CU.error("curandSetQuasiRandomGeneratorDimensions", err)
    self._dimensions = int(value)
def seed(self, value):
    """Sets generator seed as a 64-bit integer.
    """
    err = self._lib.curandSetPseudoRandomGeneratorSeed(
        self.handle, int(value))
    if err:
        raise CU.error("curandSetPseudoRandomGeneratorSeed", err)
    self._seed = int(value)
def version(self):
    """Returns cuFFT version.
    """
    version = ffi.new("int *")
    err = self._lib.cufftGetVersion(version)
    if err:
        raise CU.error("cufftGetVersion", err)
    return int(version[0])
def size(self):
    """Returns actual size of the work area required to support the plan.
    """
    sz = ffi.new("size_t[]", 4)
    err = self._lib.cufftGetSize(self.handle, sz)
    if err:
        raise CU.error("cufftGetSize", err)
    return int(sz[0])
def get_pooling_2d_forward_output_dim(pooling_desc, input_desc):
    """Returns tuple of n, c, h, w for an output.
    """
    n, c, h, w = (ffi.new("int *") for _ in range(4))
    err = lib.cudnnGetPooling2dForwardOutputDim(
        pooling_desc, input_desc, n, c, h, w)
    if err:
        raise CU.error("cudnnGetPooling2dForwardOutputDim", err)
    return int(n[0]), int(c[0]), int(h[0]), int(w[0])
def make_plan_many(self, xyz, batch, fft_type,
                   inembed=None, istride=1, idist=0,
                   onembed=None, ostride=1, odist=0):
    """Makes a 1-, 2- or 3-dimensional FFT plan.

    Parameters:
        xyz: tuple of dimensions.
        batch: number of FFTs to make.
        fft_type: type of FFT (CUFFT_R2C, CUFFT_C2R etc.).
        inembed: tuple with storage dimensions of the input data in memory
                 (can be None).
        istride: distance between two successive input elements in the
                 least significant (i.e., innermost) dimension.
        idist: distance between the first element of two consecutive
               signals in a batch of the input data.
        onembed: tuple with storage dimensions of the output data in memory
                 (can be None).
        ostride: distance between two successive output elements in the
                 least significant (i.e., innermost) dimension.
        odist: distance between the first element of two consecutive
               signals in a batch of the output data.

    Will assign self.execute based on fft_type.

    Returns:
        Required work size.
    """
    rank = len(xyz)
    n = ffi.new("int[]", rank)
    n[0:rank] = xyz
    if inembed is None:
        _inembed = ffi.NULL
    else:
        _inembed = ffi.new("int[]", rank)
        _inembed[0:rank] = inembed
    if onembed is None:
        _onembed = ffi.NULL
    else:
        _onembed = ffi.new("int[]", rank)
        _onembed[0:rank] = onembed
    sz = ffi.new("size_t[]", 4)
    err = self._lib.cufftMakePlanMany(
        self.handle, rank, n, _inembed, istride, idist,
        _onembed, ostride, odist, fft_type, batch, sz)
    if err:
        raise CU.error("cufftMakePlanMany", err)
    self.execute = {
        CUFFT_R2C: self.exec_r2c,
        CUFFT_C2R: self.exec_c2r,
        CUFFT_C2C: self.exec_c2c,
        CUFFT_D2Z: self.exec_d2z,
        CUFFT_Z2D: self.exec_z2d,
        CUFFT_Z2Z: self.exec_z2z
    }.get(fft_type, self._exec_unknown)
    return int(sz[0])
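# A usage sketch for the planner above (illustrative, not from the original
# source; the class name `CUFFT` is an assumption about the surrounding
# package): a batch of tightly packed 1D real-to-complex transforms, where
# each complex output signal holds length // 2 + 1 elements.
def _example_batched_r2c_plan(context, n_batches=8, length=1024):
    fft = CUFFT(context)
    sz = fft.make_plan_many((length,), n_batches, CUFFT_R2C,
                            idist=length, odist=length // 2 + 1)
    # After this call fft.execute is bound to exec_r2c, so
    # fft.execute(idata, odata) runs the transform.
    return fft, sz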
def get_convolution_2d_forward_output_dim(conv_desc, input_desc, filter_desc):
    """Returns tuple of n, c, h, w for an output.
    """
    n, c, h, w = (ffi.new("int *") for _ in range(4))
    err = lib.cudnnGetConvolution2dForwardOutputDim(
        conv_desc, input_desc, filter_desc, n, c, h, w)
    if err:
        raise CU.error("cudnnGetConvolution2dForwardOutputDim", err)
    return int(n[0]), int(c[0]), int(h[0]), int(w[0])
def set_pointer_mode(self, mode=CUBLAS_POINTER_MODE_DEVICE):
    """Sets the pointer mode used by the cuBLAS library.

    Parameters:
        mode: CUBLAS_POINTER_MODE_HOST or CUBLAS_POINTER_MODE_DEVICE
              (the default cuBLAS mode is CUBLAS_POINTER_MODE_HOST).
    """
    err = self._lib.cublasSetPointerMode_v2(self.handle, mode)
    if err:
        raise CU.error("cublasSetPointerMode_v2", err)
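# Pointer mode determines where cuBLAS reads alpha and beta from: with
# CUBLAS_POINTER_MODE_HOST they are host-resident scalars, with
# CUBLAS_POINTER_MODE_DEVICE they must live in device memory. A sketch
# (illustrative; `blas` is an instance of this wrapper class):
def _example_host_pointer_mode(blas):
    blas.set_pointer_mode(CUBLAS_POINTER_MODE_HOST)
    # alpha/beta passed to subsequent gemm calls may now be one-element
    # numpy arrays resident on the host.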
def get_convolution_forward_workspace_size(
        self, src_desc, filter_desc, conv_dec, dest_desc, algo):
    """Returns required size of the additional temporary buffer
    for the specified forward convolution algorithm.
    """
    size = ffi.new("size_t *")
    err = self._lib.cudnnGetConvolutionForwardWorkspaceSize(
        self.handle, src_desc, filter_desc, conv_dec, dest_desc,
        algo, size)
    if err:
        raise CU.error("cudnnGetConvolutionForwardWorkspaceSize", err)
    return int(size[0])
def dgemm(self, transA, transB,
          rowsCountA, columnCountB, commonSideLength,
          alpha, A, B, beta, C,
          strideA=0, strideB=0, strideC=0):
    """Double precision (double) GEneral Matrix Multiplication.

    Matrices are always in column order.

    C = alpha * dot(A, B) + beta * C
    C = alpha * dot(A^T, B) + beta * C
    C = alpha * dot(A, B^T) + beta * C
    C = alpha * dot(A^T, B^T) + beta * C

    alpha, A, B, beta, C can be numpy array, Memory object,
    cffi pointer or int.

    Parameters:
        transA: how matrix A is to be transposed
                (CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C).
        transB: how matrix B is to be transposed
                (CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C).
        rowsCountA: number of rows in matrix A.
        columnCountB: number of columns in matrix B.
        commonSideLength: length of the common side of the matrices.
        alpha: the factor of matrix A.
        A: matrix A.
        B: matrix B.
        beta: the factor of matrix C.
        C: Buffer object storing matrix C.
        strideA: leading dimension of matrix A:
                 >= commonSideLength if A is transposed,
                 else >= rowsCountA.
        strideB: leading dimension of matrix B:
                 >= columnCountB if B is transposed,
                 else >= commonSideLength.
        strideC: leading dimension of matrix C: >= rowsCountA.

    Returns:
        None.
    """
    if not strideA:
        strideA = commonSideLength if transA != CUBLAS_OP_N else rowsCountA
    if not strideB:
        strideB = (columnCountB if transB != CUBLAS_OP_N
                   else commonSideLength)
    if not strideC:
        strideC = rowsCountA
    err = self._lib.cublasDgemm_v2(
        self.handle, transA, transB,
        rowsCountA, columnCountB, commonSideLength,
        CU.extract_ptr(alpha), A, strideA, B, strideB,
        CU.extract_ptr(beta), C, strideC)
    if err:
        raise CU.error("cublasDgemm_v2", err)
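# A dgemm usage sketch (illustrative, not from the original source): since
# matrices are column-major, a C-contiguous numpy array is implicitly
# transposed, which the transA/transB flags can absorb. The device buffers
# a_dev, b_dev, c_dev are assumed to be allocated elsewhere.
def _example_dgemm(blas, a_dev, b_dev, c_dev, m, n, k):
    import numpy
    alpha = numpy.ones(1, dtype=numpy.float64)
    beta = numpy.zeros(1, dtype=numpy.float64)
    # C(m x n) = A(m x k) * B(k x n), all column-major, no transposition.
    blas.dgemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
               alpha, a_dev, b_dev, beta, c_dev)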
def get_convolution_forward_algorithm(
        self, src_desc, filter_desc, conv_dec, dest_desc,
        preference=CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, memory_limit=0):
    """Returns forward algorithm based on parameters.
    """
    algo = ffi.new("cudnnConvolutionFwdAlgo_t *")
    err = self._lib.cudnnGetConvolutionForwardAlgorithm(
        self.handle, src_desc, filter_desc, conv_dec, dest_desc,
        preference, memory_limit, algo)
    if err:
        raise CU.error("cudnnGetConvolutionForwardAlgorithm", err)
    return int(algo[0])
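# The two queries above are typically chained: pick an algorithm first, then
# ask how much scratch space it needs. A sketch (illustrative; the
# descriptors are assumed to be configured elsewhere):
def _example_forward_algo_and_workspace(cudnn, src_desc, filter_desc,
                                        conv_desc, dest_desc):
    algo = cudnn.get_convolution_forward_algorithm(
        src_desc, filter_desc, conv_desc, dest_desc)
    size = cudnn.get_convolution_forward_workspace_size(
        src_desc, filter_desc, conv_desc, dest_desc, algo)
    # The caller then allocates `size` bytes of device memory to pass as
    # the workspace for the forward convolution.
    return algo, size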
def __init__(self, context):
    self._context = context
    self._lib = None
    context._add_ref(self)
    initialize()
    handle = ffi.new("cublasHandle_t *")
    with context:
        err = lib.cublasCreate_v2(handle)
    if err:
        self._handle = None
        raise CU.error("cublasCreate_v2", err)
    self._lib = lib  # to hold the reference
    self._handle = handle[0]
def set_2d(self, window_hw, padding_vh, stride_vh, mode=CUDNN_POOLING_MAX):
    """Initializes the pooling descriptor for 2D pooling.

    Parameters:
        window_hw: tuple of ints for pooling window (height, width).
        padding_vh: tuple for padding (vertical, horizontal).
        stride_vh: tuple for stride (vertical, horizontal).
        mode: pooling mode.
    """
    err = self._lib.cudnnSetPooling2dDescriptor(
        self.handle, mode, window_hw[0], window_hw[1],
        padding_vh[0], padding_vh[1], stride_vh[0], stride_vh[1])
    if err:
        raise CU.error("cudnnSetPooling2dDescriptor", err)
def set_4d(self, data_type, k, c, h, w):
    """Initializes the filter descriptor into a 4D filter.

    Parameters:
        data_type: CUDNN_DATA_FLOAT or CUDNN_DATA_DOUBLE.
        k: number of kernels.
        c: number of image channels.
        h: image height.
        w: image width.
    """
    err = self._lib.cudnnSetFilter4dDescriptor(
        self.handle, data_type, k, c, h, w)
    if err:
        raise CU.error("cudnnSetFilter4dDescriptor", err)
def transform_tensor(self, alpha, src_desc, src_data,
                     beta, dest_desc, dest_data):
    """Transforms data from one layout to another
    (e.g. interleaved to planar).

    Parameters:
        alpha: src_data multiplier (numpy array with one element).
        beta: dest_data multiplier (numpy array with one element).
    """
    err = self._lib.cudnnTransformTensor(
        self.handle, CU.extract_ptr(alpha), src_desc, src_data,
        CU.extract_ptr(beta), dest_desc, dest_data)
    if err:
        raise CU.error("cudnnTransformTensor", err)
def __init__(self, context):
    self._context = context
    self._lib = None
    context._add_ref(self)
    initialize()
    self.version = int(lib.cudnnGetVersion())
    handle = ffi.new("cudnnHandle_t *")
    with context:
        err = lib.cudnnCreate(handle)
    if err:
        self._handle = None
        raise CU.error("cudnnCreate", err)
    self._lib = lib  # to hold the reference
    self._handle = int(handle[0])
def pooling_forward(self, pooling_desc, alpha, src_desc, src_data,
                    beta, dest_desc, dest_data):
    """Does pooling forward propagation.

    Parameters:
        alpha: src_data multiplier (numpy array with one element).
        beta: dest_data multiplier (numpy array with one element).
    """
    err = self._lib.cudnnPoolingForward(
        self.handle, pooling_desc, CU.extract_ptr(alpha),
        src_desc, src_data, CU.extract_ptr(beta), dest_desc, dest_data)
    if err:
        raise CU.error("cudnnPoolingForward", err)
def generate64(self, dst, count=None):
    """Generates the specified number of 64-bit random values.

    Valid only for 64-bit generators.

    Parameters:
        dst: buffer to store the results, or a numpy array in case of
             the host generator.
        count: number of 64-bit values to put into dst, or None to fill
               dst completely when its size is available.
    """
    dst, count = self._extract_ptr_and_count(dst, count, 8)
    err = self._lib.curandGenerateLongLong(self.handle, dst, count)
    if err:
        raise CU.error("curandGenerateLongLong", err)
def generate_uniform_double(self, dst, count=None):
    """Generates the specified number of 64-bit uniformly distributed
    floats.

    Will generate values in the range (0, 1].

    Parameters:
        dst: buffer to store the results, or a numpy array in case of
             the host generator.
        count: number of 64-bit floats to put into dst, or None to fill
               dst completely when its size is available.
    """
    dst, count = self._extract_ptr_and_count(dst, count, 8)
    err = self._lib.curandGenerateUniformDouble(self.handle, dst, count)
    if err:
        raise CU.error("curandGenerateUniformDouble", err)
def generate_uniform(self, dst, count=None):
    """Generates the specified number of 32-bit uniformly distributed
    floats.

    Will generate values in the range (0, 1].

    Parameters:
        dst: buffer to store the results, or a numpy array in case of
             the host generator.
        count: number of 32-bit floats to put into dst, or None to fill
               dst completely when its size is available.
    """
    dst, count = self._extract_ptr_and_count(dst, count, 4)
    err = self._lib.curandGenerateUniform(self.handle, dst, count)
    if err:
        raise CU.error("curandGenerateUniform", err)
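# When dst is a device buffer rather than a numpy array, count can often be
# inferred from the buffer size, but passing it explicitly is unambiguous.
# A sketch (illustrative; `rng` is a device generator and `buf` a device
# allocation of at least 4 * n bytes):
def _example_uniform_device(rng, buf, n):
    rng.generate_uniform(buf, n)  # n 32-bit floats in (0, 1]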
def set_4d(self, fmt, data_type, n, c, h, w):
    """Initializes the tensor descriptor into a 4D tensor.

    Parameters:
        fmt: CUDNN_TENSOR_NCHW or CUDNN_TENSOR_NHWC.
        data_type: CUDNN_DATA_FLOAT or CUDNN_DATA_DOUBLE.
        n: number of images.
        c: number of image channels.
        h: image height.
        w: image width.
    """
    err = self._lib.cudnnSetTensor4dDescriptor(
        self.handle, fmt, data_type, n, c, h, w)
    if err:
        raise CU.error("cudnnSetTensor4dDescriptor", err)
def convolution_backward_bias(self, alpha, src_desc, src_data,
                              beta, dest_desc, dest_data):
    """Computes gradient for the bias.

    Parameters:
        alpha: src_data multiplier (numpy array with one element).
        beta: dest_data multiplier (numpy array with one element).
        src_data: error for backpropagation.
        dest_data: gradient for the bias.
    """
    err = self._lib.cudnnConvolutionBackwardBias(
        self.handle, CU.extract_ptr(alpha), src_desc, src_data,
        CU.extract_ptr(beta), dest_desc, dest_data)
    if err:
        raise CU.error("cudnnConvolutionBackwardBias", err)
def __init__(self, context):
    self._context = context
    self._lib = None
    context._add_ref(self)
    initialize()
    handle = ffi.new("cufftHandle *")
    with context:
        err = lib.cufftCreate(handle)
    if err:
        self._handle = None
        raise CU.error("cufftCreate", err)
    self._lib = lib  # to hold the reference
    self._handle = int(handle[0])
    self._auto_allocation = True
    self._workarea = None
    self.execute = self._exec_unknown
def generate_normal_double(self, dst, count=None, mean=0.0, stddev=1.0):
    """Generates the specified number of 64-bit normally distributed
    floats.

    Parameters:
        dst: buffer to store the results, or a numpy array in case of
             the host generator.
        count: number of 64-bit floats to put into dst, or None to fill
               dst completely when its size is available.
        mean: mean of the normal distribution to generate.
        stddev: stddev of the normal distribution to generate.
    """
    dst, count = self._extract_ptr_and_count(dst, count, 8)
    err = self._lib.curandGenerateNormalDouble(
        self.handle, dst, count, float(mean), float(stddev))
    if err:
        raise CU.error("curandGenerateNormalDouble", err)
def generate_poisson(self, dst, count=None, lam=1.0):
    """Generates the specified number of 32-bit unsigned integer values
    with a Poisson distribution.

    Parameters:
        dst: buffer to store the results, or a numpy array in case of
             the host generator.
        count: number of 32-bit unsigned ints to put into dst, or None to
               fill dst completely when its size is available.
        lam: lambda value of the Poisson distribution.
    """
    dst, count = self._extract_ptr_and_count(dst, count, 4)
    err = self._lib.curandGeneratePoisson(
        self.handle, dst, count, float(lam))
    if err:
        raise CU.error("curandGeneratePoisson", err)
def auto_allocation(self, value):
    alloc = bool(value)
    err = self._lib.cufftSetAutoAllocation(self.handle, alloc)
    if err:
        raise CU.error("cufftSetAutoAllocation", err)
    self._auto_allocation = alloc
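# The plan attributes defined in this section (auto_allocation, size,
# workarea) combine into a manual work-area flow: turn auto-allocation off
# before the plan is made, then attach a caller-owned device buffer of the
# required size. A sketch (illustrative; `alloc_device_bytes` is a
# hypothetical allocator, and the setters are assumed to be exposed as
# properties on the plan object):
def _example_manual_workarea(fft, xyz, batch, fft_type, alloc_device_bytes):
    fft.auto_allocation = False          # must precede plan creation
    required = fft.make_plan_many(xyz, batch, fft_type)
    buf = alloc_device_bytes(required)   # caller-owned scratch buffer
    fft.workarea = buf
    return buf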
def sgemm_ex(self, transA, transB,
             rowsCountA, columnCountB, commonSideLength,
             alpha, A, B, beta, C,
             strideA=0, strideB=0, strideC=0,
             dtypeA=CUBLAS_DATA_HALF, dtypeB=CUBLAS_DATA_HALF,
             dtypeC=CUBLAS_DATA_HALF):
    """Single precision (float) GEneral Matrix Multiplication
    with support of different data types for each matrix.

    Matrices are always in column order.

    C = alpha * dot(A, B) + beta * C
    C = alpha * dot(A^T, B) + beta * C
    C = alpha * dot(A, B^T) + beta * C
    C = alpha * dot(A^T, B^T) + beta * C

    alpha, A, B, beta, C can be numpy array, Memory object,
    cffi pointer or int.

    Parameters:
        transA: how matrix A is to be transposed
                (CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C).
        transB: how matrix B is to be transposed
                (CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C).
        rowsCountA: number of rows in matrix A.
        columnCountB: number of columns in matrix B.
        commonSideLength: length of the common side of the matrices.
        alpha: the factor of matrix A.
        A: matrix A.
        B: matrix B.
        beta: the factor of matrix C.
        C: Buffer object storing matrix C.
        strideA: leading dimension of matrix A:
                 >= commonSideLength if A is transposed,
                 else >= rowsCountA.
        strideB: leading dimension of matrix B:
                 >= columnCountB if B is transposed,
                 else >= commonSideLength.
        strideC: leading dimension of matrix C: >= rowsCountA.
        dtypeA: data type of matrix A (CUBLAS_DATA_FLOAT,
                CUBLAS_DATA_DOUBLE, CUBLAS_DATA_HALF, CUBLAS_DATA_INT8).
        dtypeB: data type of matrix B (CUBLAS_DATA_FLOAT,
                CUBLAS_DATA_DOUBLE, CUBLAS_DATA_HALF, CUBLAS_DATA_INT8).
        dtypeC: data type of matrix C (CUBLAS_DATA_FLOAT,
                CUBLAS_DATA_DOUBLE, CUBLAS_DATA_HALF, CUBLAS_DATA_INT8).

    Returns:
        None.
    """
    if not strideA:
        strideA = commonSideLength if transA != CUBLAS_OP_N else rowsCountA
    if not strideB:
        strideB = (columnCountB if transB != CUBLAS_OP_N
                   else commonSideLength)
    if not strideC:
        strideC = rowsCountA
    err = self._lib.cublasSgemmEx(
        self.handle, transA, transB,
        rowsCountA, columnCountB, commonSideLength,
        CU.extract_ptr(alpha), A, dtypeA, strideA, B, dtypeB, strideB,
        CU.extract_ptr(beta), C, dtypeC, strideC)
    if err:
        raise CU.error("cublasSgemmEx", err)
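# sgemm_ex computes in single precision while letting each matrix be stored
# in a narrower type; with the CUBLAS_DATA_HALF defaults above it reads
# half-precision A, B and C while alpha and beta stay float32. A sketch
# (illustrative; the device buffers are assumed to hold half-precision
# data):
def _example_sgemm_ex_half(blas, a_dev, b_dev, c_dev, m, n, k):
    import numpy
    alpha = numpy.ones(1, dtype=numpy.float32)   # scalars stay float32
    beta = numpy.zeros(1, dtype=numpy.float32)
    blas.sgemm_ex(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                  alpha, a_dev, b_dev, beta, c_dev)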