def _calc_workgroups(self):
    """Pick the workgroup sizes, then derive the global work sizes.

    Steps performed, in order:

    1. ``self.max_workgroup_size`` is clamped to the first entry of the
       device's ``max_work_item_sizes`` (or set to it when no value was
       requested), then clamped again to the ``max_work_group_size``
       reported through ``ocl`` for ``self.device``; some drivers report
       a wrong size, which is addressed in ``ocl``.
    2. ``self.kernels`` is rebuilt from the class-level ``kernels`` table:
       integer entries are clamped to the maximum workgroup size, sequence
       entries are kept only if the product of their items fits.  A kernel
       missing from ``self.kernels`` must not be used.
    3. ``self.red_size`` is derived from the smaller of the maximum
       workgroup size and the square root of the number of pixels.
    4. For every successive halving of ``self.shape`` (one per octave)
       still larger than ``2 * par.BorderDist + 2``, a workgroup size
       ``(n, 1)`` is appended to ``self.wgsize`` and the matching
       processing size to ``self.procsize``.
    """
    device_limit = self.ctx.devices[0].max_work_item_sizes[0]
    if self.max_workgroup_size:
        self.max_workgroup_size = min(device_limit, self.max_workgroup_size)
    else:
        self.max_workgroup_size = device_limit

    # MacOSX driver on CPU usually reports bad workgroup size: this is addressed in ocl
    ocl_device = ocl.platforms[self.device[0]].devices[self.device[1]]
    self.max_workgroup_size = min(self.max_workgroup_size,
                                  ocl_device.max_work_group_size)

    self.kernels = {}
    for kernel_name, wg_spec in self.__class__.kernels.items():
        if isinstance(wg_spec, int):
            self.kernels[kernel_name] = min(wg_spec, self.max_workgroup_size)
            continue
        # Not an int: probably a list of per-dimension sizes
        total_items = 1
        for dim in wg_spec:
            total_items *= dim
        if total_items <= self.max_workgroup_size:
            self.kernels[kernel_name] = wg_spec
        # Otherwise the kernel cannot run on this device and is left out.

    pixel_count = self.shape[0] * self.shape[1]
    self.red_size = nextpower(min(self.max_workgroup_size, numpy.sqrt(pixel_count)))

    # Recalculate the shapes, halving each dimension at every step
    level_shape = self.shape
    smallest_allowed = 2 * par.BorderDist + 2
    while min(level_shape) > smallest_allowed:
        first_dim = min(nextpower(level_shape[-1]), self.max_workgroup_size)
        wg = (first_dim, 1)
        self.wgsize.append(wg)
        self.procsize.append(calc_size(level_shape[-1::-1], wg))
        level_shape = tuple(side // 2 for side in level_shape)
def _calc_workgroups(self):
    """Pick the workgroup sizes, then derive the global work sizes.

    Steps performed, in order:

    1. ``self.max_workgroup_size`` is clamped to the first entry of the
       device's ``max_work_item_sizes`` (or set to it when no value was
       requested), then clamped again to the ``max_work_group_size``
       reported through ``ocl`` for ``self.device``; some drivers report
       a wrong size, which is addressed in ``ocl``.
    2. ``self.kernels`` is rebuilt from the class-level ``kernels`` table:
       integer entries are clamped to the maximum workgroup size, sequence
       entries are kept only if the product of their items fits.  A kernel
       missing from ``self.kernels`` must not be used.
    3. ``self.red_size`` is derived from the smaller of the maximum
       workgroup size and the square root of the number of pixels.
    4. For every successive halving of ``self.shape`` (one per octave)
       still larger than ``2 * par.BorderDist + 2``, a workgroup size
       ``(n, 1)`` is appended to ``self.wgsize`` and the matching
       processing size to ``self.procsize``.
    """
    device_limit = self.ctx.devices[0].max_work_item_sizes[0]
    if self.max_workgroup_size:
        self.max_workgroup_size = min(device_limit, self.max_workgroup_size)
    else:
        self.max_workgroup_size = device_limit

    # MacOSX driver on CPU usually reports bad workgroup size: this is addressed in ocl
    ocl_device = ocl.platforms[self.device[0]].devices[self.device[1]]
    self.max_workgroup_size = min(self.max_workgroup_size,
                                  ocl_device.max_work_group_size)

    self.kernels = {}
    for kernel_name, wg_spec in self.__class__.kernels.items():
        if isinstance(wg_spec, int):
            self.kernels[kernel_name] = min(wg_spec, self.max_workgroup_size)
            continue
        # Not an int: probably a list of per-dimension sizes
        total_items = 1
        for dim in wg_spec:
            total_items *= dim
        if total_items <= self.max_workgroup_size:
            self.kernels[kernel_name] = wg_spec
        # Otherwise the kernel cannot run on this device and is left out.

    pixel_count = self.shape[0] * self.shape[1]
    self.red_size = nextpower(min(self.max_workgroup_size, numpy.sqrt(pixel_count)))

    # Recalculate the shapes, halving each dimension at every step
    level_shape = self.shape
    smallest_allowed = 2 * par.BorderDist + 2
    while min(level_shape) > smallest_allowed:
        first_dim = min(nextpower(level_shape[-1]), self.max_workgroup_size)
        wg = (first_dim, 1)
        self.wgsize.append(wg)
        self.procsize.append(calc_size(level_shape[-1::-1], wg))
        level_shape = tuple(side // 2 for side in level_shape)
def _calc_memory(self):
    """Estimate the memory footprint of all buffers, to check it fits on the device.

    The estimate is accumulated into ``self.memory`` (in bytes).  As side
    effects, ``self.kpsize`` (number of keypoint slots) and
    ``self.red_size`` (size used for the reduction scratch space) are set.

    Contributions, in order: a fixed 75 MiB for context and kernels, the
    input image (three channels when ``self.RGB``), the blur and DoG
    float32 images, the two float4 keypoint arrays, the 128-byte
    descriptors, the keypoint counter, the reduction scratch space and
    finally one float32 buffer per gaussian kernel.
    """
    # Just the context + kernel takes about 75MB on the GPU
    self.memory = 75 * 2 ** 20
    size_of_float = numpy.dtype(numpy.float32).itemsize
    size_of_input = numpy.dtype(self.dtype).itemsize

    # raw images:
    npixels = self.shape[0] * self.shape[1]
    self.memory += npixels * size_of_input  # initial_image (no raw_float)
    if self.RGB:
        self.memory += 2 * npixels * size_of_input  # one of three was already counted

    nr_blur = par.Scales + 3  # 3 blurs and 2 tmp
    nr_dogs = par.Scales + 2
    self.memory += npixels * (nr_blur + nr_dogs) * size_of_float

    # Is the number of kp independent of the octave ? int64 causes problems with pyopencl
    self.kpsize = int(self.shape[0] * self.shape[1] // self.PIX_PER_KP)
    # those are array of float4 to register keypoints, we need two of them
    self.memory += self.kpsize * size_of_float * 4 * 2
    self.memory += self.kpsize * 128  # stores the descriptors: 128 unsigned chars
    self.memory += 4  # keypoint index Counter

    wg_float = min(self.max_workgroup_size, numpy.sqrt(self.shape[0] * self.shape[1]))
    self.red_size = nextpower(wg_float)
    self.memory += 4 * 2 * self.red_size  # temporary storage for reduction

    ########################################################################
    # Calculate space for gaussian kernels
    ########################################################################
    curSigma = 1.0 if par.DoubleImSize else 0.5
    if self._init_sigma > curSigma:
        sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2)
        ksize = kernel_size(sigma, True)
        # Arguments are passed separately so formatting only happens when
        # the DEBUG level is actually enabled.
        logger.debug("pre-Allocating %s float for init blur", ksize)
        self.memory += ksize * size_of_float

    prevSigma = self._init_sigma
    for _ in range(par.Scales + 2):
        increase = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
        ksize = kernel_size(increase, True)
        logger.debug("pre-Allocating %s float for blur sigma: %s", ksize, increase)
        self.memory += ksize * size_of_float
        prevSigma *= self.sigmaRatio
def _init_gaussian(self, sigma):
    """Create the device buffer holding a normalised gaussian of width ``sigma``.

    The buffer length comes from ``kernel_size(sigma, True)``.  When the
    workgroup size available for the ``gaussian.gaussian`` kernel is large
    enough, the values are computed on the device; otherwise they are
    computed with numpy and uploaded, using the same formula::

        x = numpy.arange(size) - (size - 1.0) / 2.0
        gaussian = numpy.exp(-(x / sigma) ** 2 / 2.0).astype(numpy.float32)
        gaussian /= gaussian.sum(dtype=numpy.float32)

    :param sigma: width of the gaussian
    :return: the device array, also stored in ``self.buffers`` under the
             key ``"gaussian_<sigma>"``
    """
    buffer_key = "gaussian_%s" % sigma
    length = kernel_size(sigma, True)
    needed_wg = nextpower(length)
    logger.info("Allocating %s float for blur sigma: %s. wg=%s max_wg=%s",
                length, sigma, needed_wg, self.max_workgroup_size)
    available_wg = self.kernels["gaussian.gaussian"]

    if available_wg < needed_wg:
        # Common bug on OSX when running on CPU: fall back to a host-side
        # computation and upload the result.
        # NOTE(review): the message prints max_workgroup_size although the
        # test above compares against the required workgroup size -- confirm.
        logger.info("Workgroup size error: gaussian wg: %s < max_work_group_size: %s",
                    available_wg, self.max_workgroup_size)
        offsets = numpy.arange(length) - (length - 1.0) / 2.0
        weights = numpy.exp(-(offsets / sigma) ** 2 / 2.0).astype(numpy.float32)
        weights /= weights.sum(dtype=numpy.float32)
        device_array = pyopencl.array.to_device(self.queue, weights)
    else:
        device_array = pyopencl.array.empty(self.queue, length, dtype=numpy.float32)
        launch_evt = self.programs["gaussian"].gaussian(
            self.queue, (needed_wg,), (needed_wg,),
            device_array.data,      # __global float *data,
            numpy.float32(sigma),   # const float sigma,
            numpy.int32(length))    # const int SIZE
        if self.profile:
            self.events.append(("gaussian %s" % sigma, launch_evt))

    self.buffers[buffer_key] = device_array
    return device_array
def _calc_memory(self):
    """Estimate the memory footprint of all buffers, to check it fits on the device.

    The estimate is accumulated into ``self.memory`` (in bytes).  As side
    effects, ``self.kpsize`` (number of keypoint slots) and
    ``self.red_size`` (size used for the reduction scratch space) are set.

    Contributions, in order: a fixed 75 MiB for context and kernels, the
    input image (three channels when ``self.RGB``), the blur and DoG
    float32 images, the two float4 keypoint arrays, the 128-byte
    descriptors, the keypoint counter, the reduction scratch space and
    finally one float32 buffer per gaussian kernel.
    """
    # Just the context + kernel takes about 75MB on the GPU
    self.memory = 75 * 2 ** 20
    size_of_float = numpy.dtype(numpy.float32).itemsize
    size_of_input = numpy.dtype(self.dtype).itemsize

    # raw images:
    npixels = self.shape[0] * self.shape[1]
    self.memory += npixels * size_of_input  # initial_image (no raw_float)
    if self.RGB:
        self.memory += 2 * npixels * size_of_input  # one of three was already counted

    nr_blur = par.Scales + 3  # 3 blurs and 2 tmp
    nr_dogs = par.Scales + 2
    self.memory += npixels * (nr_blur + nr_dogs) * size_of_float

    # Is the number of kp independent of the octave ? int64 causes problems with pyopencl
    self.kpsize = int(self.shape[0] * self.shape[1] // self.PIX_PER_KP)
    # those are array of float4 to register keypoints, we need two of them
    self.memory += self.kpsize * size_of_float * 4 * 2
    self.memory += self.kpsize * 128  # stores the descriptors: 128 unsigned chars
    self.memory += 4  # keypoint index Counter

    wg_float = min(self.max_workgroup_size, numpy.sqrt(self.shape[0] * self.shape[1]))
    self.red_size = nextpower(wg_float)
    self.memory += 4 * 2 * self.red_size  # temporary storage for reduction

    ########################################################################
    # Calculate space for gaussian kernels
    ########################################################################
    curSigma = 1.0 if par.DoubleImSize else 0.5
    if self._init_sigma > curSigma:
        sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2)
        ksize = kernel_size(sigma, True)
        # Arguments are passed separately so formatting only happens when
        # the DEBUG level is actually enabled.
        logger.debug("pre-Allocating %s float for init blur", ksize)
        self.memory += ksize * size_of_float

    prevSigma = self._init_sigma
    for _ in range(par.Scales + 2):
        increase = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
        ksize = kernel_size(increase, True)
        logger.debug("pre-Allocating %s float for blur sigma: %s", ksize, increase)
        self.memory += ksize * size_of_float
        prevSigma *= self.sigmaRatio
def _init_gaussian(self, sigma):
    """Create the device buffer holding a normalised gaussian of width ``sigma``.

    The buffer length comes from ``kernel_size(sigma, True)``.  When the
    workgroup size available for the ``gaussian.gaussian`` kernel is large
    enough, the values are computed on the device; otherwise they are
    computed with numpy and uploaded, using the same formula::

        x = numpy.arange(size) - (size - 1.0) / 2.0
        gaussian = numpy.exp(-(x / sigma) ** 2 / 2.0).astype(numpy.float32)
        gaussian /= gaussian.sum(dtype=numpy.float32)

    :param sigma: width of the gaussian
    :return: the device array, also stored in ``self.buffers`` under the
             key ``"gaussian_<sigma>"``
    """
    buffer_key = "gaussian_%s" % sigma
    length = kernel_size(sigma, True)
    needed_wg = nextpower(length)
    logger.info("Allocating %s float for blur sigma: %s. wg=%s max_wg=%s",
                length, sigma, needed_wg, self.max_workgroup_size)
    available_wg = self.kernels["gaussian.gaussian"]

    if available_wg < needed_wg:
        # Common bug on OSX when running on CPU: fall back to a host-side
        # computation and upload the result.
        # NOTE(review): the message prints max_workgroup_size although the
        # test above compares against the required workgroup size -- confirm.
        logger.info("Workgroup size error: gaussian wg: %s < max_work_group_size: %s",
                    available_wg, self.max_workgroup_size)
        offsets = numpy.arange(length) - (length - 1.0) / 2.0
        weights = numpy.exp(-(offsets / sigma) ** 2 / 2.0).astype(numpy.float32)
        weights /= weights.sum(dtype=numpy.float32)
        device_array = pyopencl.array.to_device(self.queue, weights)
    else:
        device_array = pyopencl.array.empty(self.queue, length, dtype=numpy.float32)
        launch_evt = self.programs["gaussian"].gaussian(
            self.queue, (needed_wg,), (needed_wg,),
            device_array.data,      # __global float *data,
            numpy.float32(sigma),   # const float sigma,
            numpy.int32(length))    # const int SIZE
        if self.profile:
            self.events.append(("gaussian %s" % sigma, launch_evt))

    self.buffers[buffer_key] = device_array
    return device_array