def _init_gaussian(self, sigma):
    """Build the gaussian coefficients for one blur width and keep them on the device.

    The coefficients are produced by the "gaussian" OpenCL kernel when the
    workgroup size available for that kernel is large enough; otherwise they
    are calculated with numpy on the host and uploaded. In both cases the
    device array is registered in ``self.cl_mem`` under ``"gaussian_<sigma>"``.

    Same calculation done on CPU::

        x = numpy.arange(size) - (size - 1.0) / 2.0
        gaussian = numpy.exp(-(x / sigma) ** 2 / 2.0).astype(numpy.float32)
        gaussian /= gaussian.sum(dtype=numpy.float32)

    :param sigma: width of the gaussian, the length of the function will be 8*sigma + 1
    :return: device array (float32) containing the gaussian
    """
    queue = self.queue
    pyopencl.enqueue_barrier(queue).wait()
    key = "gaussian_%s" % sigma
    nb_coef = kernel_size(sigma, True)
    group = nextpower(nb_coef)

    logger.info("Allocating %s float for blur sigma: %s. wg=%s max_wg=%s",
                nb_coef, sigma, group, self.block_size)
    allowed_wg = self.kernels_wg["gaussian"]

    if allowed_wg < group:
        # Host fallback: common bug on OSX when running on CPU
        logger.info("Workgroup size error: gaussian wg: %s < max_work_group_size: %s",
                    allowed_wg, self.block_size)
        positions = numpy.arange(nb_coef) - (nb_coef - 1.0) / 2.0
        host_gaussian = numpy.exp(-(positions / sigma) ** 2 / 2.0).astype(numpy.float32)
        host_gaussian /= host_gaussian.sum(dtype=numpy.float32)
        device_gaussian = pyopencl.array.to_device(queue, host_gaussian)
    else:
        device_gaussian = pyopencl.array.empty(queue, nb_coef, dtype=numpy.float32)
        pyopencl.enqueue_barrier(queue).wait()
        gaussian_kernel = self.kernels.get_kernel("gaussian")
        local_bytes = 4 * group
        scratch_a = pyopencl.LocalMemory(local_bytes)
        scratch_b = pyopencl.LocalMemory(local_bytes)
        launch = (group,)
        event = gaussian_kernel(queue, launch, launch,
                                device_gaussian.data,
                                numpy.float32(sigma),  # const float sigma,
                                numpy.int32(nb_coef),  # const int SIZE
                                scratch_a, scratch_b)  # some shared memory
        pyopencl.enqueue_barrier(queue).wait()
        if self.profile:
            self.events.append(("gaussian %s" % sigma, event))

    self.cl_mem[key] = device_gaussian
    return device_gaussian
def test_orientation(self):
    """
    Tests the keypoints orientation assignment kernel.

    The kernel output is compared with the Python reference ``my_orientation``
    on the common range of keypoints. The test returns early when
    ``self.abort`` is set or when the requested workgroup size is larger than
    what the kernel accepts.
    """
    if self.abort:
        return

    # Input data, then compaction of the keypoint list
    keypoints, nb_keypoints, updated_nb_keypoints, grad, ori, octsize = orientation_setup()
    keypoints, updated_nb_keypoints = my_compact(numpy.copy(keypoints), nb_keypoints)
    logger.info("Number of keypoints before orientation assignment : %s", updated_nb_keypoints)

    # Prepare kernel call
    workgroup = self.wg_orient
    program = self.program_orient
    orient_kernel = program.all_kernels()[0]
    kernel_limit = kernel_workgroup_size(program, orient_kernel)
    if kernel_limit < workgroup[0]:
        logger.warning("test_orientation: Skipping test of WG=%s when maximum for this kernel is %s ",
                       workgroup, kernel_limit)
        return

    # one workgroup per keypoint
    global_size = (keypoints.shape[0] * workgroup[0],)
    queue = self.queue
    d_keypoints = pyopencl.array.to_device(queue, keypoints)
    d_grad = pyopencl.array.to_device(queue, grad)
    d_ori = pyopencl.array.to_device(queue, ori)
    orisigma = numpy.float32(1.5)  # SIFT
    grad_height, grad_width = numpy.int32(grad.shape)
    first_kp = numpy.int32(0)
    last_kp = numpy.int32(numpy.int32(updated_nb_keypoints))
    d_counter = pyopencl.array.to_device(queue, last_kp)

    arguments = [d_keypoints.data, d_grad.data, d_ori.data, d_counter.data,
                 octsize, orisigma, nb_keypoints, first_kp, last_kp,
                 grad_width, grad_height]
    if not self.USE_CPU:
        # the non-CPU variant of the kernel takes three extra local buffers
        arguments.extend(pyopencl.LocalMemory(nfloat * 4) for nfloat in (36, 128, 128))

    # Call the kernel
    t_start = time.time()
    event = orient_kernel(queue, global_size, workgroup, *arguments)
    res = d_keypoints.get()
    cnt = d_counter.get()
    t_gpu_done = time.time()

    # Reference Python implementation
    ref, updated_nb_keypoints = my_orientation(keypoints, nb_keypoints, first_kp, last_kp,
                                               grad, ori, octsize, orisigma)
    t_cpu_done = time.time()

    # compare only the range present in both results
    upbound = min(cnt, updated_nb_keypoints)
    d1, d2, d3, d4 = keypoints_compare(ref[0:upbound], res[0:upbound])
    checks = ((d1, 1e-4, "delta_cols=%s"),
              (d2, 1e-4, "delta_rows=%s"),
              (d3, 1e-4, "delta_sigma=%s"),
              (d4, 1e-1, "delta_angle=%s"))  # orientation has a poor precision
    for delta, tolerance, template in checks:
        self.assertLess(delta, tolerance, template % (delta))
    for delta, _, template in checks:
        logger.info(template % delta)

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms."
                    % (1000.0 * (t_cpu_done - t_gpu_done), 1000.0 * (t_gpu_done - t_start)))
        logger.info("Orientation assignment took %.3fms"
                    % (1e-6 * (event.profile.end - event.profile.start)))
def _one_octave(self, octave):
    """
    Does all scales within an octave

    Sequence of operations, as launched below:

    1. for each scale, gaussian blur of ``scale_<i>`` into ``scale_<i+1>``
       and combination of both (weights -1/+1, "combine" kernel) into "DoGs";
    2. for each inner scale: "local_maxmin", "interp_keypoint", compaction
       of the keypoint list, "compute_gradient_orientation" and, only when
       compaction reports new keypoints, orientation assignment followed
       by the descriptor kernel (up to 3 attempts, lowering ``self.LOW_END``
       mode after each failure);
    3. "shrink" of ``scale_<par.Scales>`` into ``scale_0`` for the next
       octave (skipped for the last octave);
    4. copy of the keypoints and descriptors back to the host.

    When the three descriptor attempts all fail, an error is logged, no
    descriptor event is recorded for profiling and processing goes on.

    :param octave: number of the octave
    :return: 2-tuple (keypoints, descriptors): a float32 array of shape
             (n, 4) read from the "Kp_1" buffer and a uint8 array of shape
             (n, 128) read from the "descriptors" buffer, n being the last
             keypoint counter value read back from the device
    """
    prevSigma = self._init_sigma
    logger.info("Calculating octave %i", octave)
    wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
    kpsize32 = numpy.int32(self.kpsize)
    self._reset_keypoints()
    octsize = numpy.int32(2 ** octave)
    last_start = numpy.int32(0)

    # Launch parameters of this octave, identical for every scale
    oct_procsize = self.procsize[octave]
    oct_wgsize = self.wgsize[octave]
    oct_scales = self.scales[octave]

    for scale in range(par.Scales + 2):
        sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
        logger.info("Octave %i scale %s blur with sigma %s", octave, scale, sigma)

        ########################################################################
        # Calculate gaussian blur and DoG
        ########################################################################
        self._gaussian_convolution(self.cl_mem["scale_%i" % scale],
                                   self.cl_mem["scale_%i" % (scale + 1)],
                                   sigma, octave)
        prevSigma *= self.sigmaRatio
        evt = self.kernels.get_kernel("combine")(self.queue, oct_procsize, oct_wgsize,
                                                 self.cl_mem["scale_%i" % (scale + 1)].data, numpy.float32(-1.0),
                                                 self.cl_mem["scale_%i" % (scale)].data, numpy.float32(+1.0),
                                                 self.cl_mem["DoGs"].data, numpy.int32(scale),
                                                 *oct_scales)
        if self.profile:
            self.events.append(("DoG %s %s" % (octave, scale), evt))

    for scale in range(1, par.Scales + 1):
        evt = self.kernels.get_kernel("local_maxmin")(self.queue, oct_procsize, oct_wgsize,
                                                      self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                      self.cl_mem["Kp_1"].data,  # __global keypoint* output,
                                                      numpy.int32(par.BorderDist),  # int border_dist,
                                                      numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                      octsize,  # int octsize,
                                                      numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                                                      numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                                                      self.cl_mem["cnt"].data,  # __global int* counter,
                                                      kpsize32,  # int nb_keypoints,
                                                      numpy.int32(scale),  # int scale,
                                                      *oct_scales)  # int width, int height)
        if self.profile:
            self.events.append(("local_maxmin %s %s" % (octave, scale), evt))

        procsize = calc_size((self.kpsize,), wgsize)
        # Read the keypoint counter back: interp_keypoint takes it by value
        cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
        # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
        evt = self.kernels.get_kernel("interp_keypoint")(self.queue, procsize, wgsize,
                                                         self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                         self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                                         last_start,  # int start_keypoint,
                                                         self.cnt[0],  # int end_keypoint,
                                                         numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                         numpy.float32(self._init_sigma),  # float InitSigma,
                                                         *oct_scales)  # int width, int height)
        if self.profile:
            self.events += [("get cnt", cp_evt),
                            ("interp_keypoint %s %s" % (octave, scale), evt)]

        newcnt = self._compact(last_start)
        evt = self.kernels.get_kernel("compute_gradient_orientation")(self.queue, oct_procsize, oct_wgsize,
                                                                      self.cl_mem["scale_%s" % (scale)].data,  # __global float* igray,
                                                                      self.cl_mem["tmp"].data,  # __global float *grad,
                                                                      self.cl_mem["ori"].data,  # __global float *ori,
                                                                      *oct_scales)  # int width,int height
        if self.profile:
            self.events.append(("compute_gradient_orientation %s %s" % (octave, scale), evt))

        # Orientation assignement: 1D kernel, rather heavy kernel
        if newcnt and newcnt > last_start:  # launch kernel only if needed
            if self.USE_CPU:
                orientation_name = "orientation_cpu"
                scales = oct_scales
            else:
                orientation_name = "orientation_gpu"
                scales = list(oct_scales) + \
                    [pyopencl.LocalMemory(36 * 4),
                     pyopencl.LocalMemory(128 * 4),
                     pyopencl.LocalMemory(128 * 4)]
            orientation = self.kernels.get_kernel(orientation_name)
            wg = self.kernels_max_wg_size[orientation_name]
            wgsize2 = (wg,)
            procsize = (int(newcnt * wg),)
            evt = orientation(self.queue, procsize, wgsize2,
                              self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                              self.cl_mem["tmp"].data,  # __global float* grad,
                              self.cl_mem["ori"].data,  # __global float* ori,
                              self.cl_mem["cnt"].data,  # __global int* counter,
                              octsize,  # int octsize,
                              numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                              kpsize32,  # int max of nb_keypoints,
                              numpy.int32(last_start),  # int keypoints_start,
                              newcnt,  # int keypoints_end,
                              *scales)  # int grad_width, int grad_height)
            # newcnt = self.cl_mem["cnt"].get()[0] #do not forget to update numbers of keypoints, modified above !
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !

            # Event of the successful descriptor launch; stays None when
            # every attempt fails so that no unbound/stale event is recorded
            evt2 = None
            for _ in range(3):
                # up to 3 attempts
                if self.USE_CPU or (self.LOW_END > 1):
                    logger.info("Computing descriptors with CPU optimized kernels")
                    descriptor_name = "descriptor_cpu"
                    wg = self.kernels_max_wg_size[descriptor_name][0]
                    wgsize2 = (wg,)
                    procsize2 = (int(newcnt * wg),)
                else:
                    if self.LOW_END:
                        logger.info("Computing descriptors with older-GPU optimized kernels")
                        descriptor_name = "descriptor_gpu1"
                    else:
                        logger.info("Computing descriptors with newer-GPU optimized kernels")
                        descriptor_name = "descriptor_gpu2"
                    wgsize2 = self.kernels_max_wg_size[descriptor_name]
                    procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                try:
                    descriptor = self.kernels.get_kernel(descriptor_name)
                    evt2 = descriptor(self.queue, procsize2, wgsize2,
                                      self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                      self.cl_mem["descriptors"].data,  # ___global unsigned char *descriptors
                                      self.cl_mem["tmp"].data,  # __global float* grad,
                                      self.cl_mem["ori"].data,  # __global float* ori,
                                      octsize,  # int octsize,
                                      numpy.int32(last_start),  # int keypoints_start,
                                      self.cl_mem["cnt"].data,  # int* keypoints_end,
                                      *oct_scales)  # int grad_width, int grad_height)
                    evt2.wait()
                except (pyopencl.RuntimeError, pyopencl._cl.LogicError) as error:
                    self.LOW_END += 1
                    evt2 = None  # a failed launch must not be profiled
                    logger.error("Descriptor failed with %s. Switching to lower_end mode", error)
                    continue
                else:
                    break
            else:
                # All attempts raised: keep going (best effort) but say so
                logger.error("Descriptor calculation failed 3 times for octave %s scale %s: "
                             "descriptors of the new keypoints are not valid", octave, scale)
            if self.profile:
                self.events += [("%s %s %s" % (orientation_name, octave, scale), evt),
                                ("copy cnt D->H", evt_cp)]
                if evt2 is not None:
                    self.events.append(("%s %s %s" % (descriptor_name, octave, scale), evt2))
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            last_start = self.cnt[0]
            if self.profile:
                self.events.append(("copy cnt D->H", evt_cp))

    ########################################################################
    # Rescale all images to populate all octaves
    ########################################################################
    if octave < self.octave_max - 1:
        next_scales = self.scales[octave + 1]
        evt = self.kernels.get_kernel("shrink")(self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
                                                self.cl_mem["scale_%i" % (par.Scales)].data,
                                                self.cl_mem["scale_0"].data,
                                                numpy.int32(2), numpy.int32(2),
                                                oct_scales[0], oct_scales[1],
                                                *next_scales)
        if self.profile:
            self.events.append(("shrink %s->%s" % (oct_scales, next_scales), evt))

    results = numpy.empty((last_start, 4), dtype=numpy.float32)
    descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
    if last_start:
        evt = pyopencl.enqueue_copy(self.queue, results, self.cl_mem["Kp_1"].data)
        evt2 = pyopencl.enqueue_copy(self.queue, descriptors, self.cl_mem["descriptors"].data)
        if self.profile:
            self.events += [("copy D->H", evt), ("copy D->H", evt2)]
    return results, descriptors
def keypoints(self, image, mask=None):
    """Calculates the keypoints of the image

    The image is uploaded into the "scale_0" buffer (with a conversion to
    float32 when needed), its minimum and maximum are searched and passed
    to the "normalizes" kernel, an initial blur is applied when
    ``self._init_sigma`` requires it, then every octave is processed by
    ``_one_octave``. Keypoints whose coordinates contain NaN are discarded
    and the remaining ones are merged into a single record array.

    TODO: use a temporary list with events and use a single test at the end

    :param image: ndimage of 2D (or 3D if RGB)
    :param mask: TODO: implement a mask for sieving out the keypoints
                 (the value is not used for now)
    :return: vector of keypoint (1D numpy record array of dtype ``self.dtype_kp``)
    :raises RuntimeError: if the input format is not handled
    """
    # self.reset_timer()
    with self.sem:
        nb_valid = 0
        kp_chunks = []
        desc_chunks = []
        assert image.shape[:2] == self.shape
        assert image.dtype in [self.dtype, numpy.float32]
        on_device = isinstance(image, pyopencl.array.Array)
        # old versions of pyopencl do not check for data contiguity
        if not (on_device or image.flags["C_CONTIGUOUS"]):
            image = numpy.ascontiguousarray(image)
        t0 = time.time()

        ####################################################################
        # Upload the image, converted to float32, into "scale_0"
        ####################################################################
        if image.dtype == numpy.float32:
            source = image.data if on_device else image
            evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["scale_0"].data, source)
            if self.profile:
                self.events.append(("copy H->D", evt))
        elif self.dtype == numpy.float64:
            # A preprocessing kernel double_to_float exists, but is commented (RUNS ONLY ON GPU WITH FP64)
            # TODO: benchmark this kernel vs the current pure CPU format conversion with numpy.float32
            # and uncomment it if it proves faster (dubious, because of data transfer bottleneck)
            as_float = image.astype(numpy.float32)
            evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["scale_0"].data, as_float)
            if self.profile:
                self.events.append(("copy H->D", evt))
        elif (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB):
            source = image.data if on_device else image
            evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, source)
            if self.profile:
                self.events.append(("copy H->D", evt))
            to_float = self.kernels.get_kernel("rgb_to_float")
            evt = to_float(self.queue, self.procsize[0], self.wgsize[0],
                           self.cl_mem["raw"].data, self.cl_mem["scale_0"].data,
                           *self.scales[0])
            if self.profile:
                self.events.append(("RGB -> float", evt))
        elif self.dtype in self.converter:
            to_float = self.kernels.get_kernel(self.converter[self.dtype])
            evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, image)
            if self.profile:
                self.events.append(("copy H->D", evt))
            evt = to_float(self.queue, self.procsize[0], self.wgsize[0],
                           self.cl_mem["raw"].data, self.cl_mem["scale_0"].data,
                           *self.scales[0])
            if self.profile:
                self.events.append(("convert -> float", evt))
        else:
            raise RuntimeError("invalid input format error (%s)" % (str(self.dtype)))

        ####################################################################
        # Minimum / maximum of the image, then normalisation
        ####################################################################
        wg1 = self.kernels_wg["max_min_global_stage1"]
        wg2 = self.kernels_wg["max_min_global_stage2"]
        if min(wg1, wg2) >= self.red_size:
            # two-stage reduction
            stage1 = self.kernels.get_kernel("max_min_global_stage1")
            stage2 = self.kernels.get_kernel("max_min_global_stage2")
            # logger.debug("self.red_size: %s", self.red_size)
            shm = pyopencl.LocalMemory(self.red_size * 2 * 4)
            k1 = stage1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                        self.cl_mem["scale_0"].data,
                        self.cl_mem["max_min"].data,
                        numpy.int32(self.shape[0] * self.shape[1]),
                        shm)
            k2 = stage2(self.queue, (self.red_size,), (self.red_size,),
                        self.cl_mem["max_min"].data,
                        self.cl_mem["max"].data,
                        self.cl_mem["min"].data,
                        shm)
            if self.profile:
                self.events.append(("max_min_stage1", k1))
                self.events.append(("max_min_stage2", k2))
        else:
            # common bug on OSX when running on CPU
            logger.info("Unable to use MinMax Reduction: stage1 wg: %s; stage2 wg: %s < max_work_group_size: %s, expected: %s",
                        wg1, wg2, self.block_size, self.red_size)
            serial = self.kernels.get_kernel("max_min_vec16")
            k = serial(self.queue, (1,), (1,),
                       self.cl_mem["scale_0"].data,
                       numpy.int32(self.shape[0] * self.shape[1]),
                       self.cl_mem["max"].data,
                       self.cl_mem["min"].data)
            if self.profile:
                self.events.append(("max_min_serial", k))
            # python implementation:
            # buffer_ = self.cl_mem["scale_0"].get()
            # self.cl_mem["max"].set(numpy.array([buffer_.max()], dtype=numpy.float32))
            # self.cl_mem["min"].set(numpy.array([buffer_.min()], dtype=numpy.float32))

        evt = self.kernels.get_kernel("normalizes")(self.queue, self.procsize[0], self.wgsize[0],
                                                    self.cl_mem["scale_0"].data,
                                                    self.cl_mem["min"].data,
                                                    self.cl_mem["max"].data,
                                                    self.cl_mem["255"].data,
                                                    *self.scales[0])
        if self.profile:
            self.events.append(("normalize", evt))

        ####################################################################
        # Initial blur, then all octaves
        ####################################################################
        if par.DoubleImSize:
            curSigma = 1.0
        else:
            curSigma = 0.5
        if self._init_sigma > curSigma:
            logger.debug("Bluring image to achieve std: %f", self._init_sigma)
            sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2)
            self._gaussian_convolution(self.cl_mem["scale_0"], self.cl_mem["scale_0"], sigma, 0)

        for octave in range(self.octave_max):
            kp, descriptor = self._one_octave(octave)
            logger.info("in octave %i found %i kp" % (octave, kp.shape[0]))

            if len(kp):
                # sieve out coordinates with NaNs
                has_nan = numpy.isnan(kp.sum(axis=-1))
                valid = numpy.where(numpy.logical_not(has_nan))
                kp_chunks.append(kp[valid])
                desc_chunks.append(descriptor[valid])
                nb_valid += len(valid[0])

        ####################################################################
        # Merge keypoints in central memory
        ####################################################################
        output = numpy.recarray(shape=(nb_valid,), dtype=self.dtype_kp)
        offset = 0
        for coords, desc in zip(kp_chunks, desc_chunks):
            count = coords.shape[0]
            if count > 0:
                chunk = output[offset:offset + count]
                chunk.x = coords[:, 0]
                chunk.y = coords[:, 1]
                chunk.scale = coords[:, 2]
                chunk.angle = coords[:, 3]
                chunk.desc = desc
                offset += count
        logger.info("Execution time: %.3fms" % (1000 * (time.time() - t0)))
    return output