def _compact(self, start=numpy.int32(0)):
    """Compact the vector of keypoints starting from *start*.

    Keypoints before *start* are simply copied; the remainder of the
    device buffer ``Kp_1`` is compacted into ``Kp_2`` by an OpenCL kernel,
    the two buffers are swapped, and the (new) scratch buffer is reset
    to -1.

    :param start: start compacting at this address. Before, just copy
    :type start: numpy.int32
    :return: number of valid keypoints after compaction (host copy of the
        device-side counter)
    """
    wgsize = (self.max_workgroup_size,)  # (max(self.wgsize[0]),) #TODO: optimize
    # Read back the device-side keypoint counter.
    cp0_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
    kp_counter = self.cnt[0]
    procsize = calc_size((self.kpsize,), wgsize)
    if kp_counter > 0.9 * self.kpsize:
        # Lazy %-args: message only rendered if the record is emitted.
        logger.warning("Keypoint counter overflow risk: counted %s / %s", kp_counter, self.kpsize)
    logger.info("Compact %s -> %s / %s", start, kp_counter, self.kpsize)
    # Reset the device counter to `start` before launching the compaction kernel.
    self.cnt[0] = start
    cp1_evt = pyopencl.enqueue_copy(self.queue, self.buffers["cnt"].data, self.cnt)
    evt = self.programs["algebra"].compact(
        self.queue, procsize, wgsize,
        self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
        self.buffers["Kp_2"].data,  # __global keypoint* output,
        self.buffers["cnt"].data,   # __global int* counter,
        start,                      # int start,
        kp_counter)                 # int nbkeypoints
    cp2_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
    # swap keypoints: the compacted list becomes Kp_1
    self.buffers["Kp_1"], self.buffers["Kp_2"] = self.buffers["Kp_2"], self.buffers["Kp_1"]
    # memset buffer Kp_2 to -1 (4 float32 per keypoint record)
    # self.buffers["Kp_2"].fill(-1, self.queue)
    mem_evt = self.programs["memset"].memset_float(
        self.queue, calc_size((4 * self.kpsize,), wgsize), wgsize,
        self.buffers["Kp_2"].data, numpy.float32(-1), numpy.int32(4 * self.kpsize))
    if self.profile:
        self.events += [("copy cnt D->H", cp0_evt),
                        ("copy cnt H->D", cp1_evt),
                        ("compact", evt),
                        ("copy cnt D->H", cp2_evt),
                        ("memset 2", mem_evt)]
    return self.cnt[0]
def _compact(self, start=numpy.int32(0)):
    """Compact the vector of keypoints starting from *start*.

    Keypoints before *start* are simply copied; the remainder of the
    device buffer ``Kp_1`` is compacted into ``Kp_2`` by an OpenCL kernel,
    the two buffers are swapped, and the (new) scratch buffer is reset
    to -1.

    :param start: start compacting at this address. Before, just copy
    :type start: numpy.int32
    :return: number of valid keypoints after compaction (host copy of the
        device-side counter)
    """
    wgsize = self.max_workgroup_size,  # (max(self.wgsize[0]),) #TODO: optimize
    # kpsize32 = numpy.int32(self.kpsize)
    # Read back the device-side keypoint counter.
    cp0_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
    kp_counter = self.cnt[0]
    procsize = calc_size((self.kpsize, ), wgsize)
    if kp_counter > 0.9 * self.kpsize:
        logger.warning("Keypoint counter overflow risk: counted %s / %s" % (kp_counter, self.kpsize))
    logger.info("Compact %s -> %s / %s" % (start, kp_counter, self.kpsize))
    # Reset the device counter to `start` before launching the compaction kernel.
    self.cnt[0] = start
    cp1_evt = pyopencl.enqueue_copy(self.queue, self.buffers["cnt"].data, self.cnt)
    evt = self.programs["algebra"].compact(self.queue, procsize, wgsize,
                                           self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                                           self.buffers["Kp_2"].data,  # __global keypoint* output,
                                           self.buffers["cnt"].data,  # __global int* counter,
                                           start,  # int start,
                                           kp_counter)  # int nbkeypoints
    cp2_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
    # swap keypoints: the compacted list becomes Kp_1
    self.buffers["Kp_1"], self.buffers["Kp_2"] = self.buffers["Kp_2"], self.buffers["Kp_1"]
    # memset buffer Kp_2 to -1 (4 float32 per keypoint record)
    # self.buffers["Kp_2"].fill(-1, self.queue)
    mem_evt = self.programs["memset"].memset_float(self.queue, calc_size((4 * self.kpsize, ), wgsize), wgsize,
                                                   self.buffers["Kp_2"].data, numpy.float32(-1), numpy.int32(4 * self.kpsize))
    if self.profile:
        self.events += [("copy cnt D->H", cp0_evt),
                        ("copy cnt H->D", cp1_evt),
                        ("compact", evt),
                        ("copy cnt D->H", cp2_evt),
                        ("memset 2", mem_evt)]
    return self.cnt[0]
def align(self, img, shift_only=False, return_all=False, double_check=False, relative=False, orsa=False):
    """Align image on reference image.

    :param img: numpy array containing the image to align to reference
    :param shift_only: if True, only a translation (median of keypoint
        displacements) is applied, no affine transformation
    :param return_all: return in addition of the image, keypoints, matching
        keypoints, and transformations as a dict
    :param double_check: re-fit the transformation after sieving out
        statistical outliers (only when enough matches are available)
    :param relative: update reference keypoints with those from current
        image to perform relative alignment
    :param orsa: use ORSA filtering of matches when the `feature` module
        is available
    :return: aligned image, or a dict with all information if return_all,
        or None when no keypoint matches were found
    """
    logger.debug("ref_keypoints: %s" % self.ref_kp.size)
    if self.RGB:
        data = numpy.ascontiguousarray(img, numpy.uint8)
    else:
        data = numpy.ascontiguousarray(img, numpy.float32)
    with self.sem:
        cpy = pyopencl.enqueue_copy(self.queue, self.buffers["input"].data, data)
        if self.profile:
            self.events.append(("Copy H->D", cpy))
        cpy.wait()
        # Extract keypoints of the new image on the device
        kp = self.sift.keypoints(self.buffers["input"])
        # print("ref %s img %s" % (self.buffers["ref_kp_gpu"].shape, kp.shape))
        logger.debug("mod image keypoints: %s" % kp.size)
        # raw_matching holds index pairs (ref index, img index)
        raw_matching = self.match.match(self.buffers["ref_kp_gpu"], kp, raw_results=True)
        # print(raw_matching.max(axis=0))
        matching = numpy.recarray(shape=raw_matching.shape, dtype=MatchPlan.dtype_kp)
        len_match = raw_matching.shape[0]
        if len_match == 0:
            # Nothing to align against: returns None
            logger.warning("No matching keypoints")
            return
        matching[:, 0] = self.ref_kp[raw_matching[:, 0]]
        matching[:, 1] = kp[raw_matching[:, 1]]
        if orsa:
            if feature:
                matching = feature.sift_orsa(matching, self.shape, 1)
            else:
                logger.warning("feature is not available. No ORSA filtering")
        if (len_match < 3 * 6) or (shift_only):  # 3 points per DOF
            # Too few matches for a robust affine fit: fall back to a pure shift
            if shift_only:
                logger.debug("Shift Only mode: Common keypoints: %s" % len_match)
            else:
                logger.warning("Shift Only mode: Common keypoints: %s" % len_match)
            dx = matching[:, 1].x - matching[:, 0].x
            dy = matching[:, 1].y - matching[:, 0].y
            matrix = numpy.identity(2, dtype=numpy.float32)
            offset = numpy.array([+numpy.median(dy), +numpy.median(dx)], numpy.float32)
        else:
            logger.debug("Common keypoints: %s" % len_match)
            # Least-squares affine fit; transform_matrix layout is set by matching_correction
            transform_matrix = matching_correction(matching)
            offset = numpy.array([transform_matrix[5], transform_matrix[2]], dtype=numpy.float32)
            matrix = numpy.empty((2, 2), dtype=numpy.float32)
            matrix[0, 0], matrix[0, 1] = transform_matrix[4], transform_matrix[3]
            matrix[1, 0], matrix[1, 1] = transform_matrix[1], transform_matrix[0]
        if double_check and (len_match >= 3 * 6):  # and abs(matrix - numpy.identity(2)).max() > 0.1:
            # Sieve out matches that deviate by more than 4 sigma in
            # displacement, angle or (log-)scale, then re-fit.
            logger.warning("Validating keypoints, %s,%s" % (matrix, offset))
            dx = matching[:, 1].x - matching[:, 0].x
            dy = matching[:, 1].y - matching[:, 0].y
            dangle = matching[:, 1].angle - matching[:, 0].angle
            dscale = numpy.log(matching[:, 1].scale / matching[:, 0].scale)
            distance = numpy.sqrt(dx * dx + dy * dy)
            outlayer = numpy.zeros(distance.shape, numpy.int8)
            outlayer += abs((distance - distance.mean()) / distance.std()) > 4
            outlayer += abs((dangle - dangle.mean()) / dangle.std()) > 4
            outlayer += abs((dscale - dscale.mean()) / dscale.std()) > 4
            # print(outlayer)
            outlayersum = outlayer.sum()
            if outlayersum > 0 and not numpy.isinf(outlayersum):
                matching2 = matching[outlayer == 0]
                transform_matrix = matching_correction(matching2)
                offset = numpy.array([transform_matrix[5], transform_matrix[2]], dtype=numpy.float32)
                matrix = numpy.empty((2, 2), dtype=numpy.float32)
                matrix[0, 0], matrix[0, 1] = transform_matrix[4], transform_matrix[3]
                matrix[1, 0], matrix[1, 1] = transform_matrix[1], transform_matrix[0]
        if relative:  # update stable part to perform a relative alignment
            # Current image becomes the new reference
            self.ref_kp = kp
            if self.ROI is not None:
                # Keep only keypoints falling inside the region of interest
                kpx = numpy.round(self.ref_kp.x).astype(numpy.int32)
                kpy = numpy.round(self.ref_kp.y).astype(numpy.int32)
                masked = self.ROI[(kpy, kpx)].astype(bool)
                logger.warning("Reducing keypoint list from %i to %i because of the ROI" % (self.ref_kp.size, masked.sum()))
                self.ref_kp = self.ref_kp[masked]
            self.buffers["ref_kp_gpu"] = pyopencl.array.to_device(self.match.queue, self.ref_kp)
            # Accumulate the new 2x3 transform into the running 3x3 homogeneous one
            transfo = numpy.zeros((3, 3), dtype=numpy.float64)
            transfo[:2, :2] = matrix
            transfo[0, 2] = offset[0]
            transfo[1, 2] = offset[1]
            transfo[2, 2] = 1
            if self.relative_transfo is None:
                self.relative_transfo = transfo
            else:
                self.relative_transfo = numpy.dot(transfo, self.relative_transfo)
            matrix = numpy.ascontiguousarray(self.relative_transfo[:2, :2], dtype=numpy.float32)
            offset = numpy.ascontiguousarray(self.relative_transfo[:2, 2], dtype=numpy.float32)
        # print(self.relative_transfo)
        cpy1 = pyopencl.enqueue_copy(self.queue, self.buffers["matrix"].data, matrix)
        cpy2 = pyopencl.enqueue_copy(self.queue, self.buffers["offset"].data, offset)
        if self.profile:
            self.events += [("Copy matrix", cpy1), ("Copy offset", cpy2)]
        if self.RGB:
            shape = (4, self.shape[1], self.shape[0])
            transform = self.program.transform_RGB
        else:
            shape = self.shape[1], self.shape[0]
            transform = self.program.transform
        # print(kernel_workgroup_size(self.program, transform), self.wg, self.ctx.devices[0].max_work_item_sizes)
        # Apply the transformation on the device; out-of-image pixels are
        # filled with the minimum value of the SIFT input (device buffer "min").
        ev = transform(self.queue, calc_size(shape, self.wg), self.wg,
                       self.buffers["input"].data,
                       self.buffers["output"].data,
                       self.buffers["matrix"].data,
                       self.buffers["offset"].data,
                       numpy.int32(self.shape[1]),
                       numpy.int32(self.shape[0]),
                       numpy.int32(self.outshape[1]),
                       numpy.int32(self.outshape[0]),
                       self.sift.buffers["min"].get()[0],
                       numpy.int32(1))
        if self.profile:
            self.events += [("transform", ev)]
        result = self.buffers["output"].get()
        # print (self.buffers["offset"])
        if return_all:
            # corr = numpy.dot(matrix, numpy.vstack((matching[:, 1].y, matching[:, 1].x))).T - offset.T
            # - numpy.vstack((matching[:, 0].y, matching[:, 0].x)).T
            # Residual of the fitted transform on the matched keypoints
            corr = numpy.dot(matrix, numpy.vstack((matching[:, 0].y, matching[:, 0].x))).T + offset.T - numpy.vstack((matching[:, 1].y, matching[:, 1].x)).T
            rms = numpy.sqrt((corr * corr).sum(axis=-1).mean())
            # Todo: calculate the RMS of deplacement and return it:
            return {"result": result,
                    "keypoint": kp,
                    "matching": matching,
                    "offset": offset,
                    "matrix": matrix,
                    "rms": rms}
        return result
def _one_octave(self, octave):
    """Process all scales within one octave.

    Builds the Gaussian scale space and DoG images, detects local extrema,
    refines them, assigns orientations and computes descriptors, then
    shrinks the image to seed the next octave.

    :param octave: number of the octave
    :return: 2-tuple (keypoints as (n, 4) float32 array of x, y, scale,
        angle; descriptors as (n, 128) uint8 array)
    """
    prevSigma = self._init_sigma
    logger.info("Calculating octave %i" % octave)
    wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
    kpsize32 = numpy.int32(self.kpsize)
    self._reset_keypoints()
    octsize = numpy.int32(2 ** octave)
    last_start = numpy.int32(0)
    for scale in range(par.Scales + 2):
        sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
        logger.info("Octave %i scale %s blur with sigma %s" % (octave, scale, sigma))
        ########################################################################
        # Calculate gaussian blur and DoG
        ########################################################################
        self._gaussian_convolution(self.cl_mem["scale_%i" % scale], self.cl_mem["scale_%i" % (scale + 1)], sigma, octave)
        prevSigma *= self.sigmaRatio
        # DoG = scale_{i+1} - scale_i (combine with weights -1 / +1)
        evt = self.kernels.get_kernel("combine")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                 self.cl_mem["scale_%i" % (scale + 1)].data, numpy.float32(-1.0),
                                                 self.cl_mem["scale_%i" % (scale)].data, numpy.float32(+1.0),
                                                 self.cl_mem["DoGs"].data, numpy.int32(scale),
                                                 *self.scales[octave])
        if self.profile:
            self.events.append(("DoG %s %s" % (octave, scale), evt))
    for scale in range(1, par.Scales + 1):
        # Detect local extrema in the DoG stack for this scale
        evt = self.kernels.get_kernel("local_maxmin")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                      self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                      self.cl_mem["Kp_1"].data,  # __global keypoint* output,
                                                      numpy.int32(par.BorderDist),  # int border_dist,
                                                      numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                      octsize,  # int octsize,
                                                      numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                                                      numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                                                      self.cl_mem["cnt"].data,  # __global int* counter,
                                                      kpsize32,  # int nb_keypoints,
                                                      numpy.int32(scale),  # int scale,
                                                      *self.scales[octave])  # int width, int height)
        if self.profile:
            self.events.append(("local_maxmin %s %s" % (octave, scale), evt))
        procsize = calc_size((self.kpsize,), wgsize)
        # Read back how many candidate keypoints were found
        cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
        # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
        evt = self.kernels.get_kernel("interp_keypoint")(self.queue, procsize, wgsize,
                                                         self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                         self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                                         last_start,  # int start_keypoint,
                                                         self.cnt[0],  # int end_keypoint,
                                                         numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                         numpy.float32(self._init_sigma),  # float InitSigma,
                                                         *self.scales[octave])  # int width, int height)
        if self.profile:
            self.events += [("get cnt", cp_evt),
                            ("interp_keypoint %s %s" % (octave, scale), evt)]
        # Drop invalidated keypoints; newcnt is the new device counter value
        newcnt = self._compact(last_start)
        evt = self.kernels.get_kernel("compute_gradient_orientation")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                                      self.cl_mem["scale_%s" % (scale)].data,  # __global float* igray,
                                                                      self.cl_mem["tmp"].data,  # __global float *grad,
                                                                      self.cl_mem["ori"].data,  # __global float *ori,
                                                                      *self.scales[octave])  # int width,int height
        if self.profile:
            self.events.append(("compute_gradient_orientation %s %s" % (octave, scale), evt))
        # Orientation assignement: 1D kernel, rather heavy kernel
        if newcnt and newcnt > last_start:  # launch kernel only if needed
            if self.USE_CPU:
                orientation_name = "orientation_cpu"
                scales = self.scales[octave]
            else:
                # GPU variant needs extra local-memory arguments
                orientation_name = "orientation_gpu"
                scales = list(self.scales[octave]) + [pyopencl.LocalMemory(36 * 4),
                                                      pyopencl.LocalMemory(128 * 4),
                                                      pyopencl.LocalMemory(128 * 4)]
            orientation = self.kernels.get_kernel(orientation_name)
            wg = self.kernels_max_wg_size[orientation_name]
            wgsize2 = (wg,)
            procsize = (int(newcnt * wg),)
            evt = orientation(self.queue, procsize, wgsize2,
                              self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                              self.cl_mem["tmp"].data,  # __global float* grad,
                              self.cl_mem["ori"].data,  # __global float* ori,
                              self.cl_mem["cnt"].data,  # __global int* counter,
                              octsize,  # int octsize,
                              numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                              kpsize32,  # int max of nb_keypoints,
                              numpy.int32(last_start),  # int keypoints_start,
                              newcnt,  # int keypoints_end,
                              *scales)  # int grad_width, int grad_height)
            # newcnt = self.cl_mem["cnt"].get()[0] #do not forget to update numbers of keypoints, modified above !
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !
            # Descriptor computation: try the fastest kernel first; on failure
            # increase self.LOW_END and retry with a more conservative one.
            for _ in range(3):  # up to 3 attempts
                if self.USE_CPU or (self.LOW_END > 1):
                    logger.info("Computing descriptors with CPU optimized kernels")
                    descriptor_name = "descriptor_cpu"
                    wg = self.kernels_max_wg_size[descriptor_name][0]
                    wgsize2 = (wg,)
                    procsize2 = (int(newcnt * wg),)
                else:
                    if self.LOW_END:
                        logger.info("Computing descriptors with older-GPU optimized kernels")
                        descriptor_name = "descriptor_gpu1"
                        wgsize2 = self.kernels_max_wg_size[descriptor_name]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                        # if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2):
                        #     # will fail anyway:
                        #     self.LOW_END += 1
                        #     continue
                    else:
                        logger.info("Computing descriptors with newer-GPU optimized kernels")
                        descriptor_name = "descriptor_gpu2"
                        wgsize2 = self.kernels_max_wg_size[descriptor_name]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                        # if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2):
                        #     # will fail anyway:
                        #     self.LOW_END += 1
                        #     continue
                try:
                    descriptor = self.kernels.get_kernel(descriptor_name)
                    evt2 = descriptor(self.queue, procsize2, wgsize2,
                                      self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                      self.cl_mem["descriptors"].data,  # ___global unsigned char *descriptors
                                      self.cl_mem["tmp"].data,  # __global float* grad,
                                      self.cl_mem["ori"].data,  # __global float* ori,
                                      octsize,  # int octsize,
                                      numpy.int32(last_start),  # int keypoints_start,
                                      self.cl_mem["cnt"].data,  # int* keypoints_end,
                                      *self.scales[octave])  # int grad_width, int grad_height)
                    evt2.wait()
                # NOTE(review): pyopencl._cl is a private module — confirm a
                # public alias exists before relying on it.
                except (pyopencl.RuntimeError, pyopencl._cl.LogicError) as error:
                    self.LOW_END += 1
                    logger.error("Descriptor failed with %s. Switching to lower_end mode" % error)
                    continue
                else:
                    break
            if self.profile:
                self.events += [("%s %s %s" % (orientation_name, octave, scale), evt),
                                ("copy cnt D->H", evt_cp),
                                ("%s %s %s" % (descriptor_name, octave, scale), evt2)]
        evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
        last_start = self.cnt[0]
        if self.profile:
            self.events.append(("copy cnt D->H", evt_cp))
    ########################################################################
    # Rescale all images to populate all octaves
    ########################################################################
    if octave < self.octave_max - 1:
        evt = self.kernels.get_kernel("shrink")(self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
                                                self.cl_mem["scale_%i" % (par.Scales)].data,
                                                self.cl_mem["scale_0"].data,
                                                numpy.int32(2), numpy.int32(2),
                                                self.scales[octave][0], self.scales[octave][1],
                                                *self.scales[octave + 1])
        if self.profile:
            self.events.append(("shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt))
    results = numpy.empty((last_start, 4), dtype=numpy.float32)
    descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
    if last_start:
        evt = pyopencl.enqueue_copy(self.queue, results, self.cl_mem["Kp_1"].data)
        evt2 = pyopencl.enqueue_copy(self.queue, descriptors, self.cl_mem["descriptors"].data)
        if self.profile:
            self.events += [("copy D->H", evt), ("copy D->H", evt2)]
    return results, descriptors
def keypoints(self, image, mask=None):
    """Calculates the keypoints of the image

    TODO: use a temporary list with events and use a single test at the end

    :param image: ndimage of 2D (or 3D if RGB)
    :param mask: TODO: implement a mask for sieving out the keypoints
    :return: vector of keypoint (1D numpy array)
    """
    # self.reset_timer()
    with self.sem:
        total_size = 0
        keypoints = []
        descriptors = []
        assert image.shape[:2] == self.shape
        assert image.dtype in [self.dtype, numpy.float32]
        # old versions of pyopencl do not check for data contiguity
        if not(isinstance(image, pyopencl.array.Array)) and not(image.flags["C_CONTIGUOUS"]):
            image = numpy.ascontiguousarray(image)
        t0 = time.time()
        # Upload and, if needed, convert the input to float32 in "scale_0"
        if image.dtype == numpy.float32:
            if isinstance(image, pyopencl.array.Array):
                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["scale_0"].data, image.data)
            else:
                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["scale_0"].data, image)
            if self.profile:
                self.events.append(("copy H->D", evt))
        elif self.dtype == numpy.float64:
            # A preprocessing kernel double_to_float exists, but is commented (RUNS ONLY ON GPU WITH FP64)
            # TODO: benchmark this kernel vs the current pure CPU format conversion with numpy.float32
            # and uncomment it if it proves faster (dubious, because of data transfer bottleneck)
            evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["scale_0"].data, image.astype(numpy.float32))
            if self.profile:
                self.events.append(("copy H->D", evt))
        elif (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB):
            if isinstance(image, pyopencl.array.Array):
                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, image.data)
            else:
                evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, image)
            if self.profile:
                self.events.append(("copy H->D", evt))
            evt = self.kernels.get_kernel("rgb_to_float")(self.queue, self.procsize[0], self.wgsize[0],
                                                          self.cl_mem["raw"].data, self.cl_mem["scale_0"].data, *self.scales[0])
            if self.profile:
                self.events.append(("RGB -> float", evt))
        elif self.dtype in self.converter:
            # Integer input: upload raw then convert with the matching kernel
            program = self.kernels.get_kernel(self.converter[self.dtype])
            evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, image)
            if self.profile:
                self.events.append(("copy H->D", evt))
            evt = program(self.queue, self.procsize[0], self.wgsize[0],
                          self.cl_mem["raw"].data, self.cl_mem["scale_0"].data, *self.scales[0])
            if self.profile:
                self.events.append(("convert -> float", evt))
        else:
            raise RuntimeError("invalid input format error (%s)" % (str(self.dtype)))
        # Min/max reduction to normalize the image to [0, 255]
        wg1 = self.kernels_wg["max_min_global_stage1"]
        wg2 = self.kernels_wg["max_min_global_stage2"]
        if min(wg1, wg2) < self.red_size:
            # common bug on OSX when running on CPU
            logger.info("Unable to use MinMax Reduction: stage1 wg: %s; stage2 wg: %s < max_work_group_size: %s, expected: %s",
                        wg1, wg2, self.block_size, self.red_size)
            # fall back to a serial (single work-item) min/max kernel
            kernel = self.kernels.get_kernel("max_min_vec16")
            k = kernel(self.queue, (1,), (1,),
                       self.cl_mem["scale_0"].data,
                       numpy.int32(self.shape[0] * self.shape[1]),
                       self.cl_mem["max"].data,
                       self.cl_mem["min"].data)
            if self.profile:
                self.events.append(("max_min_serial", k))
            # python implementation:
            # buffer_ = self.cl_mem["scale_0"].get()
            # self.cl_mem["max"].set(numpy.array([buffer_.max()], dtype=numpy.float32))
            # self.cl_mem["min"].set(numpy.array([buffer_.min()], dtype=numpy.float32))
        else:
            kernel1 = self.kernels.get_kernel("max_min_global_stage1")
            kernel2 = self.kernels.get_kernel("max_min_global_stage2")
            # logger.debug("self.red_size: %s", self.red_size)
            # shared memory: red_size work-items x (min, max) x sizeof(float32)
            shm = pyopencl.LocalMemory(self.red_size * 2 * 4)
            k1 = kernel1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                         self.cl_mem["scale_0"].data,
                         self.cl_mem["max_min"].data,
                         numpy.int32(self.shape[0] * self.shape[1]),
                         shm)
            k2 = kernel2(self.queue, (self.red_size,), (self.red_size,),
                         self.cl_mem["max_min"].data,
                         self.cl_mem["max"].data,
                         self.cl_mem["min"].data,
                         shm)
            if self.profile:
                self.events.append(("max_min_stage1", k1))
                self.events.append(("max_min_stage2", k2))
        evt = self.kernels.get_kernel("normalizes")(self.queue, self.procsize[0], self.wgsize[0],
                                                    self.cl_mem["scale_0"].data,
                                                    self.cl_mem["min"].data,
                                                    self.cl_mem["max"].data,
                                                    self.cl_mem["255"].data,
                                                    *self.scales[0])
        if self.profile:
            self.events.append(("normalize", evt))
        # Pre-blur so the first scale has std-dev init_sigma
        curSigma = 1.0 if par.DoubleImSize else 0.5
        octave = 0
        if self._init_sigma > curSigma:
            logger.debug("Bluring image to achieve std: %f", self._init_sigma)
            sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2)
            self._gaussian_convolution(self.cl_mem["scale_0"], self.cl_mem["scale_0"], sigma, 0)
        for octave in range(self.octave_max):
            kp, descriptor = self._one_octave(octave)
            logger.info("in octave %i found %i kp" % (octave, kp.shape[0]))
            if len(kp):
                # sieve out coordinates with NaNs
                mask = numpy.where(numpy.logical_not(numpy.isnan(kp.sum(axis=-1))))
                keypoints.append(kp[mask])
                descriptors.append(descriptor[mask])
                total_size += len(mask[0])
        ########################################################################
        # Merge keypoints in central memory
        ########################################################################
        output = numpy.recarray(shape=(total_size,), dtype=self.dtype_kp)
        last = 0
        for ds, desc in zip(keypoints, descriptors):
            l = ds.shape[0]
            if l > 0:
                output[last:last + l].x = ds[:, 0]
                output[last:last + l].y = ds[:, 1]
                output[last:last + l].scale = ds[:, 2]
                output[last:last + l].angle = ds[:, 3]
                output[last:last + l].desc = desc
                last += l
    logger.info("Execution time: %.3fms" % (1000 * (time.time() - t0)))
    return output
def _one_octave(self, octave):
    """Process all scales within one octave.

    Builds the Gaussian scale space and DoG images, detects local extrema,
    refines them, assigns orientations and computes descriptors, then
    shrinks the image to seed the next octave.

    :param octave: number of the octave
    :return: 2-tuple (keypoints as (n, 4) float32 array of x, y, scale,
        angle; descriptors as (n, 128) uint8 array)
    """
    prevSigma = self._init_sigma
    logger.info("Calculating octave %i" % octave)
    wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
    kpsize32 = numpy.int32(self.kpsize)
    self._reset_keypoints()
    octsize = numpy.int32(2 ** octave)
    last_start = numpy.int32(0)
    for scale in range(par.Scales + 2):
        sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
        logger.info("Octave %i scale %s blur with sigma %s" % (octave, scale, sigma))
        ########################################################################
        # Calculate gaussian blur and DoG
        ########################################################################
        self._gaussian_convolution(self.buffers[scale], self.buffers[scale + 1], sigma, octave)
        prevSigma *= self.sigmaRatio
        # DoG = scale_{i+1} - scale_i (combine with weights -1 / +1)
        evt = self.programs["algebra"].combine(self.queue, self.procsize[octave], self.wgsize[octave],
                                               self.buffers[scale + 1].data, numpy.float32(-1.0),
                                               self.buffers[scale].data, numpy.float32(+1.0),
                                               self.buffers["DoGs"].data, numpy.int32(scale),
                                               *self.scales[octave])
        if self.profile:
            self.events.append(("DoG %s %s" % (octave, scale), evt))
    for scale in range(1, par.Scales + 1):
        # Detect local extrema in the DoG stack for this scale
        evt = self.programs["image"].local_maxmin(self.queue, self.procsize[octave], self.wgsize[octave],
                                                  self.buffers["DoGs"].data,  # __global float* DOGS,
                                                  self.buffers["Kp_1"].data,  # __global keypoint* output,
                                                  numpy.int32(par.BorderDist),  # int border_dist,
                                                  numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                  octsize,  # int octsize,
                                                  numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                                                  numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                                                  self.buffers["cnt"].data,  # __global int* counter,
                                                  kpsize32,  # int nb_keypoints,
                                                  numpy.int32(scale),  # int scale,
                                                  *self.scales[octave])  # int width, int height)
        if self.profile:
            self.events.append(("local_maxmin %s %s" % (octave, scale), evt))
        procsize = calc_size((self.kpsize,), wgsize)
        # Read back how many candidate keypoints were found
        cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
        # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
        evt = self.programs["image"].interp_keypoint(self.queue, procsize, wgsize,
                                                     self.buffers["DoGs"].data,  # __global float* DOGS,
                                                     self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                                                     last_start,  # int start_keypoint,
                                                     self.cnt[0],  # int end_keypoint,
                                                     numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                     numpy.float32(self._init_sigma),  # float InitSigma,
                                                     *self.scales[octave])  # int width, int height)
        if self.profile:
            self.events += [("get cnt", cp_evt),
                            ("interp_keypoint %s %s" % (octave, scale), evt)]
        # Drop invalidated keypoints; newcnt is the new device counter value
        newcnt = self._compact(last_start)
        evt = self.programs["image"].compute_gradient_orientation(self.queue, self.procsize[octave], self.wgsize[octave],
                                                                  self.buffers[scale].data,  # __global float* igray,
                                                                  self.buffers["tmp"].data,  # __global float *grad,
                                                                  self.buffers["ori"].data,  # __global float *ori,
                                                                  *self.scales[octave])  # int width,int height
        if self.profile:
            self.events.append(("compute_gradient_orientation %s %s" % (octave, scale), evt))
        # Orientation assignement: 1D kernel, rather heavy kernel
        if newcnt and newcnt > last_start:  # launch kernel only if needed
            if self.USE_CPU:
                file_to_use = "orientation_cpu"
                # logger.info("Computing orientation with CPU-optimized kernels")
            else:
                file_to_use = "orientation_gpu"
            wgsize2 = (self.kernels[file_to_use],)
            procsize = (int(newcnt * wgsize2[0]),)
            evt = self.programs[file_to_use].orientation_assignment(self.queue, procsize, wgsize2,
                                                                    self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                                                                    self.buffers["tmp"].data,  # __global float* grad,
                                                                    self.buffers["ori"].data,  # __global float* ori,
                                                                    self.buffers["cnt"].data,  # __global int* counter,
                                                                    octsize,  # int octsize,
                                                                    numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                                                                    kpsize32,  # int max of nb_keypoints,
                                                                    numpy.int32(last_start),  # int keypoints_start,
                                                                    newcnt,  # int keypoints_end,
                                                                    *self.scales[octave])  # int grad_width, int grad_height)
            # newcnt = self.buffers["cnt"].get()[0] #do not forget to update numbers of keypoints, modified above !
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
            newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !
            # Descriptor computation: try the fastest kernel first; on failure
            # increase self.LOW_END and retry with a more conservative one.
            for i_not_used in range(3):  # up to 3 attempts
                if (not self.USE_CPU) and (self.LOW_END == 0) and ("keypoints_gpu2" in self.kernels):
                    file_to_use = "keypoints_gpu2"
                    logger.info("Computing descriptors with newer-GPU optimized kernels")
                    wgsize2 = self.kernels[file_to_use]
                    procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                elif (not self.USE_CPU) and (self.LOW_END == 1) and ("keypoints_gpu1" in self.kernels):
                    file_to_use = "keypoints_gpu1"
                    logger.info("Computing descriptors with older-GPU optimized kernels")
                    wgsize2 = self.kernels[file_to_use]
                    procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                else:  # self.USE_CPU or self.LOW_END == 2, fail-safe fall-back
                    file_to_use = "keypoints_cpu"
                    logger.info("Computing descriptors with CPU optimized kernels")
                    wgsize2 = (self.kernels[file_to_use],)
                    procsize2 = (int(newcnt * wgsize2[0]),)
                try:
                    evt2 = self.programs[file_to_use].descriptor(self.queue, procsize2, wgsize2,
                                                                 self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                                                                 self.buffers["descriptors"].data,  # ___global unsigned char *descriptors
                                                                 self.buffers["tmp"].data,  # __global float* grad,
                                                                 self.buffers["ori"].data,  # __global float* ori,
                                                                 octsize,  # int octsize,
                                                                 numpy.int32(last_start),  # int keypoints_start,
                                                                 self.buffers["cnt"].data,  # int* keypoints_end,
                                                                 *self.scales[octave])  # int grad_width, int grad_height)
                except pyopencl.RuntimeError as error:
                    self.LOW_END += 1
                    logger.error("Descriptor failed with %s. Switching to lower_end mode" % error)
                    continue
                else:
                    break
            if self.profile:
                self.events += [("orientation_assignment %s %s" % (octave, scale), evt),
                                ("copy cnt D->H", evt_cp),
                                ("descriptors %s %s" % (octave, scale), evt2)]
        evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
        last_start = self.cnt[0]
        if self.profile:
            self.events.append(("copy cnt D->H", evt_cp))
    ########################################################################
    # Rescale all images to populate all octaves
    ########################################################################
    if octave < self.octave_max - 1:
        evt = self.programs["preprocess"].shrink(self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
                                                 self.buffers[par.Scales].data,
                                                 self.buffers[0].data,
                                                 numpy.int32(2), numpy.int32(2),
                                                 self.scales[octave][0], self.scales[octave][1],
                                                 *self.scales[octave + 1])
        if self.profile:
            self.events.append(("shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt))
    results = numpy.empty((last_start, 4), dtype=numpy.float32)
    descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
    if last_start:
        evt = pyopencl.enqueue_copy(self.queue, results, self.buffers["Kp_1"].data)
        evt2 = pyopencl.enqueue_copy(self.queue, descriptors, self.buffers["descriptors"].data)
        if self.profile:
            self.events += [("copy D->H", evt), ("copy D->H", evt2)]
    return results, descriptors
def keypoints(self, image): """Calculates the keypoints of the image :param image: ndimage of 2D (or 3D if RGB) :return: vector of keypoint (1D numpy array) """ self.reset_timer() with self._sem: total_size = 0 keypoints = [] descriptors = [] assert image.shape[:2] == self.shape assert image.dtype in [self.dtype, numpy.float32] # old versions of pyopencl do not check for data contiguity if not (isinstance(image, pyopencl.array.Array)) and not (image.flags["C_CONTIGUOUS"]): image = numpy.ascontiguousarray(image) t0 = time.time() if image.dtype == numpy.float32: if isinstance(image, pyopencl.array.Array): evt = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, image.data) else: evt = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, image) if self.profile: self.events.append(("copy H->D", evt)) elif self.dtype == numpy.float64: # A preprocessing kernel double_to_float exists, but is commented (RUNS ONLY ON GPU WITH FP64) # TODO: benchmark this kernel vs the current pure CPU format conversion with numpy.float32 # and uncomment it if it proves faster (dubious, because of data transfer bottleneck) evt = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, image.astype(numpy.float32)) if self.profile: self.events.append(("copy H->D", evt)) elif (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB): if isinstance(image, pyopencl.array.Array): evt = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image.data) else: evt = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image) if self.profile: self.events.append(("copy H->D", evt)) evt = self.programs["preprocess"].rgb_to_float( self.queue, self.procsize[0], self.wgsize[0], self.buffers["raw"].data, self.buffers[0].data, *self.scales[0] ) if self.profile: self.events.append(("RGB -> float", evt)) elif self.dtype in self.converter: program = self.programs["preprocess"].__getattr__(self.converter[self.dtype]) evt = pyopencl.enqueue_copy(self.queue, 
self.buffers["raw"].data, image) if self.profile: self.events.append(("copy H->D", evt)) evt = program( self.queue, self.procsize[0], self.wgsize[0], self.buffers["raw"].data, self.buffers[0].data, *self.scales[0] ) if self.profile: self.events.append(("convert -> float", evt)) else: raise RuntimeError("invalid input format error (%s)" % (str(self.dtype))) wg1 = self.kernels["reductions.max_min_global_stage1"] wg2 = self.kernels["reductions.max_min_global_stage2"] if min(wg1, wg2) < self.red_size: # common bug on OSX when running on CPU logger.info( "Unable to use MinMax Reduction: stage1 wg: %s; stage2 wg: %s < max_work_group_size: %s, expected: %s", wg1, wg2, self.max_workgroup_size, self.red_size, ) kernel = self.programs["reductions"].max_min_serial k = kernel( self.queue, (1,), (1,), self.buffers[0].data, numpy.uint32(self.shape[0] * self.shape[1]), self.buffers["max"].data, self.buffers["min"].data, ) if self.profile: self.events.append(("max_min_serial", k)) # python implementation: # buffer_ = self.buffers[0].get() # self.buffers["max"].set(numpy.array([buffer_.max()], dtype=numpy.float32)) # self.buffers["min"].set(numpy.array([buffer_.min()], dtype=numpy.float32)) else: kernel1 = self.programs["reductions"].max_min_global_stage1 kernel2 = self.programs["reductions"].max_min_global_stage2 # logger.debug("self.red_size: %s", self.red_size) k1 = kernel1( self.queue, (self.red_size * self.red_size,), (self.red_size,), self.buffers[0].data, self.buffers["max_min"].data, numpy.uint32(self.shape[0] * self.shape[1]), ) k2 = kernel2( self.queue, (self.red_size,), (self.red_size,), self.buffers["max_min"].data, self.buffers["max"].data, self.buffers["min"].data, ) if self.profile: self.events.append(("max_min_stage1", k1)) self.events.append(("max_min_stage2", k2)) evt = self.programs["preprocess"].normalizes( self.queue, self.procsize[0], self.wgsize[0], self.buffers[0].data, self.buffers["min"].data, self.buffers["max"].data, self.buffers["255"].data, 
*self.scales[0] ) if self.profile: self.events.append(("normalize", evt)) curSigma = 1.0 if par.DoubleImSize else 0.5 octave = 0 if self._init_sigma > curSigma: logger.debug("Bluring image to achieve std: %f", self._init_sigma) sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2) self._gaussian_convolution(self.buffers[0], self.buffers[0], sigma, 0) for octave in range(self.octave_max): kp, descriptor = self._one_octave(octave) logger.info("in octave %i found %i kp" % (octave, kp.shape[0])) if len(kp): # sieve out coordinates with NaNs mask = numpy.where(numpy.logical_not(numpy.isnan(kp.sum(axis=-1)))) keypoints.append(kp[mask]) descriptors.append(descriptor[mask]) total_size += len(mask[0]) ######################################################################## # Merge keypoints in central memory ######################################################################## output = numpy.recarray(shape=(total_size,), dtype=self.dtype_kp) last = 0 for ds, desc in zip(keypoints, descriptors): l = ds.shape[0] if l > 0: output[last : last + l].x = ds[:, 0] output[last : last + l].y = ds[:, 1] output[last : last + l].scale = ds[:, 2] output[last : last + l].angle = ds[:, 3] output[last : last + l].desc = desc last += l logger.info("Execution time: %.3fms" % (1000 * (time.time() - t0))) return output
def match(self, nkp1, nkp2, raw_results=False):
    """Calculate the matching between two lists of keypoints.

    :param nkp1: numpy 1D recarray of keypoints or equivalent GPU buffer
    :param nkp2: numpy 1D recarray of keypoints or equivalent GPU buffer
    :param raw_results: if true return the 2D array of indexes of matching
        keypoints (not the actual keypoints)
    :return: recarray of shape (n, 2) with the matched keypoint pairs, or the
        raw (n, 2) int32 index array when raw_results is true

    TODO: implement the ROI ...
    """
    # Nota: nkp1.ndim is not valid for gpu_arrays, hence len(shape)
    assert len(nkp1.shape) == 1
    assert len(nkp2.shape) == 1
    valid_types = (numpy.ndarray, numpy.core.records.recarray, pyopencl.array.Array)
    assert isinstance(nkp1, valid_types)
    assert isinstance(nkp2, valid_types)
    result = None
    with self._sem:
        # Bring keypoint set 1 onto the device (unless it already is a GPU
        # array), growing the device-side buffer when the input outgrows it.
        if isinstance(nkp1, pyopencl.array.Array):
            kpt1_gpu = nkp1
        else:
            if nkp1.size > self.buffers["Kp_1"].size:
                logger.warning("increasing size of keypoint vector 1 to %i" % nkp1.size)
                self.buffers["Kp_1"] = pyopencl.array.empty(self.queue, (nkp1.size,), dtype=self.dtype_kp)
            kpt1_gpu = self.buffers["Kp_1"]
            self._reset_buffer1()
            evt1 = pyopencl.enqueue_copy(self.queue, kpt1_gpu.data, nkp1)
            if self.profile:
                self.events.append(("copy H->D KP_1", evt1))
        # Same for keypoint set 2.
        if isinstance(nkp2, pyopencl.array.Array):
            kpt2_gpu = nkp2
        else:
            if nkp2.size > self.buffers["Kp_2"].size:
                logger.warning("increasing size of keypoint vector 2 to %i" % nkp2.size)
                self.buffers["Kp_2"] = pyopencl.array.empty(self.queue, (nkp2.size,), dtype=self.dtype_kp)
            kpt2_gpu = self.buffers["Kp_2"]
            self._reset_buffer2()
            evt2 = pyopencl.enqueue_copy(self.queue, kpt2_gpu.data, nkp2)
            if self.profile:
                self.events.append(("copy H->D KP_2", evt2))
        # Grow the match-index buffer when both inputs exceed its capacity
        # (there can be at most min(len1, len2) matches).
        if min(kpt1_gpu.size, kpt2_gpu.size) > self.buffers["match"].shape[0]:
            self.kpsize = min(kpt1_gpu.size, kpt2_gpu.size)
            self.buffers["match"] = pyopencl.array.empty(self.queue, (self.kpsize, 2), dtype=numpy.int32)
        self._reset_output()
        wg = self.kernels[self.matching_kernel + ".matching"]
        size = calc_size((nkp1.size,), (wg,))
        evt = self.programs[self.matching_kernel].matching(self.queue, size, (wg,),
                                                           kpt1_gpu.data,
                                                           kpt2_gpu.data,
                                                           self.buffers["match"].data,
                                                           self.buffers["cnt"].data,
                                                           numpy.int32(self.kpsize),
                                                           numpy.float32(par.MatchRatio * par.MatchRatio),
                                                           numpy.int32(nkp1.size),
                                                           numpy.int32(nkp2.size))
        if self.profile:
            self.events.append(("matching", evt))
        # "cnt" holds the number of matches written by the kernel.
        size = self.buffers["cnt"].get()[0]
        match = numpy.empty(shape=(size, 2), dtype=numpy.int32)
        if size > 0:
            cpyD2H = pyopencl.enqueue_copy(self.queue, match, self.buffers["match"].data)
            if self.profile:
                self.events.append(("copy D->H match", cpyD2H))
        if raw_results:
            result = match
        else:
            # Gather the actual keypoint records on the host from the index
            # pairs. NOTE(review): this fancy indexing presumably expects nkp1
            # and nkp2 to be host-side recarrays — confirm for GPU-array input.
            result = numpy.recarray(shape=(size, 2), dtype=self.dtype_kp)
            result[:, 0] = nkp1[match[:size, 0]]
            result[:, 1] = nkp2[match[:size, 1]]
    return result
def keypoints(self, image):
    """Calculates the keypoints of the image.

    Uploads the image to the device, converts/normalizes it to float, then
    runs the SIFT pyramid octave by octave and merges the resulting
    keypoints into a single host-side recarray.

    :param image: ndimage of 2D (or 3D if RGB)
    :return: vector of keypoint (1D numpy recarray of dtype self.dtype_kp)
    :raises RuntimeError: if the input dtype/shape combination is unsupported
    """
    self.reset_timer()
    with self._sem:
        total_size = 0
        keypoints = []
        descriptors = []
        assert image.shape[:2] == self.shape
        assert image.dtype in [self.dtype, numpy.float32]
        t0 = time.time()

        # --- Upload + conversion to a float32 working image -------------
        if image.dtype == numpy.float32:
            # Already float: copy straight into the working buffer.
            if isinstance(image, pyopencl.array.Array):
                evt = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, image.data)
            else:
                evt = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, image)
            if self.profile:
                self.events.append(("copy H->D", evt))
        elif (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB):
            # RGB uint8: copy raw then convert to grayscale float on device.
            if isinstance(image, pyopencl.array.Array):
                evt = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image.data)
            else:
                evt = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image)
            if self.profile:
                self.events.append(("copy H->D", evt))
            evt = self.programs["preprocess"].rgb_to_float(self.queue, self.procsize[0], self.wgsize[0],
                                                           self.buffers["raw"].data, self.buffers[0].data,
                                                           *self.scales[0])
            if self.profile:
                self.events.append(("RGB -> float", evt))
        elif self.dtype in self.converter:
            # Other integer dtypes: use the registered conversion kernel.
            program = self.programs["preprocess"].__getattr__(self.converter[self.dtype])
            evt = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image)
            if self.profile:
                self.events.append(("copy H->D", evt))
            evt = program(self.queue, self.procsize[0], self.wgsize[0],
                          self.buffers["raw"].data, self.buffers[0].data, *self.scales[0])
            if self.profile:
                self.events.append(("convert -> float", evt))
        else:
            # Report the offending dtype, consistent with the sibling implementation.
            raise RuntimeError("invalid input format error (%s)" % (str(self.dtype)))

        # --- Min/max reduction + normalization to [0, 255] --------------
        k1 = self.programs["reductions"].max_min_global_stage1(self.queue,
                                                               (self.red_size * self.red_size,), (self.red_size,),
                                                               self.buffers[0].data,
                                                               self.buffers["max_min"].data,
                                                               numpy.uint32(self.shape[0] * self.shape[1]))
        k2 = self.programs["reductions"].max_min_global_stage2(self.queue, (self.red_size,), (self.red_size,),
                                                               self.buffers["max_min"].data,
                                                               self.buffers["max"].data,
                                                               self.buffers["min"].data)
        if self.profile:
            self.events.append(("max_min_stage1", k1))
            self.events.append(("max_min_stage2", k2))
        evt = self.programs["preprocess"].normalizes(self.queue, self.procsize[0], self.wgsize[0],
                                                     self.buffers[0].data,
                                                     self.buffers["min"].data,
                                                     self.buffers["max"].data,
                                                     self.buffers["255"].data,
                                                     *self.scales[0])
        if self.profile:
            self.events.append(("normalize", evt))

        # --- Initial blur so the pyramid starts at _init_sigma ----------
        curSigma = 1.0 if par.DoubleImSize else 0.5
        octave = 0
        if self._init_sigma > curSigma:
            logger.debug("Bluring image to achieve std: %f", self._init_sigma)
            sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2)
            self._gaussian_convolution(self.buffers[0], self.buffers[0], sigma, 0)

        # --- Process every octave and gather results --------------------
        for octave in range(self.octave_max):
            kp, descriptor = self._one_octave(octave)
            logger.info("in octave %i found %i kp" % (octave, kp.shape[0]))
            if kp.shape[0] > 0:
                keypoints.append(kp)
                descriptors.append(descriptor)
                total_size += kp.shape[0]

        ########################################################################
        # Merge keypoints in central memory
        ########################################################################
        output = numpy.recarray(shape=(total_size,), dtype=self.dtype_kp)
        last = 0
        for ds, desc in zip(keypoints, descriptors):
            nb_kp = ds.shape[0]
            if nb_kp > 0:
                output[last:last + nb_kp].x = ds[:, 0]
                output[last:last + nb_kp].y = ds[:, 1]
                output[last:last + nb_kp].scale = ds[:, 2]
                output[last:last + nb_kp].angle = ds[:, 3]
                output[last:last + nb_kp].desc = desc
                last += nb_kp
    logger.info("Execution time: %.3fms" % (1000 * (time.time() - t0)))
    return output
def match(self, nkp1, nkp2, raw_results=False):
    """Calculate the matching between two lists of keypoints.

    :param nkp1: numpy 1D recarray of keypoints or equivalent GPU buffer
    :param nkp2: numpy 1D recarray of keypoints or equivalent GPU buffer
    :param raw_results: if true return the 2D array of indexes of matching
        keypoints (not the actual keypoints)
    :return: recarray of shape (n, 2) with the matched keypoint pairs, or the
        raw (n, 2) int32 index array when raw_results is true

    TODO: implement the ROI ...
    """
    # Nota: nkp1.ndim is not valid for gpu_arrays, hence len(shape)
    assert len(nkp1.shape) == 1
    assert len(nkp2.shape) == 1
    valid_types = (numpy.ndarray, numpy.core.records.recarray,
                   pyopencl.array.Array)
    assert isinstance(nkp1, valid_types)
    assert isinstance(nkp2, valid_types)
    result = None
    with self._sem:
        # Bring keypoint set 1 onto the device (unless it already is a GPU
        # array), growing the device-side buffer when the input outgrows it.
        if isinstance(nkp1, pyopencl.array.Array):
            kpt1_gpu = nkp1
        else:
            if nkp1.size > self.buffers["Kp_1"].size:
                logger.warning(
                    "increasing size of keypoint vector 1 to %i" % nkp1.size)
                self.buffers["Kp_1"] = pyopencl.array.empty(
                    self.queue, (nkp1.size,), dtype=self.dtype_kp)
            kpt1_gpu = self.buffers["Kp_1"]
            self._reset_buffer1()
            evt1 = pyopencl.enqueue_copy(self.queue, kpt1_gpu.data, nkp1)
            if self.profile:
                self.events.append(("copy H->D KP_1", evt1))
        # Same for keypoint set 2.
        if isinstance(nkp2, pyopencl.array.Array):
            kpt2_gpu = nkp2
        else:
            if nkp2.size > self.buffers["Kp_2"].size:
                logger.warning(
                    "increasing size of keypoint vector 2 to %i" % nkp2.size)
                self.buffers["Kp_2"] = pyopencl.array.empty(
                    self.queue, (nkp2.size,), dtype=self.dtype_kp)
            kpt2_gpu = self.buffers["Kp_2"]
            self._reset_buffer2()
            evt2 = pyopencl.enqueue_copy(self.queue, kpt2_gpu.data, nkp2)
            if self.profile:
                self.events.append(("copy H->D KP_2", evt2))
        # Grow the match-index buffer when both inputs exceed its capacity
        # (there can be at most min(len1, len2) matches).
        if min(kpt1_gpu.size, kpt2_gpu.size) > self.buffers["match"].shape[0]:
            self.kpsize = min(kpt1_gpu.size, kpt2_gpu.size)
            self.buffers["match"] = pyopencl.array.empty(
                self.queue, (self.kpsize, 2), dtype=numpy.int32)
        self._reset_output()
        wg = self.kernels[self.matching_kernel + ".matching"]
        size = calc_size((nkp1.size,), (wg,))
        evt = self.programs[self.matching_kernel].matching(
            self.queue, size, (wg,),
            kpt1_gpu.data,
            kpt2_gpu.data,
            self.buffers["match"].data,
            self.buffers["cnt"].data,
            numpy.int32(self.kpsize),
            numpy.float32(par.MatchRatio * par.MatchRatio),
            numpy.int32(nkp1.size),
            numpy.int32(nkp2.size))
        if self.profile:
            self.events.append(("matching", evt))
        # "cnt" holds the number of matches written by the kernel.
        size = self.buffers["cnt"].get()[0]
        match = numpy.empty(shape=(size, 2), dtype=numpy.int32)
        if size > 0:
            cpyD2H = pyopencl.enqueue_copy(self.queue, match,
                                           self.buffers["match"].data)
            if self.profile:
                self.events.append(("copy D->H match", cpyD2H))
        if raw_results:
            result = match
        else:
            # Gather the actual keypoint records on the host from the index
            # pairs. NOTE(review): this fancy indexing presumably expects nkp1
            # and nkp2 to be host-side recarrays — confirm for GPU-array input.
            result = numpy.recarray(shape=(size, 2), dtype=self.dtype_kp)
            result[:, 0] = nkp1[match[:size, 0]]
            result[:, 1] = nkp2[match[:size, 1]]
    return result
def _one_octave(self, octave):
    """
    Does all scales within an octave.

    Builds the Gaussian/DoG pyramid for this octave, detects local extrema,
    refines them, assigns orientations and computes descriptors; finally
    shrinks the last scale image to seed the next octave.

    :param octave: number of the octave
    :return: (keypoints, descriptors) — float32 array of shape (n, 4) with
        (x, y, scale, angle) and uint8 array of shape (n, 128)
    """
    prevSigma = self._init_sigma
    logger.info("Calculating octave %i" % octave)
    wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
    kpsize32 = numpy.int32(self.kpsize)
    self._reset_keypoints()
    octsize = numpy.int32(2 ** octave)
    last_start = numpy.int32(0)
    for scale in range(par.Scales + 2):
        sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
        logger.info("Octave %i scale %s blur with sigma %s" % (octave, scale, sigma))
        ########################################################################
        # Calculate gaussian blur and DoG
        ########################################################################
        self._gaussian_convolution(self.cl_mem["scale_%i" % scale],
                                   self.cl_mem["scale_%i" % (scale + 1)],
                                   sigma, octave)
        prevSigma *= self.sigmaRatio
        # DoG = scale_i - scale_{i+1}, stored at plane `scale` of the DoGs stack
        evt = self.kernels.get_kernel("combine")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                 self.cl_mem["scale_%i" % (scale + 1)].data, numpy.float32(-1.0),
                                                 self.cl_mem["scale_%i" % (scale)].data, numpy.float32(+1.0),
                                                 self.cl_mem["DoGs"].data, numpy.int32(scale),
                                                 *self.scales[octave])
        if self.profile:
            self.events.append(("DoG %s %s" % (octave, scale), evt))
    for scale in range(1, par.Scales + 1):
        # Detect candidate extrema in the DoG stack around this scale.
        evt = self.kernels.get_kernel("local_maxmin")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                      self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                      self.cl_mem["Kp_1"].data,  # __global keypoint* output,
                                                      numpy.int32(par.BorderDist),  # int border_dist,
                                                      numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                      octsize,  # int octsize,
                                                      numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                                                      numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                                                      self.cl_mem["cnt"].data,  # __global int* counter,
                                                      kpsize32,  # int nb_keypoints,
                                                      numpy.int32(scale),  # int scale,
                                                      *self.scales[octave])  # int width, int height
        if self.profile:
            self.events.append(("local_maxmin %s %s" % (octave, scale), evt))
        procsize = calc_size((self.kpsize,), wgsize)
        cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
        # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
        evt = self.kernels.get_kernel("interp_keypoint")(self.queue, procsize, wgsize,
                                                         self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                         self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                                         last_start,  # int start_keypoint,
                                                         self.cnt[0],  # int end_keypoint,
                                                         numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                         numpy.float32(self._init_sigma),  # float InitSigma,
                                                         *self.scales[octave])  # int width, int height
        if self.profile:
            self.events += [("get cnt", cp_evt),
                            ("interp_keypoint %s %s" % (octave, scale), evt)]
        # Drop invalidated keypoints and repack the vector; newcnt is the new end.
        newcnt = self._compact(last_start)
        evt = self.kernels.get_kernel("compute_gradient_orientation")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                                      self.cl_mem["scale_%s" % (scale)].data,  # __global float* igray,
                                                                      self.cl_mem["tmp"].data,  # __global float *grad,
                                                                      self.cl_mem["ori"].data,  # __global float *ori,
                                                                      *self.scales[octave])  # int width, int height
        if self.profile:
            self.events.append(("compute_gradient_orientation %s %s" % (octave, scale), evt))
        # Orientation assignment: 1D kernel, rather heavy kernel
        if newcnt and newcnt > last_start:  # launch kernel only if needed
            if self.USE_CPU:
                orientation_name = "orientation_cpu"
                scales = self.scales[octave]
            else:
                # GPU variant needs extra local-memory scratch arguments.
                orientation_name = "orientation_gpu"
                scales = list(self.scales[octave]) + \
                    [pyopencl.LocalMemory(36 * 4),
                     pyopencl.LocalMemory(128 * 4),
                     pyopencl.LocalMemory(128 * 4)]
            orientation = self.kernels.get_kernel(orientation_name)
            wg = self.kernels_max_wg_size[orientation_name]
            wgsize2 = (wg,)
            procsize = (int(newcnt * wg),)
            evt = orientation(self.queue, procsize, wgsize2,
                              self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                              self.cl_mem["tmp"].data,  # __global float* grad,
                              self.cl_mem["ori"].data,  # __global float* ori,
                              self.cl_mem["cnt"].data,  # __global int* counter,
                              octsize,  # int octsize,
                              numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                              kpsize32,  # int max of nb_keypoints,
                              numpy.int32(last_start),  # int keypoints_start,
                              newcnt,  # int keypoints_end,
                              *scales)  # int grad_width, int grad_height
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !
            # Descriptor computation: retry up to 3 times, degrading LOW_END
            # each time a kernel configuration is too large or fails at runtime.
            for _ in range(3):  # up to 3 attempts
                if self.USE_CPU or (self.LOW_END > 1):
                    logger.info("Computing descriptors with CPU optimized kernels")
                    descriptor_name = "descriptor_cpu"
                    wg = self.kernels_max_wg_size[descriptor_name][0]
                    wgsize2 = (wg,)
                    procsize2 = (int(newcnt * wg),)
                else:
                    if self.LOW_END:
                        logger.info("Computing descriptors with older-GPU optimized kernels")
                        descriptor_name = "descriptor_gpu1"
                        wgsize2 = self.kernels_max_wg_size[descriptor_name]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                        if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2):
                            # will fail anyway:
                            self.LOW_END += 1
                            continue
                    else:
                        logger.info("Computing descriptors with newer-GPU optimized kernels")
                        descriptor_name = "descriptor_gpu2"
                        wgsize2 = self.kernels_max_wg_size[descriptor_name]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                        if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2):
                            # will fail anyway:
                            self.LOW_END += 1
                            continue
                try:
                    descriptor = self.kernels.get_kernel(descriptor_name)
                    evt2 = descriptor(self.queue, procsize2, wgsize2,
                                      self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                      self.cl_mem["descriptors"].data,  # __global unsigned char *descriptors
                                      self.cl_mem["tmp"].data,  # __global float* grad,
                                      self.cl_mem["ori"].data,  # __global float* ori,
                                      octsize,  # int octsize,
                                      numpy.int32(last_start),  # int keypoints_start,
                                      self.cl_mem["cnt"].data,  # int* keypoints_end,
                                      *self.scales[octave])  # int grad_width, int grad_height
                    evt2.wait()
                except pyopencl.RuntimeError as error:
                    self.LOW_END += 1
                    logger.error("Descriptor failed with %s. Switching to lower_end mode" % error)
                    continue
                else:
                    break
            if self.profile:
                self.events += [("%s %s %s" % (orientation_name, octave, scale), evt),
                                ("copy cnt D->H", evt_cp),
                                ("%s %s %s" % (descriptor_name, octave, scale), evt2)]
        # Refresh the host-side counter: next scale appends after last_start.
        evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
        last_start = self.cnt[0]
        if self.profile:
            self.events.append(("copy cnt D->H", evt_cp))
    ########################################################################
    # Rescale all images to populate all octaves
    ########################################################################
    if octave < self.octave_max - 1:
        evt = self.kernels.get_kernel("shrink")(self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
                                                self.cl_mem["scale_%i" % (par.Scales)].data,
                                                self.cl_mem["scale_0"].data,
                                                numpy.int32(2), numpy.int32(2),
                                                self.scales[octave][0], self.scales[octave][1],
                                                *self.scales[octave + 1])
        if self.profile:
            self.events.append(("shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt))
    # Copy the final keypoints and descriptors of this octave back to the host.
    results = numpy.empty((last_start, 4), dtype=numpy.float32)
    descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
    if last_start:
        evt = pyopencl.enqueue_copy(self.queue, results, self.cl_mem["Kp_1"].data)
        evt2 = pyopencl.enqueue_copy(self.queue, descriptors, self.cl_mem["descriptors"].data)
        if self.profile:
            self.events += [("copy D->H", evt), ("copy D->H", evt2)]
    return results, descriptors
def _one_octave(self, octave):
    """
    Does all scales within an octave.

    Builds the Gaussian/DoG pyramid for this octave, detects local extrema,
    refines them, assigns orientations and computes descriptors; finally
    shrinks the last scale image to seed the next octave.

    :param octave: number of the octave
    :return: (keypoints, descriptors) — float32 array of shape (n, 4) with
        (x, y, scale, angle) and uint8 array of shape (n, 128)
    """
    prevSigma = self._init_sigma
    logger.info("Calculating octave %i" % octave)
    wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
    kpsize32 = numpy.int32(self.kpsize)
    self._reset_keypoints()
    octsize = numpy.int32(2 ** octave)
    last_start = numpy.int32(0)
    for scale in range(par.Scales + 2):
        sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
        logger.info("Octave %i scale %s blur with sigma %s" %
                    (octave, scale, sigma))
        ########################################################################
        # Calculate gaussian blur and DoG
        ########################################################################
        self._gaussian_convolution(self.buffers[scale],
                                   self.buffers[scale + 1], sigma, octave)
        prevSigma *= self.sigmaRatio
        # DoG = scale_i - scale_{i+1}, stored at plane `scale` of the DoGs stack
        evt = self.programs["algebra"].combine(
            self.queue, self.procsize[octave], self.wgsize[octave],
            self.buffers[scale + 1].data, numpy.float32(-1.0),
            self.buffers[scale].data, numpy.float32(+1.0),
            self.buffers["DoGs"].data, numpy.int32(scale),
            *self.scales[octave])
        if self.profile:
            self.events.append(("DoG %s %s" % (octave, scale), evt))
    for scale in range(1, par.Scales + 1):
        # Detect candidate extrema in the DoG stack around this scale.
        evt = self.programs["image"].local_maxmin(
            self.queue, self.procsize[octave], self.wgsize[octave],
            self.buffers["DoGs"].data,  # __global float* DOGS,
            self.buffers["Kp_1"].data,  # __global keypoint* output,
            numpy.int32(par.BorderDist),  # int border_dist,
            numpy.float32(par.PeakThresh),  # float peak_thresh,
            octsize,  # int octsize,
            numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
            numpy.float32(par.EdgeThresh),  # float EdgeThresh,
            self.buffers["cnt"].data,  # __global int* counter,
            kpsize32,  # int nb_keypoints,
            numpy.int32(scale),  # int scale,
            *self.scales[octave])  # int width, int height
        if self.profile:
            self.events.append(
                ("local_maxmin %s %s" % (octave, scale), evt))
        procsize = calc_size((self.kpsize,), wgsize)
        cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt,
                                       self.buffers["cnt"].data)
        # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
        evt = self.programs["image"].interp_keypoint(
            self.queue, procsize, wgsize,
            self.buffers["DoGs"].data,  # __global float* DOGS,
            self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
            last_start,  # int start_keypoint,
            self.cnt[0],  # int end_keypoint,
            numpy.float32(par.PeakThresh),  # float peak_thresh,
            numpy.float32(self._init_sigma),  # float InitSigma,
            *self.scales[octave])  # int width, int height
        if self.profile:
            self.events += [("get cnt", cp_evt),
                            ("interp_keypoint %s %s" % (octave, scale), evt)]
        # Drop invalidated keypoints and repack the vector; newcnt is the new end.
        newcnt = self._compact(last_start)
        evt = self.programs["image"].compute_gradient_orientation(
            self.queue, self.procsize[octave], self.wgsize[octave],
            self.buffers[scale].data,  # __global float* igray,
            self.buffers["tmp"].data,  # __global float *grad,
            self.buffers["ori"].data,  # __global float *ori,
            *self.scales[octave])  # int width, int height
        if self.profile:
            self.events.append(
                ("compute_gradient_orientation %s %s" % (octave, scale), evt))
        # Orientation assignment: 1D kernel, rather heavy kernel
        if newcnt and newcnt > last_start:  # launch kernel only if needed
            if self.USE_CPU:
                file_to_use = "orientation_cpu"
            else:
                file_to_use = "orientation_gpu"
            wgsize2 = self.kernels[file_to_use],
            procsize = int(newcnt * wgsize2[0]),
            evt = self.programs[file_to_use].orientation_assignment(
                self.queue, procsize, wgsize2,
                self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                self.buffers["tmp"].data,  # __global float* grad,
                self.buffers["ori"].data,  # __global float* ori,
                self.buffers["cnt"].data,  # __global int* counter,
                octsize,  # int octsize,
                numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                kpsize32,  # int max of nb_keypoints,
                numpy.int32(last_start),  # int keypoints_start,
                newcnt,  # int keypoints_end,
                *self.scales[octave])  # int grad_width, int grad_height
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt,
                                           self.buffers["cnt"].data)
            newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !
            # Descriptor computation: retry up to 3 times, degrading LOW_END
            # each time a kernel fails at runtime.
            for i_not_used in range(3):  # up to 3 attempts
                if (not self.USE_CPU) and (self.LOW_END == 0) and ("keypoints_gpu2" in self.kernels):
                    file_to_use = "keypoints_gpu2"
                    logger.info("Computing descriptors with newer-GPU optimized kernels")
                    wgsize2 = self.kernels[file_to_use]
                    procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                elif (not self.USE_CPU) and (self.LOW_END == 1) and ("keypoints_gpu1" in self.kernels):
                    file_to_use = "keypoints_gpu1"
                    logger.info("Computing descriptors with older-GPU optimized kernels")
                    wgsize2 = self.kernels[file_to_use]
                    procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                else:
                    # self.USE_CPU or self.LOW_END == 2, fail-safe fall-back
                    file_to_use = "keypoints_cpu"
                    logger.info("Computing descriptors with CPU optimized kernels")
                    wgsize2 = self.kernels[file_to_use],
                    procsize2 = (int(newcnt * wgsize2[0]),)
                try:
                    evt2 = self.programs[file_to_use].descriptor(
                        self.queue, procsize2, wgsize2,
                        self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                        self.buffers["descriptors"].data,  # __global unsigned char *descriptors
                        self.buffers["tmp"].data,  # __global float* grad,
                        self.buffers["ori"].data,  # __global float* ori,
                        octsize,  # int octsize,
                        numpy.int32(last_start),  # int keypoints_start,
                        self.buffers["cnt"].data,  # int* keypoints_end,
                        *self.scales[octave])  # int grad_width, int grad_height
                except pyopencl.RuntimeError as error:
                    self.LOW_END += 1
                    logger.error("Descriptor failed with %s. Switching to lower_end mode" % error)
                    continue
                else:
                    break
            if self.profile:
                self.events += [
                    ("orientation_assignment %s %s" % (octave, scale), evt),
                    ("copy cnt D->H", evt_cp),
                    ("descriptors %s %s" % (octave, scale), evt2)]
        # Refresh the host-side counter: next scale appends after last_start.
        evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt,
                                       self.buffers["cnt"].data)
        last_start = self.cnt[0]
        if self.profile:
            self.events.append(("copy cnt D->H", evt_cp))
    ########################################################################
    # Rescale all images to populate all octaves
    ########################################################################
    if octave < self.octave_max - 1:
        evt = self.programs["preprocess"].shrink(
            self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
            self.buffers[par.Scales].data, self.buffers[0].data,
            numpy.int32(2), numpy.int32(2),
            self.scales[octave][0], self.scales[octave][1],
            *self.scales[octave + 1])
        if self.profile:
            self.events.append(
                ("shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt))
    # Copy the final keypoints and descriptors of this octave back to the host.
    results = numpy.empty((last_start, 4), dtype=numpy.float32)
    descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
    if last_start:
        evt = pyopencl.enqueue_copy(self.queue, results,
                                    self.buffers["Kp_1"].data)
        evt2 = pyopencl.enqueue_copy(self.queue, descriptors,
                                     self.buffers["descriptors"].data)
        if self.profile:
            self.events += [("copy D->H", evt), ("copy D->H", evt2)]
    return results, descriptors
def align(self, img, shift_only=False, return_all=False, double_check=False,
          relative=False, orsa=False):
    """Align image on reference image.

    Extracts SIFT keypoints from *img*, matches them against the reference
    keypoints, fits either a pure shift or an affine transform, and warps
    the image onto the reference frame on the device.

    :param img: numpy array containing the image to align to reference
    :param shift_only: fit only a translation (median of displacements)
    :param return_all: return in addition to the image, keypoints, matching
        keypoints, and transformations as a dict
    :param double_check: re-fit the transform after rejecting outliers
        (>4 sigma in distance, angle or log-scale)
    :param relative: update reference keypoints with those from current
        image to perform relative alignment
    :param orsa: filter matches with ORSA when the `feature` module is available
    :return: aligned image, a dict of all information if return_all,
        or None when no keypoint matches
    """
    logger.debug("ref_keypoints: %s" % self.ref_kp.size)
    if self.RGB:
        data = numpy.ascontiguousarray(img, numpy.uint8)
    else:
        data = numpy.ascontiguousarray(img, numpy.float32)
    with self.sem:
        cpy = pyopencl.enqueue_copy(self.queue, self.buffers["input"].data, data)
        if self.profile:
            self.events.append(("Copy H->D", cpy))
        cpy.wait()
        kp = self.sift.keypoints(self.buffers["input"])
        logger.debug("mod image keypoints: %s" % kp.size)
        raw_matching = self.match.match(self.buffers["ref_kp_gpu"], kp, raw_results=True)
        matching = numpy.recarray(shape=raw_matching.shape, dtype=MatchPlan.dtype_kp)
        len_match = raw_matching.shape[0]
        if len_match == 0:
            logger.warning("No matching keypoints")
            return
        matching[:, 0] = self.ref_kp[raw_matching[:, 0]]
        matching[:, 1] = kp[raw_matching[:, 1]]

        if orsa:
            if feature:
                matching = feature.sift_orsa(matching, self.shape, 1)
            else:
                logger.warning("feature is not available. No ORSA filtering")

        # --- Fit the transform -------------------------------------------
        if (len_match < 3 * 6) or (shift_only):  # 3 points per DOF
            # Too few matches for a 6-DOF affine fit (or shift requested):
            # use the median displacement as a robust translation estimate.
            if shift_only:
                logger.debug("Shift Only mode: Common keypoints: %s" % len_match)
            else:
                logger.warning("Shift Only mode: Common keypoints: %s" % len_match)
            dx = matching[:, 1].x - matching[:, 0].x
            dy = matching[:, 1].y - matching[:, 0].y
            matrix = numpy.identity(2, dtype=numpy.float32)
            offset = numpy.array([+numpy.median(dy), +numpy.median(dx)], numpy.float32)
        else:
            logger.debug("Common keypoints: %s" % len_match)
            # Least-squares affine fit; note the (y, x) ordering of the
            # flattened transform coefficients.
            transform_matrix = matching_correction(matching)
            offset = numpy.array([transform_matrix[5], transform_matrix[2]], dtype=numpy.float32)
            matrix = numpy.empty((2, 2), dtype=numpy.float32)
            matrix[0, 0], matrix[0, 1] = transform_matrix[4], transform_matrix[3]
            matrix[1, 0], matrix[1, 1] = transform_matrix[1], transform_matrix[0]

        if double_check and (len_match >= 3 * 6):  # and abs(matrix - numpy.identity(2)).max() > 0.1:
            # Reject >4 sigma outliers in displacement, angle and log-scale,
            # then re-fit the affine transform on the surviving matches.
            logger.warning("Validating keypoints, %s,%s" % (matrix, offset))
            dx = matching[:, 1].x - matching[:, 0].x
            dy = matching[:, 1].y - matching[:, 0].y
            dangle = matching[:, 1].angle - matching[:, 0].angle
            dscale = numpy.log(matching[:, 1].scale / matching[:, 0].scale)
            distance = numpy.sqrt(dx * dx + dy * dy)
            outlayer = numpy.zeros(distance.shape, numpy.int8)
            outlayer += abs((distance - distance.mean()) / distance.std()) > 4
            outlayer += abs((dangle - dangle.mean()) / dangle.std()) > 4
            outlayer += abs((dscale - dscale.mean()) / dscale.std()) > 4
            # Was a bare print() to stdout; route through the module logger.
            logger.debug("outlayer: %s", outlayer)
            outlayersum = outlayer.sum()
            if outlayersum > 0 and not numpy.isinf(outlayersum):
                matching2 = matching[outlayer == 0]
                transform_matrix = matching_correction(matching2)
                offset = numpy.array([transform_matrix[5], transform_matrix[2]], dtype=numpy.float32)
                matrix = numpy.empty((2, 2), dtype=numpy.float32)
                matrix[0, 0], matrix[0, 1] = transform_matrix[4], transform_matrix[3]
                matrix[1, 0], matrix[1, 1] = transform_matrix[1], transform_matrix[0]

        if relative:  # update stable part to perform a relative alignment
            self.ref_kp = kp
            if self.ROI is not None:
                # Keep only reference keypoints that fall inside the ROI mask.
                kpx = numpy.round(self.ref_kp.x).astype(numpy.int32)
                kpy = numpy.round(self.ref_kp.y).astype(numpy.int32)
                masked = self.ROI[(kpy, kpx)].astype(bool)
                logger.warning("Reducing keypoint list from %i to %i because of the ROI" % (self.ref_kp.size, masked.sum()))
                self.ref_kp = self.ref_kp[masked]
            self.buffers["ref_kp_gpu"] = pyopencl.array.to_device(self.match.queue, self.ref_kp)
            # Accumulate this frame's transform into the running 3x3
            # homogeneous transform and use the composition for warping.
            transfo = numpy.zeros((3, 3), dtype=numpy.float64)
            transfo[:2, :2] = matrix
            transfo[0, 2] = offset[0]
            transfo[1, 2] = offset[1]
            transfo[2, 2] = 1
            if self.relative_transfo is None:
                self.relative_transfo = transfo
            else:
                self.relative_transfo = numpy.dot(transfo, self.relative_transfo)
            matrix = numpy.ascontiguousarray(self.relative_transfo[:2, :2], dtype=numpy.float32)
            offset = numpy.ascontiguousarray(self.relative_transfo[:2, 2], dtype=numpy.float32)

        # --- Warp the image on the device --------------------------------
        cpy1 = pyopencl.enqueue_copy(self.queue, self.buffers["matrix"].data, matrix)
        cpy2 = pyopencl.enqueue_copy(self.queue, self.buffers["offset"].data, offset)
        if self.profile:
            self.events += [("Copy matrix", cpy1), ("Copy offset", cpy2)]
        if self.RGB:
            shape = (4, self.shape[1], self.shape[0])
            transform = self.program.transform_RGB
        else:
            shape = self.shape[1], self.shape[0]
            transform = self.program.transform
        ev = transform(self.queue, calc_size(shape, self.wg), self.wg,
                       self.buffers["input"].data,
                       self.buffers["output"].data,
                       self.buffers["matrix"].data,
                       self.buffers["offset"].data,
                       numpy.int32(self.shape[1]),
                       numpy.int32(self.shape[0]),
                       numpy.int32(self.outshape[1]),
                       numpy.int32(self.outshape[0]),
                       self.sift.buffers["min"].get()[0],
                       numpy.int32(1))
        if self.profile:
            self.events += [("transform", ev)]
        result = self.buffers["output"].get()
        if return_all:
            # Residual of each match under the fitted transform -> RMS error.
            corr = numpy.dot(matrix, numpy.vstack((matching[:, 0].y, matching[:, 0].x))).T \
                + offset.T - numpy.vstack((matching[:, 1].y, matching[:, 1].x)).T
            rms = numpy.sqrt((corr * corr).sum(axis=-1).mean())
            return {"result": result, "keypoint": kp, "matching": matching,
                    "offset": offset, "matrix": matrix, "rms": rms}
        return result