Ejemplo n.º 1
0
    def _compact(self, start=numpy.int32(0)):
        """
        Compact the keypoint vector, beginning at index ``start``.

        The keypoint counter is fetched from the device and then reset to
        ``start`` before the ``compact`` kernel is run with ``Kp_1`` as input
        and ``Kp_2`` as output.  Afterwards both buffers are swapped and the
        buffer now registered as ``Kp_2`` is filled with -1.

        :param start: start compacting at this address; entries before it are just copied
        :type  start: numpy.int32
        :return: keypoint counter read back from the device after compaction
        """
        queue = self.queue
        workgroup = (self.max_workgroup_size,)  # TODO: optimize

        # Number of keypoints currently counted on the device
        read_evt = pyopencl.enqueue_copy(queue, self.cnt, self.buffers["cnt"].data)
        nb_keypoints = self.cnt[0]
        global_size = calc_size((self.kpsize,), workgroup)

        if nb_keypoints > 0.9 * self.kpsize:
            logger.warning("Keypoint counter overflow risk: counted %s / %s" % (nb_keypoints, self.kpsize))
        logger.info("Compact %s -> %s / %s" % (start, nb_keypoints, self.kpsize))

        # The device counter restarts from ``start`` before the kernel runs
        self.cnt[0] = start
        reset_evt = pyopencl.enqueue_copy(queue, self.buffers["cnt"].data, self.cnt)
        compact_evt = self.programs["algebra"].compact(
            queue,
            global_size,
            workgroup,
            self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
            self.buffers["Kp_2"].data,  # __global keypoint* output,
            self.buffers["cnt"].data,  # __global int* counter,
            start,  # int start,
            nb_keypoints,  # int nbkeypoints
        )
        readback_evt = pyopencl.enqueue_copy(queue, self.cnt, self.buffers["cnt"].data)

        # Swap the two keypoint buffers: the kernel output becomes Kp_1
        former_input = self.buffers["Kp_1"]
        self.buffers["Kp_1"] = self.buffers["Kp_2"]
        self.buffers["Kp_2"] = former_input

        # Fill the buffer now registered as Kp_2 with -1
        nb_floats = 4 * self.kpsize
        fill_evt = self.programs["memset"].memset_float(
            queue,
            calc_size((nb_floats,), workgroup),
            workgroup,
            self.buffers["Kp_2"].data,
            numpy.float32(-1),
            numpy.int32(nb_floats),
        )

        if self.profile:
            self.events.extend([
                ("copy cnt D->H", read_evt),
                ("copy cnt H->D", reset_evt),
                ("compact", compact_evt),
                ("copy cnt D->H", readback_evt),
                ("memset 2", fill_evt),
            ])
        return self.cnt[0]
Ejemplo n.º 2
0
    def _compact(self, start=numpy.int32(0)):
        """
        Compact the vector of keypoints from position ``start`` onwards.

        Reads the keypoint counter from the device, resets it to ``start``,
        launches the ``compact`` kernel (``Kp_1`` as input, ``Kp_2`` as
        output), swaps the two keypoint buffers and finally fills the buffer
        that ends up registered as ``Kp_2`` with -1.

        :param start: start compacting at this address; what lies before is just copied
        :type  start: numpy.int32
        :return: the keypoint counter as read back once the kernel has run
        """
        buffers = self.buffers
        wg = (self.max_workgroup_size,)  # TODO: optimize

        evt_get_cnt = pyopencl.enqueue_copy(self.queue, self.cnt, buffers["cnt"].data)
        counted = self.cnt[0]
        grid = calc_size((self.kpsize,), wg)

        overflow_threshold = 0.9 * self.kpsize
        if counted > overflow_threshold:
            logger.warning("Keypoint counter overflow risk: counted %s / %s" % (counted, self.kpsize))
        logger.info("Compact %s -> %s / %s" % (start, counted, self.kpsize))

        # Push the new starting value of the counter to the device
        self.cnt[0] = start
        evt_set_cnt = pyopencl.enqueue_copy(self.queue, buffers["cnt"].data, self.cnt)

        kernel_args = (
            buffers["Kp_1"].data,  # __global keypoint* keypoints,
            buffers["Kp_2"].data,  # __global keypoint* output,
            buffers["cnt"].data,  # __global int* counter,
            start,  # int start,
            counted,  # int nbkeypoints
        )
        evt_compact = self.programs["algebra"].compact(self.queue, grid, wg, *kernel_args)
        evt_new_cnt = pyopencl.enqueue_copy(self.queue, self.cnt, buffers["cnt"].data)

        # swap keypoints:
        buffers["Kp_1"], buffers["Kp_2"] = buffers["Kp_2"], buffers["Kp_1"]

        # memset buffer Kp_2 (after the swap) with -1
        length = 4 * self.kpsize
        evt_memset = self.programs["memset"].memset_float(
            self.queue,
            calc_size((length,), wg),
            wg,
            buffers["Kp_2"].data,
            numpy.float32(-1),
            numpy.int32(length))

        if self.profile:
            labels = ("copy cnt D->H", "copy cnt H->D", "compact", "copy cnt D->H", "memset 2")
            recorded = (evt_get_cnt, evt_set_cnt, evt_compact, evt_new_cnt, evt_memset)
            self.events += list(zip(labels, recorded))
        return self.cnt[0]
Ejemplo n.º 3
0
    def align(self,
              img,
              shift_only=False,
              return_all=False,
              double_check=False,
              relative=False,
              orsa=False):
        """
        Align image on reference image

        :param img: numpy array containing the image to align to reference
        :param shift_only: if True, only a translation (median displacement of
            the matched keypoints) is applied; this mode is also used when
            fewer than 3 * 6 keypoints match
        :param return_all: return in addition to the image, keypoints, matching
            keypoints, and transformations as a dict
        :param double_check: if True and at least 3 * 6 keypoints match, drop
            the matches whose displacement, angle or log-scale change deviates
            by more than 4 standard deviations and recompute the transformation
        :param relative: update reference keypoints with those from current
            image to perform relative alignment
        :param orsa: filter the matches with ``feature.sift_orsa`` when the
            ``feature`` module is available
        :return: aligned image, or a dict with keys "result", "keypoint",
            "matching", "offset", "matrix" and "rms" when ``return_all`` is
            set; None when no keypoint matches
        """

        def affine_from(correction):
            # Unpack coefficients 0..5 of the result of matching_correction
            # into the (2, 2) matrix and the 2-element offset sent to the device.
            new_offset = numpy.array([correction[5], correction[2]],
                                     dtype=numpy.float32)
            new_matrix = numpy.empty((2, 2), dtype=numpy.float32)
            new_matrix[0, 0], new_matrix[0, 1] = correction[4], correction[3]
            new_matrix[1, 0], new_matrix[1, 1] = correction[1], correction[0]
            return new_matrix, new_offset

        logger.debug("ref_keypoints: %s", self.ref_kp.size)
        if self.RGB:
            data = numpy.ascontiguousarray(img, numpy.uint8)
        else:
            data = numpy.ascontiguousarray(img, numpy.float32)
        with self.sem:
            cpy = pyopencl.enqueue_copy(self.queue, self.buffers["input"].data,
                                        data)
            if self.profile:
                self.events.append(("Copy H->D", cpy))
            cpy.wait()
            kp = self.sift.keypoints(self.buffers["input"])
            logger.debug("mod image keypoints: %s", kp.size)
            raw_matching = self.match.match(self.buffers["ref_kp_gpu"],
                                            kp,
                                            raw_results=True)

            len_match = raw_matching.shape[0]
            if len_match == 0:
                logger.warning("No matching keypoints")
                return None
            matching = numpy.recarray(shape=raw_matching.shape,
                                      dtype=MatchPlan.dtype_kp)
            matching[:, 0] = self.ref_kp[raw_matching[:, 0]]
            matching[:, 1] = kp[raw_matching[:, 1]]

            if orsa:
                if feature:
                    matching = feature.sift_orsa(matching, self.shape, 1)
                    # The filter replaces the list of matches: refresh the
                    # count so the thresholds below test what is really left.
                    len_match = matching.shape[0]
                else:
                    logger.warning(
                        "feature is not available. No ORSA filtering")

            if (len_match < 3 * 6) or (shift_only):  # 3 points per DOF
                if shift_only:
                    logger.debug("Shift Only mode: Common keypoints: %s",
                                 len_match)
                else:
                    # Not requested by the caller: too few matches
                    logger.warning("Shift Only mode: Common keypoints: %s",
                                   len_match)
                dx = matching[:, 1].x - matching[:, 0].x
                dy = matching[:, 1].y - matching[:, 0].y
                matrix = numpy.identity(2, dtype=numpy.float32)
                offset = numpy.array([+numpy.median(dy), +numpy.median(dx)],
                                     numpy.float32)
            else:
                logger.debug("Common keypoints: %s", len_match)
                matrix, offset = affine_from(matching_correction(matching))

            if double_check and (len_match >= 3 * 6):
                # and abs(matrix - numpy.identity(2)).max() > 0.1:
                logger.warning("Validating keypoints, %s,%s", matrix, offset)
                dx = matching[:, 1].x - matching[:, 0].x
                dy = matching[:, 1].y - matching[:, 0].y
                dangle = matching[:, 1].angle - matching[:, 0].angle
                dscale = numpy.log(matching[:, 1].scale / matching[:, 0].scale)
                distance = numpy.sqrt(dx * dx + dy * dy)
                # One vote per criterion deviating by more than 4 sigma
                outlayer = numpy.zeros(distance.shape, numpy.int8)
                outlayer += abs(
                    (distance - distance.mean()) / distance.std()) > 4
                outlayer += abs((dangle - dangle.mean()) / dangle.std()) > 4
                outlayer += abs((dscale - dscale.mean()) / dscale.std()) > 4
                outlayersum = outlayer.sum()
                if outlayersum > 0 and not numpy.isinf(outlayersum):
                    matching2 = matching[outlayer == 0]
                    matrix, offset = affine_from(matching_correction(matching2))

            if relative:  # update stable part to perform a relative alignment
                self.ref_kp = kp
                if self.ROI is not None:
                    kpx = numpy.round(self.ref_kp.x).astype(numpy.int32)
                    kpy = numpy.round(self.ref_kp.y).astype(numpy.int32)
                    masked = self.ROI[(kpy, kpx)].astype(bool)
                    logger.warning(
                        "Reducing keypoint list from %i to %i because of the ROI",
                        self.ref_kp.size, masked.sum())
                    self.ref_kp = self.ref_kp[masked]
                self.buffers["ref_kp_gpu"] = pyopencl.array.to_device(
                    self.match.queue, self.ref_kp)
                # Accumulate the current transformation, written as a 3x3
                # homogeneous matrix, onto the one stored from previous calls.
                transfo = numpy.zeros((3, 3), dtype=numpy.float64)
                transfo[:2, :2] = matrix
                transfo[0, 2] = offset[0]
                transfo[1, 2] = offset[1]
                transfo[2, 2] = 1
                if self.relative_transfo is None:
                    self.relative_transfo = transfo
                else:
                    self.relative_transfo = numpy.dot(transfo,
                                                      self.relative_transfo)
                matrix = numpy.ascontiguousarray(self.relative_transfo[:2, :2],
                                                 dtype=numpy.float32)
                offset = numpy.ascontiguousarray(self.relative_transfo[:2, 2],
                                                 dtype=numpy.float32)

            cpy1 = pyopencl.enqueue_copy(self.queue,
                                         self.buffers["matrix"].data, matrix)
            cpy2 = pyopencl.enqueue_copy(self.queue,
                                         self.buffers["offset"].data, offset)
            if self.profile:
                self.events += [("Copy matrix", cpy1), ("Copy offset", cpy2)]

            if self.RGB:
                shape = (4, self.shape[1], self.shape[0])
                transform = self.program.transform_RGB
            else:
                shape = self.shape[1], self.shape[0]
                transform = self.program.transform
            ev = transform(self.queue, calc_size(shape, self.wg), self.wg,
                           self.buffers["input"].data,
                           self.buffers["output"].data,
                           self.buffers["matrix"].data,
                           self.buffers["offset"].data,
                           numpy.int32(self.shape[1]),
                           numpy.int32(self.shape[0]),
                           numpy.int32(self.outshape[1]),
                           numpy.int32(self.outshape[0]),
                           self.sift.buffers["min"].get()[0], numpy.int32(1))
            if self.profile:
                self.events += [("transform", ev)]
            result = self.buffers["output"].get()

        if return_all:
            # Residual of the reference keypoints, once transformed, with
            # respect to the keypoints matched in the current image.
            corr = numpy.dot(
                matrix, numpy.vstack(
                    (matching[:, 0].y,
                     matching[:, 0].x))).T + offset.T - numpy.vstack(
                         (matching[:, 1].y, matching[:, 1].x)).T
            rms = numpy.sqrt((corr * corr).sum(axis=-1).mean())

            # Todo: calculate the RMS of displacement and return it:
            return {
                "result": result,
                "keypoint": kp,
                "matching": matching,
                "offset": offset,
                "matrix": matrix,
                "rms": rms
            }
        return result
Ejemplo n.º 4
0
    def _one_octave(self, octave):
        """
        Does all scales within an octave

        For every scale: Gaussian blur and difference of Gaussians, then local
        extrema detection, keypoint interpolation, compaction, gradient and
        orientation computation, orientation assignment and descriptor
        calculation.  The descriptor kernel is tried up to 3 times, moving to
        a lower-end variant (``self.LOW_END`` is incremented) after each
        failure.

        :param octave: number of the octave
        :return: 2-tuple (keypoints, descriptors): a float32 array of shape
            (n, 4) and a uint8 array of shape (n, 128), where n is the last
            value of the keypoint counter read back from the device
        """
        prevSigma = self._init_sigma
        logger.info("Calculating octave %i", octave)
        wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
        kpsize32 = numpy.int32(self.kpsize)
        self._reset_keypoints()
        octsize = numpy.int32(2 ** octave)
        last_start = numpy.int32(0)
        for scale in range(par.Scales + 2):
            sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
            logger.info("Octave %i scale %s blur with sigma %s", octave, scale, sigma)

            ########################################################################
            # Calculate gaussian blur and DoG
            ########################################################################

            self._gaussian_convolution(self.cl_mem["scale_%i" % scale], self.cl_mem["scale_%i" % (scale + 1)], sigma, octave)
            prevSigma *= self.sigmaRatio
            evt = self.kernels.get_kernel("combine")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                     self.cl_mem["scale_%i" % (scale + 1)].data, numpy.float32(-1.0),
                                                     self.cl_mem["scale_%i" % (scale)].data, numpy.float32(+1.0),
                                                     self.cl_mem["DoGs"].data, numpy.int32(scale),
                                                     *self.scales[octave])
            if self.profile:
                self.events.append(("DoG %s %s" % (octave, scale), evt))
        for scale in range(1, par.Scales + 1):
            evt = self.kernels.get_kernel("local_maxmin")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                          self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                          self.cl_mem["Kp_1"].data,  # __global keypoint* output,
                                                          numpy.int32(par.BorderDist),  # int border_dist,
                                                          numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                          octsize,  # int octsize,
                                                          numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                                                          numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                                                          self.cl_mem["cnt"].data,  # __global int* counter,
                                                          kpsize32,  # int nb_keypoints,
                                                          numpy.int32(scale),  # int scale,
                                                          *self.scales[octave])  # int width, int height)
            if self.profile:
                self.events.append(("local_maxmin %s %s" % (octave, scale), evt))
            procsize = calc_size((self.kpsize,), wgsize)
            cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
            evt = self.kernels.get_kernel("interp_keypoint")(self.queue, procsize, wgsize,
                                                             self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                             self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                                             last_start,  # int start_keypoint,
                                                             self.cnt[0],  # int end_keypoint,
                                                             numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                             numpy.float32(self._init_sigma),  # float InitSigma,
                                                             *self.scales[octave])  # int width, int height)
            if self.profile:
                self.events += [("get cnt", cp_evt),
                                ("interp_keypoint %s %s" % (octave, scale), evt)
                                ]

            newcnt = self._compact(last_start)
            evt = self.kernels.get_kernel("compute_gradient_orientation")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                                          self.cl_mem["scale_%s" % (scale)].data,  # __global float* igray,
                                                                          self.cl_mem["tmp"].data,  # __global float *grad,
                                                                          self.cl_mem["ori"].data,  # __global float *ori,
                                                                          *self.scales[octave])  # int width,int height
            if self.profile:
                self.events.append(("compute_gradient_orientation %s %s" % (octave, scale), evt))

            # Orientation assignment: 1D kernel, rather heavy kernel
            if newcnt and newcnt > last_start:  # launch kernel only if needed

                if self.USE_CPU:
                    orientation_name = "orientation_cpu"
                    scales = self.scales[octave]
                else:
                    orientation_name = "orientation_gpu"
                    scales = list(self.scales[octave]) + \
                             [pyopencl.LocalMemory(36 * 4),
                              pyopencl.LocalMemory(128 * 4),
                              pyopencl.LocalMemory(128 * 4)]
                orientation = self.kernels.get_kernel(orientation_name)
                wg = self.kernels_max_wg_size[orientation_name]
                wgsize2 = (wg,)
                procsize = (int(newcnt * wg),)
                evt = orientation(self.queue, procsize, wgsize2,
                                  self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                  self.cl_mem["tmp"].data,  # __global float* grad,
                                  self.cl_mem["ori"].data,  # __global float* ori,
                                  self.cl_mem["cnt"].data,  # __global int* counter,
                                  octsize,  # int octsize,
                                  numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                                  kpsize32,  # int max of nb_keypoints,
                                  numpy.int32(last_start),  # int keypoints_start,
                                  newcnt,  # int keypoints_end,
                                  *scales)  # int grad_width, int grad_height)
                evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
                newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !

                # Event of the descriptor launch that succeeded, if any.  It
                # stays None when every attempt fails so that the profiling
                # code below never records an unbound, stale or failed event.
                descriptor_evt = None
                for _ in range(3):
                    # up to 3 attempts
                    if self.USE_CPU or (self.LOW_END > 1):
                        logger.info("Computing descriptors with CPU optimized kernels")
                        descriptor_name = "descriptor_cpu"
                        wg = self.kernels_max_wg_size[descriptor_name][0]
                        wgsize2 = (wg,)
                        procsize2 = (int(newcnt * wg),)
                    else:
                        if self.LOW_END:
                            logger.info("Computing descriptors with older-GPU optimized kernels")
                            descriptor_name = "descriptor_gpu1"
                        else:
                            logger.info("Computing descriptors with newer-GPU optimized kernels")
                            descriptor_name = "descriptor_gpu2"
                        wgsize2 = self.kernels_max_wg_size[descriptor_name]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                    try:
                        descriptor = self.kernels.get_kernel(descriptor_name)
                        evt2 = descriptor(self.queue, procsize2, wgsize2,
                                          self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                          self.cl_mem["descriptors"].data,  # ___global unsigned char *descriptors
                                          self.cl_mem["tmp"].data,  # __global float* grad,
                                          self.cl_mem["ori"].data,  # __global float* ori,
                                          octsize,  # int octsize,
                                          numpy.int32(last_start),  # int keypoints_start,
                                          self.cl_mem["cnt"].data,  # int* keypoints_end,
                                          *self.scales[octave])  # int grad_width, int grad_height)
                        evt2.wait()
                    except (pyopencl.RuntimeError, pyopencl._cl.LogicError) as error:
                        self.LOW_END += 1
                        logger.error("Descriptor failed with %s. Switching to lower_end mode" % error)
                        continue
                    else:
                        descriptor_evt = evt2
                        break
                else:
                    # Every attempt raised: keep going as before, but say so.
                    logger.error("Descriptor computation failed after 3 attempts (octave %s, scale %s)",
                                 octave, scale)
                if self.profile:
                    self.events += [("%s %s %s" % (orientation_name, octave, scale), evt),
                                    ("copy cnt D->H", evt_cp)]
                    if descriptor_evt is not None:
                        self.events.append(("%s %s %s" % (descriptor_name, octave, scale), descriptor_evt))
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            last_start = self.cnt[0]
            if self.profile:
                self.events.append(("copy cnt D->H", evt_cp))

        ########################################################################
        # Rescale all images to populate all octaves
        ########################################################################
        if octave < self.octave_max - 1:
            evt = self.kernels.get_kernel("shrink")(self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
                                                    self.cl_mem["scale_%i" % (par.Scales)].data,
                                                    self.cl_mem["scale_0"].data,
                                                    numpy.int32(2), numpy.int32(2),
                                                    self.scales[octave][0], self.scales[octave][1],
                                                    *self.scales[octave + 1])
            if self.profile:
                self.events.append(("shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt))
        # Host arrays sized on the final keypoint counter
        results = numpy.empty((last_start, 4), dtype=numpy.float32)
        descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
        if last_start:
            evt = pyopencl.enqueue_copy(self.queue, results, self.cl_mem["Kp_1"].data)
            evt2 = pyopencl.enqueue_copy(self.queue, descriptors, self.cl_mem["descriptors"].data)
            if self.profile:
                self.events += [("copy D->H", evt),
                                ("copy D->H", evt2)]
        return results, descriptors
Ejemplo n.º 5
0
    def keypoints(self, image, mask=None):
        """Compute the SIFT keypoints of an image

        The image is uploaded (and converted to float32 when needed) into the
        ``scale_0`` buffer, normalized using its minimum and maximum, optionally
        blurred to reach the initial sigma, then every octave is processed and
        the keypoints found are merged into a single record array.

        TODO: use a temporary list with events and use a single test at the end

        :param image: ndimage of 2D (or 3D if RGB)
        :param mask: TODO: implement a mask for sieving out the keypoints (currently unused)
        :return: vector of keypoint (1D numpy array)
        """
        with self.sem:
            nb_kept = 0
            kp_per_octave = []
            desc_per_octave = []
            assert image.shape[:2] == self.shape
            assert image.dtype in [self.dtype, numpy.float32]
            on_device = isinstance(image, pyopencl.array.Array)
            # old versions of pyopencl do not check for data contiguity
            if not on_device and not image.flags["C_CONTIGUOUS"]:
                image = numpy.ascontiguousarray(image)
            started = time.time()

            queue = self.queue
            scale0 = self.cl_mem["scale_0"]

            # ---- upload the image and convert it to float32 in scale_0 ----
            if image.dtype == numpy.float32:
                source = image.data if on_device else image
                evt = pyopencl.enqueue_copy(queue, scale0.data, source)
                if self.profile:
                    self.events.append(("copy H->D", evt))
            elif self.dtype == numpy.float64:
                # A preprocessing kernel double_to_float exists, but is commented (RUNS ONLY ON GPU WITH FP64)
                # TODO: benchmark this kernel vs the current pure CPU format conversion with numpy.float32
                #       and uncomment it if it proves faster (dubious, because of data transfer bottleneck)
                evt = pyopencl.enqueue_copy(queue, scale0.data, image.astype(numpy.float32))
                if self.profile:
                    self.events.append(("copy H->D", evt))
            elif (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB):
                source = image.data if on_device else image
                evt = pyopencl.enqueue_copy(queue, self.cl_mem["raw"].data, source)
                if self.profile:
                    self.events.append(("copy H->D", evt))

                rgb_to_float = self.kernels.get_kernel("rgb_to_float")
                evt = rgb_to_float(queue, self.procsize[0], self.wgsize[0],
                                   self.cl_mem["raw"].data, self.cl_mem["scale_0"].data,
                                   *self.scales[0])
                if self.profile:
                    self.events.append(("RGB -> float", evt))
            elif self.dtype in self.converter:
                converter = self.kernels.get_kernel(self.converter[self.dtype])
                evt = pyopencl.enqueue_copy(queue, self.cl_mem["raw"].data, image)
                if self.profile:
                    self.events.append(("copy H->D", evt))
                evt = converter(queue, self.procsize[0], self.wgsize[0],
                                self.cl_mem["raw"].data, self.cl_mem["scale_0"].data, *self.scales[0])
                if self.profile:
                    self.events.append(("convert -> float", evt))
            else:
                raise RuntimeError("invalid input format error (%s)" % (str(self.dtype)))

            # ---- minimum and maximum of the image ----
            nb_pixels = numpy.int32(self.shape[0] * self.shape[1])
            wg1 = self.kernels_wg["max_min_global_stage1"]
            wg2 = self.kernels_wg["max_min_global_stage2"]
            if min(wg1, wg2) >= self.red_size:
                # two-stage reduction
                stage1 = self.kernels.get_kernel("max_min_global_stage1")
                stage2 = self.kernels.get_kernel("max_min_global_stage2")
                shm = pyopencl.LocalMemory(self.red_size * 2 * 4)
                evt_stage1 = stage1(queue, (self.red_size * self.red_size,), (self.red_size,),
                                    self.cl_mem["scale_0"].data,
                                    self.cl_mem["max_min"].data,
                                    nb_pixels,
                                    shm)
                evt_stage2 = stage2(queue, (self.red_size,), (self.red_size,),
                                    self.cl_mem["max_min"].data,
                                    self.cl_mem["max"].data,
                                    self.cl_mem["min"].data,
                                    shm)
                if self.profile:
                    self.events.append(("max_min_stage1", evt_stage1))
                    self.events.append(("max_min_stage2", evt_stage2))
            else:
                # common bug on OSX when running on CPU
                logger.info("Unable to use MinMax Reduction: stage1 wg: %s; stage2 wg: %s < max_work_group_size: %s, expected: %s",
                            wg1, wg2, self.block_size, self.red_size)
                serial = self.kernels.get_kernel("max_min_vec16")
                evt_serial = serial(queue, (1,), (1,),
                                    self.cl_mem["scale_0"].data,
                                    nb_pixels,
                                    self.cl_mem["max"].data,
                                    self.cl_mem["min"].data)
                if self.profile:
                    self.events.append(("max_min_serial", evt_serial))

            # ---- normalization ----
            normalizes = self.kernels.get_kernel("normalizes")
            evt = normalizes(queue, self.procsize[0], self.wgsize[0],
                             self.cl_mem["scale_0"].data,
                             self.cl_mem["min"].data,
                             self.cl_mem["max"].data,
                             self.cl_mem["255"].data,
                             *self.scales[0])
            if self.profile:
                self.events.append(("normalize", evt))

            # ---- initial blur, only when the target sigma is not reached yet ----
            if par.DoubleImSize:
                current_sigma = 1.0
            else:
                current_sigma = 0.5
            if self._init_sigma > current_sigma:
                logger.debug("Bluring image to achieve std: %f", self._init_sigma)
                sigma = math.sqrt(self._init_sigma ** 2 - current_sigma ** 2)
                self._gaussian_convolution(self.cl_mem["scale_0"], self.cl_mem["scale_0"], sigma, 0)

            # ---- process every octave ----
            for octave in range(self.octave_max):
                kp, descriptor = self._one_octave(octave)
                logger.info("in octave %i found %i kp" % (octave, kp.shape[0]))

                if len(kp) == 0:
                    continue
                # sieve out coordinates with NaNs
                finite = ~numpy.isnan(kp.sum(axis=-1))
                kp_per_octave.append(kp[finite])
                desc_per_octave.append(descriptor[finite])
                nb_kept += int(numpy.count_nonzero(finite))

            ########################################################################
            # Merge keypoints in central memory
            ########################################################################
            output = numpy.recarray(shape=(nb_kept,), dtype=self.dtype_kp)
            begin = 0
            for coords, desc in zip(kp_per_octave, desc_per_octave):
                count = coords.shape[0]
                if count <= 0:
                    continue
                chunk = output[begin:begin + count]
                chunk.x = coords[:, 0]
                chunk.y = coords[:, 1]
                chunk.scale = coords[:, 2]
                chunk.angle = coords[:, 3]
                chunk.desc = desc
                begin += count
            logger.info("Execution time: %.3fms" % (1000 * (time.time() - started)))
        return output
Ejemplo n.º 6
0
    def _one_octave(self, octave):
        """
        Does all scales within an octave

        The method enqueues, in this order:

        1. for ``par.Scales + 2`` scales: a gaussian convolution from buffer
           ``scale`` into buffer ``scale + 1`` and the ``combine`` kernel which
           writes into the "DoGs" buffer;
        2. for scales ``1 .. par.Scales``: ``local_maxmin``, ``interp_keypoint``,
           a compaction of the keypoint vector, ``compute_gradient_orientation``
           and, only when new keypoints were found, ``orientation_assignment``
           followed by the ``descriptor`` kernel (up to 3 attempts, each failure
           increments ``self.LOW_END`` to select a more conservative kernel);
        3. unless this is the last octave: the ``shrink`` kernel from buffer
           ``par.Scales`` into buffer ``0`` (factor 2 in each direction).

        :param octave: number of the octave
        :return: 2-tuple (keypoints, descriptors) of host arrays of shape
                 (n, 4) float32 and (n, 128) uint8, n being the keypoint
                 counter read back from the device after the last scale
        """
        prevSigma = self._init_sigma
        logger.info("Calculating octave %i" % octave)
        wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
        kpsize32 = numpy.int32(self.kpsize)
        self._reset_keypoints()
        octsize = numpy.int32(2 ** octave)
        last_start = numpy.int32(0)
        for scale in range(par.Scales + 2):
            sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
            logger.info("Octave %i scale %s blur with sigma %s" % (octave, scale, sigma))

            ########################################################################
            # Calculate gaussian blur and DoG
            ########################################################################

            self._gaussian_convolution(self.buffers[scale], self.buffers[scale + 1], sigma, octave)
            prevSigma *= self.sigmaRatio
            # combine is given the weights -1 and +1: presumably
            # DoG[scale] = buffers[scale] - buffers[scale + 1] (kernel not visible here)
            evt = self.programs["algebra"].combine(
                self.queue,
                self.procsize[octave],
                self.wgsize[octave],
                self.buffers[scale + 1].data,
                numpy.float32(-1.0),
                self.buffers[scale].data,
                numpy.float32(+1.0),
                self.buffers["DoGs"].data,
                numpy.int32(scale),
                *self.scales[octave]
            )
            if self.profile:
                self.events.append(("DoG %s %s" % (octave, scale), evt))
        for scale in range(1, par.Scales + 1):
            evt = self.programs["image"].local_maxmin(
                self.queue,
                self.procsize[octave],
                self.wgsize[octave],
                self.buffers["DoGs"].data,  # __global float* DOGS,
                self.buffers["Kp_1"].data,  # __global keypoint* output,
                numpy.int32(par.BorderDist),  # int border_dist,
                numpy.float32(par.PeakThresh),  # float peak_thresh,
                octsize,  # int octsize,
                numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                self.buffers["cnt"].data,  # __global int* counter,
                kpsize32,  # int nb_keypoints,
                numpy.int32(scale),  # int scale,
                *self.scales[octave]
            )  # int width, int height)
            if self.profile:
                self.events.append(("local_maxmin %s %s" % (octave, scale), evt))
            procsize = calc_size((self.kpsize,), wgsize)
            # Read the keypoint counter back to the host: it is the end index for interp_keypoint
            cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
            # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
            evt = self.programs["image"].interp_keypoint(
                self.queue,
                procsize,
                wgsize,
                self.buffers["DoGs"].data,  # __global float* DOGS,
                self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                last_start,  # int start_keypoint,
                self.cnt[0],  # int end_keypoint,
                numpy.float32(par.PeakThresh),  # float peak_thresh,
                numpy.float32(self._init_sigma),  # float InitSigma,
                *self.scales[octave]
            )  # int width, int height)
            if self.profile:
                self.events += [("get cnt", cp_evt), ("interp_keypoint %s %s" % (octave, scale), evt)]

            newcnt = self._compact(last_start)
            evt = self.programs["image"].compute_gradient_orientation(
                self.queue,
                self.procsize[octave],
                self.wgsize[octave],
                self.buffers[scale].data,  # __global float* igray,
                self.buffers["tmp"].data,  # __global float *grad,
                self.buffers["ori"].data,  # __global float *ori,
                *self.scales[octave]
            )  # int width,int height
            if self.profile:
                self.events.append(("compute_gradient_orientation %s %s" % (octave, scale), evt))

            # Orientation assignment: 1D kernel, rather heavy kernel
            if newcnt and newcnt > last_start:  # launch kernel only if needed

                if self.USE_CPU:
                    file_to_use = "orientation_cpu"
                else:
                    file_to_use = "orientation_gpu"

                wgsize2 = (self.kernels[file_to_use],)
                procsize = (int(newcnt * wgsize2[0]),)
                evt = self.programs[file_to_use].orientation_assignment(
                    self.queue,
                    procsize,
                    wgsize2,
                    self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                    self.buffers["tmp"].data,  # __global float* grad,
                    self.buffers["ori"].data,  # __global float* ori,
                    self.buffers["cnt"].data,  # __global int* counter,
                    octsize,  # int octsize,
                    numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                    kpsize32,  # int max of nb_keypoints,
                    numpy.int32(last_start),  # int keypoints_start,
                    newcnt,  # int keypoints_end,
                    *self.scales[octave]
                )  # int grad_width, int grad_height)
                evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
                newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !

                # evt2 stays None when every attempt below fails, so that the
                # profiling code never references an unbound name.
                evt2 = None
                for _ in range(3):
                    # up to 3 attempts
                    if (not self.USE_CPU) and (self.LOW_END == 0) and ("keypoints_gpu2" in self.kernels):
                        file_to_use = "keypoints_gpu2"
                        logger.info("Computing descriptors with newer-GPU optimized kernels")
                        wgsize2 = self.kernels[file_to_use]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                    elif (not self.USE_CPU) and (self.LOW_END == 1) and ("keypoints_gpu1" in self.kernels):
                        file_to_use = "keypoints_gpu1"
                        logger.info("Computing descriptors with older-GPU optimized kernels")
                        wgsize2 = self.kernels[file_to_use]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                    else:
                        # self.USE_CPU or self.LOW_END == 2, fail-safe fall-back
                        file_to_use = "keypoints_cpu"
                        logger.info("Computing descriptors with CPU optimized kernels")
                        wgsize2 = (self.kernels[file_to_use],)
                        procsize2 = (int(newcnt * wgsize2[0]),)
                    try:
                        evt2 = self.programs[file_to_use].descriptor(
                            self.queue,
                            procsize2,
                            wgsize2,
                            self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                            self.buffers["descriptors"].data,  # ___global unsigned char *descriptors
                            self.buffers["tmp"].data,  # __global float* grad,
                            self.buffers["ori"].data,  # __global float* ori,
                            octsize,  # int octsize,
                            numpy.int32(last_start),  # int keypoints_start,
                            self.buffers["cnt"].data,  # int* keypoints_end,
                            *self.scales[octave]
                        )  # int grad_width, int grad_height)
                    except pyopencl.RuntimeError as error:
                        # Degrade to the next, more conservative, kernel and retry
                        self.LOW_END += 1
                        logger.error("Descriptor failed with %s. Switching to lower_end mode" % error)
                    else:
                        break
                else:
                    # All attempts raised: report it instead of failing silently
                    logger.error("Descriptor calculation failed 3 times for octave %s scale %s" % (octave, scale))
                if self.profile:
                    self.events += [
                        ("orientation_assignment %s %s" % (octave, scale), evt),
                        ("copy cnt D->H", evt_cp),
                    ]
                    if evt2 is not None:
                        self.events.append(("descriptors %s %s" % (octave, scale), evt2))

            # The counter read here is the start index for the next scale
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.buffers["cnt"].data)
            last_start = self.cnt[0]
            if self.profile:
                self.events.append(("copy cnt D->H", evt_cp))

        ########################################################################
        # Rescale all images to populate all octaves
        ########################################################################
        if octave < self.octave_max - 1:
            evt = self.programs["preprocess"].shrink(
                self.queue,
                self.procsize[octave + 1],
                self.wgsize[octave + 1],
                self.buffers[par.Scales].data,
                self.buffers[0].data,
                numpy.int32(2),
                numpy.int32(2),
                self.scales[octave][0],
                self.scales[octave][1],
                *self.scales[octave + 1]
            )
            if self.profile:
                self.events.append(("shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt))
        # Host-side containers sized from the final keypoint counter
        results = numpy.empty((last_start, 4), dtype=numpy.float32)
        descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
        if last_start:
            evt = pyopencl.enqueue_copy(self.queue, results, self.buffers["Kp_1"].data)
            evt2 = pyopencl.enqueue_copy(self.queue, descriptors, self.buffers["descriptors"].data)
            if self.profile:
                self.events += [("copy D->H", evt), ("copy D->H", evt2)]
        return results, descriptors
Ejemplo n.º 7
0
    def keypoints(self, image):
        """Detect the keypoints of an image and compute their descriptors

        The image is sent to the device (with a conversion to float when its
        type requires it), normalized with the help of its minimum and maximum,
        blurred if the initial sigma is not reached yet and then processed
        octave after octave. Keypoints whose coordinates sum up to NaN are
        dropped before the results of all octaves are merged.

        :param image: ndimage of 2D (or 3D if RGB)
        :return: vector of keypoint (1D numpy array)
        """
        self.reset_timer()
        with self._sem:
            assert image.shape[:2] == self.shape
            assert image.dtype in [self.dtype, numpy.float32]
            found_kp = []
            found_desc = []
            nb_valid = 0
            is_gpu_array = isinstance(image, pyopencl.array.Array)
            # old versions of pyopencl do not check for data contiguity
            if not is_gpu_array and not image.flags["C_CONTIGUOUS"]:
                image = numpy.ascontiguousarray(image)
            tic = time.time()

            # ---- upload the image, converting it to float if needed ----
            if image.dtype == numpy.float32:
                source = image.data if is_gpu_array else image
                event = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, source)
                if self.profile:
                    self.events.append(("copy H->D", event))
            elif self.dtype == numpy.float64:
                # A preprocessing kernel double_to_float exists, but is disabled (RUNS ONLY ON GPU WITH FP64)
                # TODO: benchmark this kernel vs the current pure CPU format conversion with numpy.float32
                #       and enable it if it proves faster (dubious, because of data transfer bottleneck)
                event = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, image.astype(numpy.float32))
                if self.profile:
                    self.events.append(("copy H->D", event))
            elif (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB):
                source = image.data if is_gpu_array else image
                event = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, source)
                if self.profile:
                    self.events.append(("copy H->D", event))

                event = self.programs["preprocess"].rgb_to_float(self.queue, self.procsize[0], self.wgsize[0],
                                                                 self.buffers["raw"].data,
                                                                 self.buffers[0].data,
                                                                 *self.scales[0])
                if self.profile:
                    self.events.append(("RGB -> float", event))

            elif self.dtype in self.converter:
                convert = self.programs["preprocess"].__getattr__(self.converter[self.dtype])
                event = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image)
                if self.profile:
                    self.events.append(("copy H->D", event))
                event = convert(self.queue, self.procsize[0], self.wgsize[0],
                                self.buffers["raw"].data,
                                self.buffers[0].data,
                                *self.scales[0])
                if self.profile:
                    self.events.append(("convert -> float", event))
            else:
                raise RuntimeError("invalid input format error (%s)" % (str(self.dtype)))

            # ---- min/max of the image: two-stage reduction or serial fall-back ----
            wg1 = self.kernels["reductions.max_min_global_stage1"]
            wg2 = self.kernels["reductions.max_min_global_stage2"]
            serial_fallback = min(wg1, wg2) < self.red_size
            if not serial_fallback:
                stage1 = self.programs["reductions"].max_min_global_stage1
                stage2 = self.programs["reductions"].max_min_global_stage2
                k1 = stage1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                            self.buffers[0].data,
                            self.buffers["max_min"].data,
                            numpy.uint32(self.shape[0] * self.shape[1]))
                k2 = stage2(self.queue, (self.red_size,), (self.red_size,),
                            self.buffers["max_min"].data,
                            self.buffers["max"].data,
                            self.buffers["min"].data)

                if self.profile:
                    self.events.append(("max_min_stage1", k1))
                    self.events.append(("max_min_stage2", k2))
            else:
                # common bug on OSX when running on CPU
                logger.info(
                    "Unable to use MinMax Reduction: stage1 wg: %s; stage2 wg: %s < max_work_group_size: %s, expected: %s",
                    wg1, wg2, self.max_workgroup_size, self.red_size)
                serial = self.programs["reductions"].max_min_serial
                k = serial(self.queue, (1,), (1,),
                           self.buffers[0].data,
                           numpy.uint32(self.shape[0] * self.shape[1]),
                           self.buffers["max"].data,
                           self.buffers["min"].data)
                if self.profile:
                    self.events.append(("max_min_serial", k))

            # ---- normalization ----
            event = self.programs["preprocess"].normalizes(self.queue, self.procsize[0], self.wgsize[0],
                                                           self.buffers[0].data,
                                                           self.buffers["min"].data,
                                                           self.buffers["max"].data,
                                                           self.buffers["255"].data,
                                                           *self.scales[0])
            if self.profile:
                self.events.append(("normalize", event))

            # ---- initial blur, only when the requested sigma is larger than the current one ----
            if par.DoubleImSize:
                current_sigma = 1.0
            else:
                current_sigma = 0.5
            if self._init_sigma > current_sigma:
                logger.debug("Bluring image to achieve std: %f", self._init_sigma)
                extra_sigma = math.sqrt(self._init_sigma ** 2 - current_sigma ** 2)
                self._gaussian_convolution(self.buffers[0], self.buffers[0], extra_sigma, 0)

            # ---- octave by octave processing ----
            for octave in range(self.octave_max):
                kp, descriptor = self._one_octave(octave)
                logger.info("in octave %i found %i kp" % (octave, kp.shape[0]))

                if kp.shape[0]:
                    # sieve out coordinates with NaNs
                    valid_rows = numpy.where(~numpy.isnan(kp.sum(axis=-1)))
                    found_kp.append(kp[valid_rows])
                    found_desc.append(descriptor[valid_rows])
                    nb_valid += len(valid_rows[0])

            ########################################################################
            # Merge keypoints in central memory
            ########################################################################
            output = numpy.recarray(shape=(nb_valid,), dtype=self.dtype_kp)
            offset = 0
            for coords, desc in zip(found_kp, found_desc):
                count = coords.shape[0]
                if count <= 0:
                    continue
                chunk = output[offset:offset + count]
                chunk.x = coords[:, 0]
                chunk.y = coords[:, 1]
                chunk.scale = coords[:, 2]
                chunk.angle = coords[:, 3]
                chunk.desc = desc
                offset += count
            logger.info("Execution time: %.3fms" % (1000 * (time.time() - tic)))
        return output
Ejemplo n.º 8
0
    def match(self, nkp1, nkp2, raw_results=False):
        """Calculate the matching of 2 keypoint list

        Host arrays are uploaded into the "Kp_1" / "Kp_2" buffers (which are
        re-allocated when too small); GPU arrays are used as they are. The
        number of matches is then read back from the "cnt" buffer.

        :param nkp1, nkp2: numpy 1D recarray of keypoints or equivalent GPU buffer
        :param raw_results: if true return the 2D array of indexes of matching keypoints (not the actual keypoints)
        :return: array of shape (n, 2), n being the number of matches: int32
                 indexes if raw_results is true, else a recarray holding the
                 matching keypoints of nkp1 (column 0) and nkp2 (column 1)

        TODO: implement the ROI ...

        """
        assert len(nkp1.shape) == 1  # Nota: nkp1.ndim is not valid for gpu_arrays
        assert len(nkp2.shape) == 1
        # numpy.recarray is the public name of numpy.core.records.recarray
        # (the numpy.core path is deprecated since NumPy 2.0)
        valid_types = (numpy.ndarray, numpy.recarray, pyopencl.array.Array)
        assert isinstance(nkp1, valid_types)
        assert isinstance(nkp2, valid_types)
        result = None
        with self._sem:
            if isinstance(nkp1, pyopencl.array.Array):
                kpt1_gpu = nkp1
            else:
                if nkp1.size > self.buffers["Kp_1"].size:
                    logger.warning("increasing size of keypoint vector 1 to %i" % nkp1.size)
                    self.buffers["Kp_1"] = pyopencl.array.empty(self.queue, (nkp1.size,), dtype=self.dtype_kp)
                kpt1_gpu = self.buffers["Kp_1"]
                self._reset_buffer1()
                evt1 = pyopencl.enqueue_copy(self.queue, kpt1_gpu.data, nkp1)
                if self.profile:
                    self.events.append(("copy H->D KP_1", evt1))

            if isinstance(nkp2, pyopencl.array.Array):
                kpt2_gpu = nkp2
            else:
                if nkp2.size > self.buffers["Kp_2"].size:
                    logger.warning("increasing size of keypoint vector 2 to %i" % nkp2.size)
                    self.buffers["Kp_2"] = pyopencl.array.empty(self.queue, (nkp2.size,), dtype=self.dtype_kp)
                kpt2_gpu = self.buffers["Kp_2"]
                self._reset_buffer2()
                evt2 = pyopencl.enqueue_copy(self.queue, kpt2_gpu.data, nkp2)
                if self.profile:
                    self.events.append(("copy H->D KP_2", evt2))

            # Grow the output buffer when it is smaller than the shorter keypoint list
            if min(kpt1_gpu.size, kpt2_gpu.size) > self.buffers["match"].shape[0]:
                self.kpsize = min(kpt1_gpu.size, kpt2_gpu.size)
                self.buffers["match"] = pyopencl.array.empty(self.queue, (self.kpsize, 2), dtype=numpy.int32)
            self._reset_output()
            wg = self.kernels[self.matching_kernel+".matching"]
            size = calc_size((nkp1.size,), (wg,))
            evt = self.programs[self.matching_kernel].matching(self.queue, size, (wg,),
                                                               kpt1_gpu.data,
                                                               kpt2_gpu.data,
                                                               self.buffers["match"].data,
                                                               self.buffers["cnt"].data,
                                                               numpy.int32(self.kpsize),
                                                               numpy.float32(par.MatchRatio * par.MatchRatio),
                                                               numpy.int32(nkp1.size),
                                                               numpy.int32(nkp2.size))
            if self.profile:
                self.events.append(("matching", evt))
            # Number of matches, as counted on the device
            size = self.buffers["cnt"].get()[0]
            match = numpy.empty(shape=(size, 2), dtype=numpy.int32)
            if size > 0:
                cpyD2H = pyopencl.enqueue_copy(self.queue, match, self.buffers["match"].data)
                # The copy event only exists when something was copied:
                # record it here to avoid an unbound name when size == 0.
                if self.profile:
                    self.events.append(("copy D->H match", cpyD2H))
            if raw_results:
                result = match
            else:
                result = numpy.recarray(shape=(size, 2), dtype=self.dtype_kp)

                result[:, 0] = nkp1[match[:size, 0]]
                result[:, 1] = nkp2[match[:size, 1]]
        return result
Ejemplo n.º 9
0
Archivo: plan.py Proyecto: PiRK/silx
    def keypoints(self, image):
        """Calculates the keypoints of the image

        The image is uploaded to the device (with a conversion to float when
        its type requires it), normalized with the help of its minimum and
        maximum, blurred if the initial sigma is not reached yet and then
        processed octave after octave. Keypoints whose coordinates sum up to
        NaN are discarded before the per-octave results are merged.

        :param image: ndimage of 2D (or 3D if RGB)
        :return: vector of keypoint (1D numpy array)
        :raises RuntimeError: when the image type matches none of the
                              supported input formats
        """
        self.reset_timer()
        with self._sem:
            total_size = 0
            keypoints = []
            descriptors = []
            assert image.shape[:2] == self.shape
            assert image.dtype in [self.dtype, numpy.float32]
            # old versions of pyopencl do not check for data contiguity:
            # make sure host arrays are C-contiguous before any enqueue_copy
            if not isinstance(image, pyopencl.array.Array) and not image.flags["C_CONTIGUOUS"]:
                image = numpy.ascontiguousarray(image)
            t0 = time.time()

            if image.dtype == numpy.float32:
                if isinstance(image, pyopencl.array.Array):
                    evt = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, image.data)
                else:
                    evt = pyopencl.enqueue_copy(self.queue, self.buffers[0].data, image)
                if self.profile:
                    self.events.append(("copy H->D", evt))
            elif (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB):
                if isinstance(image, pyopencl.array.Array):
                    evt = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image.data)
                else:
                    evt = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image)
                if self.profile:
                    self.events.append(("copy H->D", evt))

                evt = self.programs["preprocess"].rgb_to_float(self.queue, self.procsize[0], self.wgsize[0],
                                                               self.buffers["raw"].data, self.buffers[0].data,
                                                               *self.scales[0])
                if self.profile:
                    self.events.append(("RGB -> float", evt))

            elif self.dtype in self.converter:
                program = self.programs["preprocess"].__getattr__(self.converter[self.dtype])
                evt = pyopencl.enqueue_copy(self.queue, self.buffers["raw"].data, image)
                if self.profile:
                    self.events.append(("copy H->D", evt))
                evt = program(self.queue, self.procsize[0], self.wgsize[0],
                              self.buffers["raw"].data, self.buffers[0].data, *self.scales[0])
                if self.profile:
                    self.events.append(("convert -> float", evt))
            else:
                raise RuntimeError("invalid input format error")

            # Two-stage reduction writing into the "max" and "min" buffers
            k1 = self.programs["reductions"].max_min_global_stage1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                                                                   self.buffers[0].data,
                                                                   self.buffers["max_min"].data,
                                                                   numpy.uint32(self.shape[0] * self.shape[1]))
            k2 = self.programs["reductions"].max_min_global_stage2(self.queue, (self.red_size,), (self.red_size,),
                                                                   self.buffers["max_min"].data,
                                                                   self.buffers["max"].data,
                                                                   self.buffers["min"].data)
            if self.profile:
                self.events.append(("max_min_stage1", k1))
                self.events.append(("max_min_stage2", k2))
            evt = self.programs["preprocess"].normalizes(self.queue, self.procsize[0], self.wgsize[0],
                                                         self.buffers[0].data,
                                                         self.buffers["min"].data,
                                                         self.buffers["max"].data,
                                                         self.buffers["255"].data,
                                                         *self.scales[0])
            if self.profile:
                self.events.append(("normalize", evt))

            # Blur only when the requested initial sigma exceeds the current one
            curSigma = 1.0 if par.DoubleImSize else 0.5
            if self._init_sigma > curSigma:
                logger.debug("Bluring image to achieve std: %f", self._init_sigma)
                sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2)
                self._gaussian_convolution(self.buffers[0], self.buffers[0], sigma, 0)

            for octave in range(self.octave_max):
                kp, descriptor = self._one_octave(octave)
                logger.info("in octave %i found %i kp" % (octave, kp.shape[0]))

                if kp.shape[0] > 0:
                    # sieve out coordinates with NaNs
                    mask = numpy.where(numpy.logical_not(numpy.isnan(kp.sum(axis=-1))))
                    keypoints.append(kp[mask])
                    descriptors.append(descriptor[mask])
                    total_size += len(mask[0])

            ########################################################################
            # Merge keypoints in central memory
            ########################################################################
            output = numpy.recarray(shape=(total_size,), dtype=self.dtype_kp)
            last = 0
            for ds, desc in zip(keypoints, descriptors):
                nb = ds.shape[0]
                if nb > 0:
                    output[last:last + nb].x = ds[:, 0]
                    output[last:last + nb].y = ds[:, 1]
                    output[last:last + nb].scale = ds[:, 2]
                    output[last:last + nb].angle = ds[:, 3]
                    output[last:last + nb].desc = desc
                    last += nb
            logger.info("Execution time: %.3fms" % (1000 * (time.time() - t0)))
        return output
Ejemplo n.º 10
0
    def match(self, nkp1, nkp2, raw_results=False):
        """Calculate the matching of 2 keypoint list

        Host arrays are uploaded into the "Kp_1" / "Kp_2" buffers (which are
        re-allocated when too small); GPU arrays are used as they are. The
        number of matches is then read back from the "cnt" buffer.

        :param nkp1: numpy 1D recarray of keypoints or equivalent GPU buffer
        :param nkp2: numpy 1D recarray of keypoints or equivalent GPU buffer
        :param raw_results: if true return the 2D array of indexes of matching keypoints (not the actual keypoints)
        :return: array of shape (n, 2), n being the number of matches: int32
                 indexes if raw_results is true, else a recarray holding the
                 matching keypoints of nkp1 (column 0) and nkp2 (column 1)

        TODO: implement the ROI ...
        """
        assert len(
            nkp1.shape) == 1  # Nota: nkp1.ndim is not valid for gpu_arrays
        assert len(nkp2.shape) == 1
        # numpy.recarray is the public name of numpy.core.records.recarray
        # (the numpy.core path is deprecated since NumPy 2.0)
        valid_types = (numpy.ndarray, numpy.recarray,
                       pyopencl.array.Array)
        assert isinstance(nkp1, valid_types)
        assert isinstance(nkp2, valid_types)
        result = None
        with self._sem:
            if isinstance(nkp1, pyopencl.array.Array):
                kpt1_gpu = nkp1
            else:
                if nkp1.size > self.buffers["Kp_1"].size:
                    logger.warning(
                        "increasing size of keypoint vector 1 to %i" %
                        nkp1.size)
                    self.buffers["Kp_1"] = pyopencl.array.empty(
                        self.queue, (nkp1.size, ), dtype=self.dtype_kp)
                kpt1_gpu = self.buffers["Kp_1"]
                self._reset_buffer1()
                evt1 = pyopencl.enqueue_copy(self.queue, kpt1_gpu.data, nkp1)
                if self.profile:
                    self.events.append(("copy H->D KP_1", evt1))

            if isinstance(nkp2, pyopencl.array.Array):
                kpt2_gpu = nkp2
            else:
                if nkp2.size > self.buffers["Kp_2"].size:
                    logger.warning(
                        "increasing size of keypoint vector 2 to %i" %
                        nkp2.size)
                    self.buffers["Kp_2"] = pyopencl.array.empty(
                        self.queue, (nkp2.size, ), dtype=self.dtype_kp)
                kpt2_gpu = self.buffers["Kp_2"]
                self._reset_buffer2()
                evt2 = pyopencl.enqueue_copy(self.queue, kpt2_gpu.data, nkp2)
                if self.profile:
                    self.events.append(("copy H->D KP_2", evt2))

            # Grow the output buffer when it is smaller than the shorter keypoint list
            if min(kpt1_gpu.size,
                   kpt2_gpu.size) > self.buffers["match"].shape[0]:
                self.kpsize = min(kpt1_gpu.size, kpt2_gpu.size)
                self.buffers["match"] = pyopencl.array.empty(self.queue,
                                                             (self.kpsize, 2),
                                                             dtype=numpy.int32)
            self._reset_output()
            wg = self.kernels[self.matching_kernel + ".matching"]
            size = calc_size((nkp1.size, ), (wg, ))
            evt = self.programs[self.matching_kernel].matching(
                self.queue, size, (wg, ), kpt1_gpu.data, kpt2_gpu.data,
                self.buffers["match"].data, self.buffers["cnt"].data,
                numpy.int32(self.kpsize),
                numpy.float32(par.MatchRatio * par.MatchRatio),
                numpy.int32(nkp1.size), numpy.int32(nkp2.size))
            if self.profile:
                self.events.append(("matching", evt))
            # Number of matches, as counted on the device
            size = self.buffers["cnt"].get()[0]
            match = numpy.empty(shape=(size, 2), dtype=numpy.int32)
            if size > 0:
                cpyD2H = pyopencl.enqueue_copy(self.queue, match,
                                               self.buffers["match"].data)
                # The copy event only exists when something was copied:
                # record it here to avoid an unbound name when size == 0.
                if self.profile:
                    self.events.append(("copy D->H match", cpyD2H))
            if raw_results:
                result = match
            else:
                result = numpy.recarray(shape=(size, 2), dtype=self.dtype_kp)

                result[:, 0] = nkp1[match[:size, 0]]
                result[:, 1] = nkp2[match[:size, 1]]
        return result
Ejemplo n.º 11
0
Archivo: plan.py Proyecto: dnaudet/silx
    def _one_octave(self, octave):
        """
        Does all scales within an octave

        For ``par.Scales + 2`` scales the gaussian blur and the difference of
        gaussians (DoG) are computed. Then, for each scale in
        ``1 .. par.Scales``: local extrema are searched, keypoints are
        interpolated and compacted, the gradient/orientation images are
        computed and, if new keypoints were found, the orientation and
        descriptor kernels are run. Unless this is the last octave, the
        "shrink" kernel finally populates ``scale_0`` for the next octave.

        :param octave: number of the octave
        :return: 2-tuple (keypoints, descriptors): a float32 array of shape
                 (n, 4) read back from the "Kp_1" buffer and a uint8 array of
                 shape (n, 128) read back from the "descriptors" buffer, where
                 n is the keypoint counter read after the last scale.
        """
        prevSigma = self._init_sigma
        logger.info("Calculating octave %i" % octave)
        wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
        kpsize32 = numpy.int32(self.kpsize)
        self._reset_keypoints()
        octsize = numpy.int32(2 ** octave)
        last_start = numpy.int32(0)
        for scale in range(par.Scales + 2):
            sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
            logger.info("Octave %i scale %s blur with sigma %s" % (octave, scale, sigma))

            ########################################################################
            # Calculate gaussian blur and DoG
            ########################################################################

            self._gaussian_convolution(self.cl_mem["scale_%i" % scale], self.cl_mem["scale_%i" % (scale + 1)], sigma, octave)
            prevSigma *= self.sigmaRatio
            # "combine" is called with weights -1 and +1: scale - (scale + 1) goes into the DoG stack
            evt = self.kernels.get_kernel("combine")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                     self.cl_mem["scale_%i" % (scale + 1)].data, numpy.float32(-1.0),
                                                     self.cl_mem["scale_%i" % (scale)].data, numpy.float32(+1.0),
                                                     self.cl_mem["DoGs"].data, numpy.int32(scale),
                                                     *self.scales[octave])
            if self.profile:
                self.events.append(("DoG %s %s" % (octave, scale), evt))
        for scale in range(1, par.Scales + 1):
            evt = self.kernels.get_kernel("local_maxmin")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                          self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                          self.cl_mem["Kp_1"].data,  # __global keypoint* output,
                                                          numpy.int32(par.BorderDist),  # int border_dist,
                                                          numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                          octsize,  # int octsize,
                                                          numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                                                          numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                                                          self.cl_mem["cnt"].data,  # __global int* counter,
                                                          kpsize32,  # int nb_keypoints,
                                                          numpy.int32(scale),  # int scale,
                                                          *self.scales[octave])  # int width, int height)
            if self.profile:
                self.events.append(("local_maxmin %s %s" % (octave, scale), evt))
            procsize = calc_size((self.kpsize,), wgsize)
            # Read the keypoint counter back to the host: it is the end index for the interpolation
            cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
            evt = self.kernels.get_kernel("interp_keypoint")(self.queue, procsize, wgsize,
                                                             self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                             self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                                             last_start,  # int start_keypoint,
                                                             self.cnt[0],  # int end_keypoint,
                                                             numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                             numpy.float32(self._init_sigma),  # float InitSigma,
                                                             *self.scales[octave])  # int width, int height)
            if self.profile:
                self.events += [("get cnt", cp_evt),
                                ("interp_keypoint %s %s" % (octave, scale), evt)
                                ]

            newcnt = self._compact(last_start)
            evt = self.kernels.get_kernel("compute_gradient_orientation")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                                          self.cl_mem["scale_%s" % (scale)].data,  # __global float* igray,
                                                                          self.cl_mem["tmp"].data,  # __global float *grad,
                                                                          self.cl_mem["ori"].data,  # __global float *ori,
                                                                          *self.scales[octave])  # int width,int height
            if self.profile:
                self.events.append(("compute_gradient_orientation %s %s" % (octave, scale), evt))

            # Orientation assignment: 1D kernel, rather heavy kernel
            if newcnt and newcnt > last_start:  # launch kernel only if needed

                if self.USE_CPU:
                    orientation_name = "orientation_cpu"
                    scales = self.scales[octave]
                else:
                    orientation_name = "orientation_gpu"
                    # the GPU kernel takes three extra local-memory arguments
                    scales = list(self.scales[octave]) + \
                             [pyopencl.LocalMemory(36 * 4),
                              pyopencl.LocalMemory(128 * 4),
                              pyopencl.LocalMemory(128 * 4)]
                orientation = self.kernels.get_kernel(orientation_name)
                wg = self.kernels_max_wg_size[orientation_name]
                wgsize2 = (wg,)
                procsize = (int(newcnt * wg),)
                evt = orientation(self.queue, procsize, wgsize2,
                                  self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                  self.cl_mem["tmp"].data,  # __global float* grad,
                                  self.cl_mem["ori"].data,  # __global float* ori,
                                  self.cl_mem["cnt"].data,  # __global int* counter,
                                  octsize,  # int octsize,
                                  numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                                  kpsize32,  # int max of nb_keypoints,
                                  numpy.int32(last_start),  # int keypoints_start,
                                  newcnt,  # int keypoints_end,
                                  *scales)  # int grad_width, int grad_height)
                evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
                newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !

                # evt2 only ever refers to a descriptor launch that succeeded,
                # so that the profiling code below never sees an unbound name
                # nor a failed event.
                evt2 = None
                for _ in range(3):
                    # up to 3 attempts, each failure increments self.LOW_END
                    if self.USE_CPU or (self.LOW_END > 1):
                        logger.info("Computing descriptors with CPU optimized kernels")
                        descriptor_name = "descriptor_cpu"
                        wg = self.kernels_max_wg_size[descriptor_name][0]
                        wgsize2 = (wg,)
                        procsize2 = (int(newcnt * wg),)
                    else:
                        if self.LOW_END:
                            logger.info("Computing descriptors with older-GPU optimized kernels")
                            descriptor_name = "descriptor_gpu1"
                            wgsize2 = self.kernels_max_wg_size[descriptor_name]
                            procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                            if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2):
                                # will fail anyway:
                                self.LOW_END += 1
                                continue
                        else:
                            logger.info("Computing descriptors with newer-GPU optimized kernels")
                            descriptor_name = "descriptor_gpu2"
                            wgsize2 = self.kernels_max_wg_size[descriptor_name]
                            procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
                            if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2):
                                # will fail anyway:
                                self.LOW_END += 1
                                continue
                    try:
                        descriptor = self.kernels.get_kernel(descriptor_name)
                        evt2 = descriptor(self.queue, procsize2, wgsize2,
                                          self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                          self.cl_mem["descriptors"].data,  # ___global unsigned char *descriptors
                                          self.cl_mem["tmp"].data,  # __global float* grad,
                                          self.cl_mem["ori"].data,  # __global float* ori,
                                          octsize,  # int octsize,
                                          numpy.int32(last_start),  # int keypoints_start,
                                          self.cl_mem["cnt"].data,  # int* keypoints_end,
                                          *self.scales[octave])  # int grad_width, int grad_height)
                        # wait inside the try-block so that execution errors are caught here
                        evt2.wait()
                    except pyopencl.RuntimeError as error:
                        evt2 = None  # do not keep a reference to a failed event
                        self.LOW_END += 1
                        logger.error("Descriptor failed with %s. Switching to lower_end mode" % error)
                        continue
                    else:
                        break
                else:
                    logger.error("Descriptor calculation failed for octave %s scale %s after 3 attempts" % (octave, scale))
                if self.profile:
                    self.events += [("%s %s %s" % (orientation_name, octave, scale), evt),
                                    ("copy cnt D->H", evt_cp)]
                    if evt2 is not None:
                        self.events.append(("%s %s %s" % (descriptor_name, octave, scale), evt2))
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            last_start = self.cnt[0]
            if self.profile:
                self.events.append(("copy cnt D->H", evt_cp))

        ########################################################################
        # Rescale all images to populate all octaves
        ########################################################################
        if octave < self.octave_max - 1:
            evt = self.kernels.get_kernel("shrink")(self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
                                                    self.cl_mem["scale_%i" % (par.Scales)].data,
                                                    self.cl_mem["scale_0"].data,
                                                    numpy.int32(2), numpy.int32(2),
                                                    self.scales[octave][0], self.scales[octave][1],
                                                    *self.scales[octave + 1])
            if self.profile:
                self.events.append(("shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt))
        # Host-side copies of the keypoints and descriptors found in this octave
        results = numpy.empty((last_start, 4), dtype=numpy.float32)
        descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
        if last_start:
            evt = pyopencl.enqueue_copy(self.queue, results, self.cl_mem["Kp_1"].data)
            evt2 = pyopencl.enqueue_copy(self.queue, descriptors, self.cl_mem["descriptors"].data)
            if self.profile:
                self.events += [("copy D->H", evt),
                                ("copy D->H", evt2)]
        return results, descriptors
Ejemplo n.º 12
0
    def _one_octave(self, octave):
        """
        Does all scales within an octave

        For ``par.Scales + 2`` scales the gaussian blur and the difference of
        gaussians (DoG) are computed. Then, for each scale in
        ``1 .. par.Scales``: local extrema are searched, keypoints are
        interpolated and compacted, the gradient/orientation images are
        computed and, if new keypoints were found, the orientation and
        descriptor kernels are run. Unless this is the last octave, the
        "shrink" kernel finally populates buffer 0 for the next octave.

        :param octave: number of the octave
        :return: 2-tuple (keypoints, descriptors): a float32 array of shape
                 (n, 4) read back from the "Kp_1" buffer and a uint8 array of
                 shape (n, 128) read back from the "descriptors" buffer, where
                 n is the keypoint counter read after the last scale.
        """
        prevSigma = self._init_sigma
        logger.info("Calculating octave %i" % octave)
        wgsize = (128, )  # (max(self.wgsize[octave]),) #TODO: optimize
        kpsize32 = numpy.int32(self.kpsize)
        self._reset_keypoints()
        octsize = numpy.int32(2**octave)
        last_start = numpy.int32(0)
        for scale in range(par.Scales + 2):
            sigma = prevSigma * math.sqrt(self.sigmaRatio**2 - 1.0)
            logger.info("Octave %i scale %s blur with sigma %s" %
                        (octave, scale, sigma))

            ########################################################################
            # Calculate gaussian blur and DoG
            ########################################################################

            self._gaussian_convolution(self.buffers[scale],
                                       self.buffers[scale + 1], sigma, octave)
            prevSigma *= self.sigmaRatio
            # "combine" is called with weights -1 and +1: scale - (scale + 1) goes into the DoG stack
            evt = self.programs["algebra"].combine(
                self.queue, self.procsize[octave],
                self.wgsize[octave], self.buffers[scale + 1].data,
                numpy.float32(-1.0), self.buffers[scale].data,
                numpy.float32(+1.0), self.buffers["DoGs"].data,
                numpy.int32(scale), *self.scales[octave])
            if self.profile:
                self.events.append(("DoG %s %s" % (octave, scale), evt))
        for scale in range(1, par.Scales + 1):
            evt = self.programs["image"].local_maxmin(
                self.queue,
                self.procsize[octave],
                self.wgsize[octave],
                self.buffers["DoGs"].data,  # __global float* DOGS,
                self.buffers["Kp_1"].data,  # __global keypoint* output,
                numpy.int32(par.BorderDist),  # int border_dist,
                numpy.float32(par.PeakThresh),  # float peak_thresh,
                octsize,  # int octsize,
                numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                self.buffers["cnt"].data,  # __global int* counter,
                kpsize32,  # int nb_keypoints,
                numpy.int32(scale),  # int scale,
                *self.scales[octave])  # int width, int height)
            if self.profile:
                self.events.append(
                    ("local_maxmin %s %s" % (octave, scale), evt))
            procsize = calc_size((self.kpsize, ), wgsize)
            # Read the keypoint counter back to the host: it is the end index for the interpolation
            cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt,
                                           self.buffers["cnt"].data)
            # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
            evt = self.programs["image"].interp_keypoint(
                self.queue,
                procsize,
                wgsize,
                self.buffers["DoGs"].data,  # __global float* DOGS,
                self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                last_start,  # int start_keypoint,
                self.cnt[0],  # int end_keypoint,
                numpy.float32(par.PeakThresh),  # float peak_thresh,
                numpy.float32(self._init_sigma),  # float InitSigma,
                *self.scales[octave])  # int width, int height)
            if self.profile:
                self.events += [("get cnt", cp_evt),
                                ("interp_keypoint %s %s" % (octave, scale),
                                 evt)]

            newcnt = self._compact(last_start)
            evt = self.programs["image"].compute_gradient_orientation(
                self.queue,
                self.procsize[octave],
                self.wgsize[octave],
                self.buffers[scale].data,  # __global float* igray,
                self.buffers["tmp"].data,  # __global float *grad,
                self.buffers["ori"].data,  # __global float *ori,
                *self.scales[octave])  # int width,int height
            if self.profile:
                self.events.append(
                    ("compute_gradient_orientation %s %s" % (octave, scale),
                     evt))

            # Orientation assignment: 1D kernel, rather heavy kernel
            if newcnt and newcnt > last_start:  # launch kernel only if needed

                if self.USE_CPU:
                    file_to_use = "orientation_cpu"
                else:
                    file_to_use = "orientation_gpu"

                wgsize2 = (self.kernels[file_to_use], )
                procsize = (int(newcnt * wgsize2[0]), )
                evt = self.programs[file_to_use].orientation_assignment(
                    self.queue,
                    procsize,
                    wgsize2,
                    self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                    self.buffers["tmp"].data,  # __global float* grad,
                    self.buffers["ori"].data,  # __global float* ori,
                    self.buffers["cnt"].data,  # __global int* counter,
                    octsize,  # int octsize,
                    numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                    kpsize32,  # int max of nb_keypoints,
                    numpy.int32(last_start),  # int keypoints_start,
                    newcnt,  # int keypoints_end,
                    *self.scales[octave])  # int grad_width, int grad_height)
                evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt,
                                               self.buffers["cnt"].data)
                # do not forget to update numbers of keypoints, modified above !
                newcnt = self.cnt[0]

                # evt2 stays None until a descriptor kernel has been launched
                # successfully, so that the profiling code below never sees an
                # unbound name.
                evt2 = None
                for _ in range(3):
                    # up to 3 attempts, each failure increments self.LOW_END
                    if (not self.USE_CPU) and (self.LOW_END == 0) and \
                            ("keypoints_gpu2" in self.kernels):
                        file_to_use = "keypoints_gpu2"
                        logger.info(
                            "Computing descriptors with newer-GPU optimized kernels"
                        )
                        wgsize2 = self.kernels[file_to_use]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1],
                                     wgsize2[2])
                    elif (not self.USE_CPU) and (self.LOW_END == 1) and \
                            ("keypoints_gpu1" in self.kernels):
                        file_to_use = "keypoints_gpu1"
                        logger.info(
                            "Computing descriptors with older-GPU optimized kernels"
                        )
                        wgsize2 = self.kernels[file_to_use]
                        procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1],
                                     wgsize2[2])
                    else:
                        # self.USE_CPU or self.LOW_END == 2, fail-safe fall-back
                        file_to_use = "keypoints_cpu"
                        logger.info(
                            "Computing descriptors with CPU optimized kernels")
                        wgsize2 = (self.kernels[file_to_use], )
                        procsize2 = (int(newcnt * wgsize2[0]), )
                    try:
                        evt2 = self.programs[file_to_use].descriptor(
                            self.queue,
                            procsize2,
                            wgsize2,
                            self.buffers["Kp_1"].data,  # __global keypoint* keypoints,
                            self.buffers["descriptors"].data,  # ___global unsigned char *descriptors
                            self.buffers["tmp"].data,  # __global float* grad,
                            self.buffers["ori"].data,  # __global float* ori,
                            octsize,  # int octsize,
                            numpy.int32(last_start),  # int keypoints_start,
                            self.buffers["cnt"].data,  # int* keypoints_end,
                            *self.scales[octave])  # int grad_width, int grad_height)
                    except pyopencl.RuntimeError as error:
                        self.LOW_END += 1
                        logger.error(
                            "Descriptor failed with %s. Switching to lower_end mode"
                            % error)
                        continue
                    else:
                        break
                else:
                    logger.error(
                        "Descriptor calculation failed for octave %s scale %s after 3 attempts"
                        % (octave, scale))
                if self.profile:
                    self.events += [
                        ("orientation_assignment %s %s" % (octave, scale),
                         evt), ("copy cnt D->H", evt_cp)
                    ]
                    if evt2 is not None:
                        self.events.append(
                            ("descriptors %s %s" % (octave, scale), evt2))

            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt,
                                           self.buffers["cnt"].data)
            last_start = self.cnt[0]
            if self.profile:
                self.events.append(("copy cnt D->H", evt_cp))

        ########################################################################
        # Rescale all images to populate all octaves
        ########################################################################
        if octave < self.octave_max - 1:
            evt = self.programs["preprocess"].shrink(
                self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
                self.buffers[par.Scales].data, self.buffers[0].data,
                numpy.int32(2), numpy.int32(2), self.scales[octave][0],
                self.scales[octave][1], *self.scales[octave + 1])
            if self.profile:
                self.events.append(
                    ("shrink %s->%s" %
                     (self.scales[octave], self.scales[octave + 1]), evt))
        # Host-side copies of the keypoints and descriptors found in this octave
        results = numpy.empty((last_start, 4), dtype=numpy.float32)
        descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
        if last_start:
            evt = pyopencl.enqueue_copy(self.queue, results,
                                        self.buffers["Kp_1"].data)
            evt2 = pyopencl.enqueue_copy(self.queue, descriptors,
                                         self.buffers["descriptors"].data)
            if self.profile:
                self.events += [("copy D->H", evt), ("copy D->H", evt2)]
        return results, descriptors
Ejemplo n.º 13
0
    def align(self, img, shift_only=False, return_all=False, double_check=False, relative=False, orsa=False):
        """
        Align image on reference image

        :param img: numpy array containing the image to align to reference
        :param shift_only: if True, only a translation is applied (identity
                           matrix, offset = median displacement of the matched
                           keypoints). This mode is also used when fewer than
                           18 matches are found.
        :param return_all: return, in addition to the image, the keypoints,
                           the matching keypoints and the transformation as a dict
        :param double_check: when at least 18 matches are available, discard
                             the matches whose displacement, angle difference
                             or log-ratio of scales lies more than 4 standard
                             deviations from the mean and re-compute the
                             transformation from the remaining ones
        :param relative: update reference keypoints with those from current
                         image to perform relative alignment
        :param orsa: filter the matches with ``feature.sift_orsa`` when the
                     ``feature`` module is available
        :return: aligned image, or a dict with keys "result", "keypoint",
                 "matching", "offset", "matrix" and "rms" if return_all is set.
                 None is returned when no keypoints could be matched.
        """
        logger.debug("ref_keypoints: %s" % self.ref_kp.size)
        if self.RGB:
            data = numpy.ascontiguousarray(img, numpy.uint8)
        else:
            data = numpy.ascontiguousarray(img, numpy.float32)
        with self.sem:
            cpy = pyopencl.enqueue_copy(self.queue, self.buffers["input"].data, data)
            if self.profile:
                self.events.append(("Copy H->D", cpy))
            cpy.wait()
            kp = self.sift.keypoints(self.buffers["input"])
            logger.debug("mod image keypoints: %s" % kp.size)
            raw_matching = self.match.match(self.buffers["ref_kp_gpu"], kp, raw_results=True)

            matching = numpy.recarray(shape=raw_matching.shape, dtype=MatchPlan.dtype_kp)
            len_match = raw_matching.shape[0]
            if len_match == 0:
                logger.warning("No matching keypoints")
                return
            # raw_matching holds index pairs: column 0 -> reference, column 1 -> current image
            matching[:, 0] = self.ref_kp[raw_matching[:, 0]]
            matching[:, 1] = kp[raw_matching[:, 1]]

            if orsa:
                if feature:
                    # NOTE(review): len_match is not refreshed after this filtering,
                    # the thresholds below use the unfiltered count - confirm intent
                    matching = feature.sift_orsa(matching, self.shape, 1)
                else:
                    logger.warning("feature is not available. No ORSA filtering")

            if (len_match < 3 * 6) or (shift_only):  # 3 points per DOF
                if shift_only:
                    logger.debug("Shift Only mode: Common keypoints: %s" % len_match)
                else:
                    logger.warning("Shift Only mode: Common keypoints: %s" % len_match)
                dx = matching[:, 1].x - matching[:, 0].x
                dy = matching[:, 1].y - matching[:, 0].y
                matrix = numpy.identity(2, dtype=numpy.float32)
                offset = numpy.array([+numpy.median(dy), +numpy.median(dx)], numpy.float32)
            else:
                logger.debug("Common keypoints: %s" % len_match)

                transform_matrix = matching_correction(matching)
                # re-order the 6 coefficients into a (y, x) ordered matrix and offset
                offset = numpy.array([transform_matrix[5], transform_matrix[2]], dtype=numpy.float32)
                matrix = numpy.empty((2, 2), dtype=numpy.float32)
                matrix[0, 0], matrix[0, 1] = transform_matrix[4], transform_matrix[3]
                matrix[1, 0], matrix[1, 1] = transform_matrix[1], transform_matrix[0]
            # NOTE(review): this branch also runs when shift_only is set and may
            # then replace the pure translation by a full transformation - confirm
            if double_check and (len_match >= 3 * 6):  # and abs(matrix - numpy.identity(2)).max() > 0.1:
                logger.warning("Validating keypoints, %s,%s" % (matrix, offset))
                dx = matching[:, 1].x - matching[:, 0].x
                dy = matching[:, 1].y - matching[:, 0].y
                dangle = matching[:, 1].angle - matching[:, 0].angle
                dscale = numpy.log(matching[:, 1].scale / matching[:, 0].scale)
                distance = numpy.sqrt(dx * dx + dy * dy)
                # count, for each match, how many of the 3 criteria are beyond 4 sigma
                outlayer = numpy.zeros(distance.shape, numpy.int8)
                outlayer += abs((distance - distance.mean()) / distance.std()) > 4
                outlayer += abs((dangle - dangle.mean()) / dangle.std()) > 4
                outlayer += abs((dscale - dscale.mean()) / dscale.std()) > 4
                # library code must not write to stdout: report through the logger
                logger.debug("Outlier flags per match: %s", outlayer)
                outlayersum = outlayer.sum()
                # the sum of an integer array is never infinite: no isinf check needed
                if outlayersum > 0:
                    matching2 = matching[outlayer == 0]
                    transform_matrix = matching_correction(matching2)
                    offset = numpy.array([transform_matrix[5], transform_matrix[2]], dtype=numpy.float32)
                    matrix = numpy.empty((2, 2), dtype=numpy.float32)
                    matrix[0, 0], matrix[0, 1] = transform_matrix[4], transform_matrix[3]
                    matrix[1, 0], matrix[1, 1] = transform_matrix[1], transform_matrix[0]
            if relative:  # update stable part to perform a relative alignment
                self.ref_kp = kp
                if self.ROI is not None:
                    # keep only the keypoints located on a non-zero pixel of the ROI
                    kpx = numpy.round(self.ref_kp.x).astype(numpy.int32)
                    kpy = numpy.round(self.ref_kp.y).astype(numpy.int32)
                    masked = self.ROI[(kpy, kpx)].astype(bool)
                    logger.warning("Reducing keypoint list from %i to %i because of the ROI" % (self.ref_kp.size, masked.sum()))
                    self.ref_kp = self.ref_kp[masked]
                self.buffers["ref_kp_gpu"] = pyopencl.array.to_device(self.match.queue, self.ref_kp)
                # accumulate the transformations as 3x3 homogeneous matrices
                transfo = numpy.zeros((3, 3), dtype=numpy.float64)
                transfo[:2, :2] = matrix
                transfo[0, 2] = offset[0]
                transfo[1, 2] = offset[1]
                transfo[2, 2] = 1
                if self.relative_transfo is None:
                    self.relative_transfo = transfo
                else:
                    self.relative_transfo = numpy.dot(transfo, self.relative_transfo)
                matrix = numpy.ascontiguousarray(self.relative_transfo[:2, :2], dtype=numpy.float32)
                offset = numpy.ascontiguousarray(self.relative_transfo[:2, 2], dtype=numpy.float32)
            cpy1 = pyopencl.enqueue_copy(self.queue, self.buffers["matrix"].data, matrix)
            cpy2 = pyopencl.enqueue_copy(self.queue, self.buffers["offset"].data, offset)
            if self.profile:
                self.events += [("Copy matrix", cpy1), ("Copy offset", cpy2)]

            if self.RGB:
                shape = (4, self.shape[1], self.shape[0])
                transform = self.program.transform_RGB
            else:
                shape = self.shape[1], self.shape[0]
                transform = self.program.transform
            ev = transform(self.queue, calc_size(shape, self.wg), self.wg,
                           self.buffers["input"].data,
                           self.buffers["output"].data,
                           self.buffers["matrix"].data,
                           self.buffers["offset"].data,
                           numpy.int32(self.shape[1]),
                           numpy.int32(self.shape[0]),
                           numpy.int32(self.outshape[1]),
                           numpy.int32(self.outshape[0]),
                           self.sift.buffers["min"].get()[0],
                           numpy.int32(1))
            if self.profile:
                self.events += [("transform", ev)]
            result = self.buffers["output"].get()

        if return_all:
            # residual: transformed reference keypoints minus matched keypoints, in (y, x) order
            corr = numpy.dot(matrix, numpy.vstack((matching[:, 0].y, matching[:, 0].x))).T + offset.T - numpy.vstack((matching[:, 1].y, matching[:, 1].x)).T
            rms = numpy.sqrt((corr * corr).sum(axis=-1).mean())

            return {"result": result, "keypoint": kp, "matching": matching, "offset": offset, "matrix": matrix, "rms": rms}
        return result