Esempio n. 1
0
File: plan.py Progetto: kunyiC/silx
    def _init_gaussian(self, sigma):
        """Build the normalised gaussian weights for a given width on the device.

        The weights are computed by the "gaussian" OpenCL kernel whenever the
        work-group size available for that kernel is large enough; otherwise
        they are computed with numpy on the host and uploaded. In both cases
        the resulting device array is stored in ``self.cl_mem`` under the key
        ``"gaussian_<sigma>"``.

        :param sigma: width of the gaussian; the number of weights is given by
                      ``kernel_size(sigma, True)`` (documented as 8*sigma + 1)
        :return: the device array holding the float32 weights
        """
        pyopencl.enqueue_barrier(self.queue).wait()
        nb_weights = kernel_size(sigma, True)
        workgroup = nextpower(nb_weights)

        logger.info("Allocating %s float for blur sigma: %s. wg=%s max_wg=%s",
                    nb_weights, sigma, workgroup, self.block_size)
        available_wg = self.kernels_wg["gaussian"]
        if available_wg < workgroup:
            # The kernel cannot run with a work-group that large
            # (common bug on OSX when running on CPU): do the maths on the host.
            logger.info(
                "Workgroup size error: gaussian wg: %s < max_work_group_size: %s",
                available_wg, self.block_size)
            centered = numpy.arange(nb_weights) - (nb_weights - 1.0) / 2.0
            weights = numpy.exp(-(centered / sigma) ** 2 / 2.0).astype(numpy.float32)
            weights /= weights.sum(dtype=numpy.float32)
            weights_gpu = pyopencl.array.to_device(self.queue, weights)
        else:
            weights_gpu = pyopencl.array.empty(self.queue, nb_weights,
                                               dtype=numpy.float32)
            pyopencl.enqueue_barrier(self.queue).wait()
            gaussian_kernel = self.kernels.get_kernel("gaussian")
            # two scratch buffers of one 4-byte value per work-item
            scratch_bytes = 4 * workgroup
            scratch_a = pyopencl.LocalMemory(scratch_bytes)
            scratch_b = pyopencl.LocalMemory(scratch_bytes)
            launch_size = (workgroup, )
            evt = gaussian_kernel(self.queue, launch_size, launch_size,
                                  weights_gpu.data,
                                  numpy.float32(sigma),  # const float sigma
                                  numpy.int32(nb_weights),  # const int SIZE
                                  scratch_a,
                                  scratch_b)
            pyopencl.enqueue_barrier(self.queue).wait()
            if self.profile:
                self.events.append(("gaussian %s" % sigma, evt))

        self.cl_mem["gaussian_%s" % sigma] = weights_gpu
        return weights_gpu
    def test_orientation(self):
        '''
        Compare the keypoint orientation assignment kernel with the Python
        reference implementation (my_orientation).
        '''
        if self.abort:
            return
        # orientation_setup provides the input keypoints, gradient and orientation maps
        keypoints, nb_keypoints, _, grad, ori, octsize = orientation_setup()
        keypoints, compact_cnt = my_compact(numpy.copy(keypoints), nb_keypoints)
        logger.info("Number of keypoints before orientation assignment : %s",
                    compact_cnt)

        # Prepare kernel call
        workgroup = self.wg_orient
        kernel = self.program_orient.all_kernels()[0]
        max_wg = kernel_workgroup_size(self.program_orient, kernel)
        if max_wg < workgroup[0]:
            logger.warning(
                "test_orientation: Skipping test of WG=%s when maximum for this kernel is %s ",
                workgroup, max_wg)
            return
        # one work-group per keypoint slot
        global_size = (keypoints.shape[0] * workgroup[0],)
        gpu_keypoints = pyopencl.array.to_device(self.queue, keypoints)
        gpu_grad = pyopencl.array.to_device(self.queue, grad)
        gpu_ori = pyopencl.array.to_device(self.queue, ori)
        orisigma = numpy.float32(1.5)  # SIFT
        grad_height, grad_width = numpy.int32(grad.shape)
        keypoints_start = numpy.int32(0)
        keypoints_end = numpy.int32(compact_cnt)
        counter = pyopencl.array.to_device(self.queue, keypoints_end)
        kargs = [gpu_keypoints.data, gpu_grad.data, gpu_ori.data, counter.data,
                 octsize, orisigma, nb_keypoints, keypoints_start, keypoints_end,
                 grad_width, grad_height]
        if not self.USE_CPU:
            # the non-CPU kernel takes three extra local-memory arguments
            kargs.extend(pyopencl.LocalMemory(nbytes)
                         for nbytes in (36 * 4, 128 * 4, 128 * 4))

        # Call the kernel
        t0 = time.time()
        k1 = kernel(self.queue, global_size, workgroup, *kargs)
        res = gpu_keypoints.get()
        cnt = counter.get()
        t1 = time.time()

        # Reference Python implementation
        ref, ref_count = my_orientation(keypoints, nb_keypoints,
                                        keypoints_start, keypoints_end,
                                        grad, ori, octsize, orisigma)
        t2 = time.time()

        # only the keypoints present in both results are compared
        upbound = min(cnt, ref_count)
        deltas = keypoints_compare(ref[0:upbound], res[0:upbound])
        checks = list(zip(
            ("delta_cols=%s", "delta_rows=%s", "delta_sigma=%s", "delta_angle=%s"),
            deltas,
            (1e-4, 1e-4, 1e-4, 1e-1),  # orientation has a poor precision
        ))
        for template, delta, tolerance in checks:
            self.assertLess(delta, tolerance, template % (delta))
        for template, delta, _ in checks:
            logger.info(template % delta)

        if self.PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." %
                        (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
            logger.info("Orientation assignment took %.3fms" %
                        (1e-6 * (k1.profile.end - k1.profile.start)))
Esempio n. 3
0
    def _one_octave(self, octave):
        """
        Does all scales within an octave

        First loop: blurs ``scale_<i>`` into ``scale_<i+1>`` and stores their
        difference in ``cl_mem["DoGs"]`` with the "combine" kernel.
        Second loop (scales 1..par.Scales): runs the "local_maxmin" and
        "interp_keypoint" kernels, compacts the keypoints, computes the
        gradient/orientation maps, then runs the orientation and descriptor
        kernels when new keypoints were found.
        Finally, unless this is the last octave, the "shrink" kernel fills
        ``scale_0`` for the next octave.

        :param octave: number of the octave
        :return: 2-tuple (results, descriptors): a float32 array of shape
                 (last_start, 4) and a uint8 array of shape (last_start, 128),
                 copied from ``cl_mem["Kp_1"]`` and ``cl_mem["descriptors"]``
                 (left uninitialised/empty when last_start is 0)
        """
        prevSigma = self._init_sigma
        logger.info("Calculating octave %i" % octave)
        wgsize = (128,)  # (max(self.wgsize[octave]),) #TODO: optimize
        kpsize32 = numpy.int32(self.kpsize)
        self._reset_keypoints()
        octsize = numpy.int32(2 ** octave)
        # index of the first keypoint not yet processed by the per-scale kernels
        last_start = numpy.int32(0)
        for scale in range(par.Scales + 2):
            # incremental blur needed to go from prevSigma to prevSigma * sigmaRatio
            sigma = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
            logger.info("Octave %i scale %s blur with sigma %s" % (octave, scale, sigma))

            ########################################################################
            # Calculate gaussian blur and DoG
            ########################################################################

            self._gaussian_convolution(self.cl_mem["scale_%i" % scale], self.cl_mem["scale_%i" % (scale + 1)], sigma, octave)
            prevSigma *= self.sigmaRatio
            # "combine" is called with coefficients -1 and +1: scale_<i> - scale_<i+1>
            evt = self.kernels.get_kernel("combine")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                     self.cl_mem["scale_%i" % (scale + 1)].data, numpy.float32(-1.0),
                                                     self.cl_mem["scale_%i" % (scale)].data, numpy.float32(+1.0),
                                                     self.cl_mem["DoGs"].data, numpy.int32(scale),
                                                     *self.scales[octave])
            if self.profile:
                self.events.append(("DoG %s %s" % (octave, scale), evt))
        for scale in range(1, par.Scales + 1):
            evt = self.kernels.get_kernel("local_maxmin")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                          self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                          self.cl_mem["Kp_1"].data,  # __global keypoint* output,
                                                          numpy.int32(par.BorderDist),  # int border_dist,
                                                          numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                          octsize,  # int octsize,
                                                          numpy.float32(par.EdgeThresh1),  # float EdgeThresh0,
                                                          numpy.float32(par.EdgeThresh),  # float EdgeThresh,
                                                          self.cl_mem["cnt"].data,  # __global int* counter,
                                                          kpsize32,  # int nb_keypoints,
                                                          numpy.int32(scale),  # int scale,
                                                          *self.scales[octave])  # int width, int height)
            if self.profile:
                self.events.append(("local_maxmin %s %s" % (octave, scale), evt))
            procsize = calc_size((self.kpsize,), wgsize)
            # NOTE(review): self.cnt[0] is read just below, which assumes this
            # device-to-host copy has completed when enqueue_copy returns - confirm
            cp_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory
            evt = self.kernels.get_kernel("interp_keypoint")(self.queue, procsize, wgsize,
                                                             self.cl_mem["DoGs"].data,  # __global float* DOGS,
                                                             self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                                             last_start,  # int start_keypoint,
                                                             self.cnt[0],  # int end_keypoint,
                                                             numpy.float32(par.PeakThresh),  # float peak_thresh,
                                                             numpy.float32(self._init_sigma),  # float InitSigma,
                                                             *self.scales[octave])  # int width, int height)
            if self.profile:
                self.events += [("get cnt", cp_evt),
                                ("interp_keypoint %s %s" % (octave, scale), evt)
                                ]

            newcnt = self._compact(last_start)
            # gradient magnitude goes to cl_mem["tmp"], orientation to cl_mem["ori"]
            evt = self.kernels.get_kernel("compute_gradient_orientation")(self.queue, self.procsize[octave], self.wgsize[octave],
                                                                          self.cl_mem["scale_%s" % (scale)].data,  # __global float* igray,
                                                                          self.cl_mem["tmp"].data,  # __global float *grad,
                                                                          self.cl_mem["ori"].data,  # __global float *ori,
                                                                          *self.scales[octave])  # int width,int height
            if self.profile:
                self.events.append(("compute_gradient_orientation %s %s" % (octave, scale), evt))

            # Orientation assignment: 1D kernel, rather heavy kernel
            if newcnt and newcnt > last_start:  # launch kernel only if needed

                if self.USE_CPU:
                    orientation_name = "orientation_cpu"
                    scales = self.scales[octave]
                else:
                    orientation_name = "orientation_gpu"
                    # the GPU variant takes three extra local-memory arguments
                    scales = list(self.scales[octave]) + \
                             [pyopencl.LocalMemory(36 * 4),
                              pyopencl.LocalMemory(128 * 4),
                              pyopencl.LocalMemory(128 * 4)]
                orientation = self.kernels.get_kernel(orientation_name)
                wg = self.kernels_max_wg_size[orientation_name]
                wgsize2 = (wg,)
                # one work-group per keypoint
                procsize = (int(newcnt * wg),)
                evt = orientation(self.queue, procsize, wgsize2,
                                  self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                  self.cl_mem["tmp"].data,  # __global float* grad,
                                  self.cl_mem["ori"].data,  # __global float* ori,
                                  self.cl_mem["cnt"].data,  # __global int* counter,
                                  octsize,  # int octsize,
                                  numpy.float32(par.OriSigma),  # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)"
                                  kpsize32,  # int max of nb_keypoints,
                                  numpy.int32(last_start),  # int keypoints_start,
                                  newcnt,  # int keypoints_end,
                                  *scales)  # int grad_width, int grad_height)
                # newcnt = self.cl_mem["cnt"].get()[0] #do not forget to update numbers of keypoints, modified above !
                evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
                newcnt = self.cnt[0]  # do not forget to update numbers of keypoints, modified above !

                # Each failed attempt increments self.LOW_END, which selects the
                # next descriptor kernel: descriptor_gpu2 -> descriptor_gpu1 -> descriptor_cpu
                for _ in range(3):
                    # up to 3 attempts
                    if self.USE_CPU or (self.LOW_END > 1):
                        logger.info("Computing descriptors with CPU optimized kernels")
                        descriptor_name = "descriptor_cpu"
                        wg = self.kernels_max_wg_size[descriptor_name][0]
                        wgsize2 = (wg,)
                        procsize2 = (int(newcnt * wg),)
                    else:
                        if self.LOW_END:
                            logger.info("Computing descriptors with older-GPU optimized kernels")
                            descriptor_name = "descriptor_gpu1"
                            wgsize2 = self.kernels_max_wg_size[descriptor_name]
                            procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
#                             if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2):
#                                 # will fail anyway:
#                                 self.LOW_END += 1
#                                 continue
                        else:
                            logger.info("Computing descriptors with newer-GPU optimized kernels")
                            descriptor_name = "descriptor_gpu2"
                            wgsize2 = self.kernels_max_wg_size[descriptor_name]
                            procsize2 = (int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2])
#                             if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2):
#                                 # will fail anyway:
#                                 self.LOW_END += 1
#                                 continue
                    try:
                        descriptor = self.kernels.get_kernel(descriptor_name)
                        evt2 = descriptor(self.queue, procsize2, wgsize2,
                                          self.cl_mem["Kp_1"].data,  # __global keypoint* keypoints,
                                          self.cl_mem["descriptors"].data,  # ___global unsigned char *descriptors
                                          self.cl_mem["tmp"].data,  # __global float* grad,
                                          self.cl_mem["ori"].data,  # __global float* ori,
                                          octsize,  # int octsize,
                                          numpy.int32(last_start),  # int keypoints_start,
                                          self.cl_mem["cnt"].data,  # int* keypoints_end,
                                          *self.scales[octave])  # int grad_width, int grad_height)
                        # wait here so that a failing kernel raises inside the try block
                        evt2.wait()
                    except (pyopencl.RuntimeError, pyopencl._cl.LogicError) as error:
                        self.LOW_END += 1
                        logger.error("Descriptor failed with %s. Switching to lower_end mode" % error)
                        continue
                    else:
                        break
                # NOTE(review): if all three attempts fail, execution continues
                # silently; evt2 may then be unbound (or left over from a previous
                # scale) when it is recorded below - confirm intended behaviour
                if self.profile:
                    self.events += [("%s %s %s" % (orientation_name, octave, scale), evt),
                                    ("copy cnt D->H", evt_cp),
                                    ("%s %s %s" % (descriptor_name, octave, scale), evt2)]
            # the keypoints found so far become the starting offset for the next scale
            evt_cp = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data)
            last_start = self.cnt[0]
            if self.profile:
                self.events.append(("copy cnt D->H", evt_cp))

        ########################################################################
        # Rescale all images to populate all octaves
        ########################################################################
        if octave < self.octave_max - 1:
            # scale_<par.Scales> is shrunk by a factor 2 in both directions into scale_0
            evt = self.kernels.get_kernel("shrink")(self.queue, self.procsize[octave + 1], self.wgsize[octave + 1],
                                                    self.cl_mem["scale_%i" % (par.Scales)].data,
                                                    self.cl_mem["scale_0"].data,
                                                    numpy.int32(2), numpy.int32(2),
                                                    self.scales[octave][0], self.scales[octave][1],
                                                    *self.scales[octave + 1])
            if self.profile:
                self.events.append(("shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt))
        # host buffers sized for the last_start keypoints found in this octave
        results = numpy.empty((last_start, 4), dtype=numpy.float32)
        descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8)
        if last_start:
            evt = pyopencl.enqueue_copy(self.queue, results, self.cl_mem["Kp_1"].data)
            evt2 = pyopencl.enqueue_copy(self.queue, descriptors, self.cl_mem["descriptors"].data)
            if self.profile:
                self.events += [("copy D->H", evt),
                                ("copy D->H", evt2)]
        return results, descriptors
Esempio n. 4
0
    def keypoints(self, image, mask=None):
        """Calculates the keypoints of the image

        The image is uploaded and converted to float32 on the device,
        normalised, optionally blurred, then every octave is processed with
        ``_one_octave`` and the per-octave results are merged on the host.

        TODO: use a temporary list with events and use a single test at the end

        :param image: ndimage of 2D (or 3D if RGB); its first two dimensions
                      must equal ``self.shape``
        :param mask: currently unused. TODO: implement a mask for sieving out the keypoints
        :return: vector of keypoint (1D numpy record array with the fields
                 x, y, scale, angle and desc)
        """
        def record(label, event):
            # keep the event only when profiling is enabled
            if self.profile:
                self.events.append((label, event))

        # self.reset_timer()
        with self.sem:
            kp_per_octave = []
            desc_per_octave = []
            assert image.shape[:2] == self.shape
            assert image.dtype in (self.dtype, numpy.float32)
            on_device = isinstance(image, pyopencl.array.Array)
            # old versions of pyopencl do not check for data contiguity
            if not (on_device or image.flags["C_CONTIGUOUS"]):
                image = numpy.ascontiguousarray(image)
            t0 = time.time()

            # ------------------------------------------------------------
            # Upload the image and convert it to float32 in "scale_0"
            # ------------------------------------------------------------
            if image.dtype == numpy.float32:
                source = image.data if on_device else image
                record("copy H->D",
                       pyopencl.enqueue_copy(self.queue, self.cl_mem["scale_0"].data, source))
            elif self.dtype == numpy.float64:
                # A preprocessing kernel double_to_float exists, but is commented (RUNS ONLY ON GPU WITH FP64)
                # TODO: benchmark this kernel vs the current pure CPU format conversion with numpy.float32
                #       and uncomment it if it proves faster (dubious, because of data transfer bottleneck)
                as_float = image.astype(numpy.float32)
                record("copy H->D",
                       pyopencl.enqueue_copy(self.queue, self.cl_mem["scale_0"].data, as_float))
            elif (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB):
                source = image.data if on_device else image
                record("copy H->D",
                       pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, source))
                to_float = self.kernels.get_kernel("rgb_to_float")
                record("RGB -> float",
                       to_float(self.queue, self.procsize[0], self.wgsize[0],
                                self.cl_mem["raw"].data, self.cl_mem["scale_0"].data,
                                *self.scales[0]))
            elif self.dtype in self.converter:
                to_float = self.kernels.get_kernel(self.converter[self.dtype])
                record("copy H->D",
                       pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, image))
                record("convert -> float",
                       to_float(self.queue, self.procsize[0], self.wgsize[0],
                                self.cl_mem["raw"].data, self.cl_mem["scale_0"].data,
                                *self.scales[0]))
            else:
                raise RuntimeError("invalid input format error (%s)" % (str(self.dtype)))

            # ------------------------------------------------------------
            # Minimum / maximum of the image, then normalisation
            # ------------------------------------------------------------
            stage1_wg = self.kernels_wg["max_min_global_stage1"]
            stage2_wg = self.kernels_wg["max_min_global_stage2"]
            nb_pixels = numpy.int32(self.shape[0] * self.shape[1])
            if min(stage1_wg, stage2_wg) >= self.red_size:
                # two-stage parallel reduction
                reduce1 = self.kernels.get_kernel("max_min_global_stage1")
                reduce2 = self.kernels.get_kernel("max_min_global_stage2")
                # logger.debug("self.red_size: %s", self.red_size)
                shm = pyopencl.LocalMemory(self.red_size * 2 * 4)
                k1 = reduce1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                             self.cl_mem["scale_0"].data,
                             self.cl_mem["max_min"].data,
                             nb_pixels,
                             shm)
                k2 = reduce2(self.queue, (self.red_size,), (self.red_size,),
                             self.cl_mem["max_min"].data,
                             self.cl_mem["max"].data,
                             self.cl_mem["min"].data,
                             shm)
                record("max_min_stage1", k1)
                record("max_min_stage2", k2)
            else:
                # common bug on OSX when running on CPU: fall back on a single work-item kernel
                logger.info("Unable to use MinMax Reduction: stage1 wg: %s; stage2 wg: %s < max_work_group_size: %s, expected: %s",
                            stage1_wg, stage2_wg, self.block_size, self.red_size)
                serial = self.kernels.get_kernel("max_min_vec16")
                record("max_min_serial",
                       serial(self.queue, (1,), (1,),
                              self.cl_mem["scale_0"].data,
                              nb_pixels,
                              self.cl_mem["max"].data,
                              self.cl_mem["min"].data))
                # python implementation:
                # buffer_ = self.cl_mem["scale_0"].get()
                # self.cl_mem["max"].set(numpy.array([buffer_.max()], dtype=numpy.float32))
                # self.cl_mem["min"].set(numpy.array([buffer_.min()], dtype=numpy.float32))

            normalizes = self.kernels.get_kernel("normalizes")
            record("normalize",
                   normalizes(self.queue, self.procsize[0], self.wgsize[0],
                              self.cl_mem["scale_0"].data,
                              self.cl_mem["min"].data,
                              self.cl_mem["max"].data,
                              self.cl_mem["255"].data,
                              *self.scales[0]))

            # ------------------------------------------------------------
            # Initial blur, applied in place on "scale_0"
            # ------------------------------------------------------------
            cur_sigma = 1.0 if par.DoubleImSize else 0.5
            if self._init_sigma > cur_sigma:
                logger.debug("Bluring image to achieve std: %f", self._init_sigma)
                extra_sigma = math.sqrt(self._init_sigma ** 2 - cur_sigma ** 2)
                base_scale = self.cl_mem["scale_0"]
                self._gaussian_convolution(base_scale, base_scale, extra_sigma, 0)

            # ------------------------------------------------------------
            # Process every octave
            # ------------------------------------------------------------
            for octave in range(self.octave_max):
                kp, descriptor = self._one_octave(octave)
                logger.info("in octave %i found %i kp" % (octave, kp.shape[0]))
                if not len(kp):
                    continue
                # sieve out coordinates with NaNs
                finite_rows = numpy.where(numpy.logical_not(numpy.isnan(kp.sum(axis=-1))))
                kp_per_octave.append(kp[finite_rows])
                desc_per_octave.append(descriptor[finite_rows])

            ########################################################################
            # Merge keypoints in central memory
            ########################################################################
            total_size = sum(block.shape[0] for block in kp_per_octave)
            output = numpy.recarray(shape=(total_size,), dtype=self.dtype_kp)
            begin = 0
            for block, desc in zip(kp_per_octave, desc_per_octave):
                count = block.shape[0]
                if count <= 0:
                    continue
                end = begin + count
                # view on the records of this octave: writes go to ``output``
                chunk = output[begin:end]
                chunk.x = block[:, 0]
                chunk.y = block[:, 1]
                chunk.scale = block[:, 2]
                chunk.angle = block[:, 3]
                chunk.desc = desc
                begin = end
            logger.info("Execution time: %.3fms" % (1000 * (time.time() - t0)))
        return output