예제 #1
0
    def _calc_workgroups(self):
        """Choose the workgroup sizes, then compute the global work size of each octave.

        Notes:
        The workgroup size is capped by the device; some devices report a wrong size.
        It is further capped by ``nextpower`` of the image width, so it changes
        from one octave to the next.
        Each octave gets a workgroup of the form (n, 1) and a processing size
        computed by ``calc_size`` (expected to be a multiple of the workgroup size).
        """
        # Cap the requested size (if any) with the first work-item dimension.
        item_limit = self.ctx.devices[0].max_work_item_sizes[0]
        if self.max_workgroup_size:
            self.max_workgroup_size = min(item_limit, self.max_workgroup_size)
        else:
            self.max_workgroup_size = item_limit

        # MacOSX driver on CPU usually reports bad workgroup size: this is addressed in ocl
        platform = ocl.platforms[self.device[0]]
        reported = platform.devices[self.device[1]].max_work_group_size
        self.max_workgroup_size = min(self.max_workgroup_size, reported)

        # Per-instance copy of the class-level kernel table, clipped to the device.
        self.kernels = {}
        for name, requested in self.__class__.kernels.items():
            if isinstance(requested, int):
                self.kernels[name] = min(requested, self.max_workgroup_size)
                continue
            # Otherwise probably a list: keep it only if its total size fits.
            total = 1
            for dim in requested:
                total *= dim
            if total <= self.max_workgroup_size:
                self.kernels[name] = requested
            # A kernel that does not fit is left out of the dict on purpose:
            # a kernel missing from the dict should not be used.

        pixel_count = self.shape[0] * self.shape[1]
        self.red_size = nextpower(min(self.max_workgroup_size,
                                      numpy.sqrt(pixel_count)))

        # Recompute the shapes: halve the image until it reaches the minimum size.
        smallest = 2 * par.BorderDist + 2
        current = self.shape
        while min(current) > smallest:
            octave_wg = (min(nextpower(current[-1]), self.max_workgroup_size), 1)
            self.wgsize.append(octave_wg)
            self.procsize.append(calc_size(current[::-1], octave_wg))
            current = tuple(extent // 2 for extent in current)
예제 #2
0
    def _calc_workgroups(self):
        """Guess the best workgroup size first, then work out every global work size.

        Notes:
        The device limits the workgroup size, and some devices report a wrong value.
        The per-octave workgroup is additionally limited by ``nextpower`` of the
        image width, so it shrinks as the octaves get smaller.
        Workgroups are stored as (n, 1) tuples; the matching processing size comes
        from ``calc_size`` (expected to be a multiple of the workgroup size).
        """
        first_dim_limit = self.ctx.devices[0].max_work_item_sizes[0]
        # A falsy user value means "no preference": fall back to the device limit.
        self.max_workgroup_size = (min(first_dim_limit, self.max_workgroup_size)
                                   if self.max_workgroup_size
                                   else first_dim_limit)
        # MacOSX driver on CPU usually reports bad workgroup size: this is addressed in ocl
        ocl_device = ocl.platforms[self.device[0]].devices[self.device[1]]
        self.max_workgroup_size = min(self.max_workgroup_size,
                                      ocl_device.max_work_group_size)

        self.kernels = {}
        for kernel_name, wg in self.__class__.kernels.items():
            if not isinstance(wg, int):
                # Probably a list: the product of its dimensions must fit.
                volume = 1
                for extent in wg:
                    volume *= extent
                if volume <= self.max_workgroup_size:
                    self.kernels[kernel_name] = wg
                # Otherwise the kernel cannot run; it is left out of the dict
                # and should therefore not be used.
            else:
                self.kernels[kernel_name] = min(wg, self.max_workgroup_size)

        side = numpy.sqrt(self.shape[0] * self.shape[1])
        self.red_size = nextpower(min(self.max_workgroup_size, side))

        # One (workgroup, processing size) pair per octave, halving each time.
        octave_shape = self.shape
        lower_bound = 2 * par.BorderDist + 2
        while min(octave_shape) > lower_bound:
            width_wg = min(nextpower(octave_shape[-1]), self.max_workgroup_size)
            workgroup = (width_wg, 1)
            self.wgsize.append(workgroup)
            self.procsize.append(calc_size(octave_shape[::-1], workgroup))
            octave_shape = tuple(n // 2 for n in octave_shape)
예제 #3
0
    def _calc_memory(self):
        """
        Estimate the memory footprint of all buffers.

        The estimate, in bytes, is accumulated in ``self.memory``; this method
        only computes it and performs no check against the device capacity.
        As side effects ``self.kpsize`` and ``self.red_size`` are recomputed
        from the image shape.
        """
        # Just the context + kernel takes about 75MB on the GPU
        self.memory = 75 * 2 ** 20
        size_of_float = numpy.dtype(numpy.float32).itemsize
        size_of_input = numpy.dtype(self.dtype).itemsize
        # raw images:
        nb_pixels = self.shape[0] * self.shape[1]
        self.memory += nb_pixels * size_of_input  # initial_image (no raw_float)
        if self.RGB:
            # one of the three channels was already counted above
            self.memory += 2 * nb_pixels * size_of_input
        nr_blur = par.Scales + 3  # 3 blurs and 2 tmp
        nr_dogs = par.Scales + 2
        self.memory += nb_pixels * (nr_blur + nr_dogs) * size_of_float

        # Is the number of kp independent of the octave ?
        # Cast to a plain int: int64 causes problems with pyopencl.
        self.kpsize = int(self.shape[0] * self.shape[1] // self.PIX_PER_KP)
        # two arrays of float4 are needed to register the keypoints
        self.memory += self.kpsize * size_of_float * 4 * 2
        self.memory += self.kpsize * 128  # stores the descriptors: 128 unsigned chars
        self.memory += 4  # keypoint index Counter
        wg_float = min(self.max_workgroup_size,
                       numpy.sqrt(self.shape[0] * self.shape[1]))
        self.red_size = nextpower(wg_float)
        self.memory += 4 * 2 * self.red_size  # temporary storage for reduction

        ########################################################################
        # Calculate space for gaussian kernels
        ########################################################################
        curSigma = 1.0 if par.DoubleImSize else 0.5
        if self._init_sigma > curSigma:
            sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2)
            kernel_len = kernel_size(sigma, True)
            # Lazy %-arguments: the message is only built if DEBUG is enabled.
            logger.debug("pre-Allocating %s float for init blur", kernel_len)
            self.memory += kernel_len * size_of_float
        prevSigma = self._init_sigma
        # One blur kernel per scale, each sigma growing by sigmaRatio.
        for _ in range(par.Scales + 2):
            increase = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
            kernel_len = kernel_size(increase, True)
            logger.debug("pre-Allocating %s float for blur sigma: %s",
                         kernel_len, increase)
            self.memory += kernel_len * size_of_float
            prevSigma *= self.sigmaRatio
예제 #4
0
    def _init_gaussian(self, sigma):
        """Create a buffer of the right size according to the width of the gaussian ...

        The gaussian is computed on the device when the workgroup size available
        for the ``gaussian.gaussian`` kernel is large enough; otherwise it is
        computed with numpy on the host and uploaded.

        :param  sigma: width of the gaussian; the buffer length is
                       ``kernel_size(sigma, True)`` (nominally 8*sigma + 1)
        :return: the device array, also stored in ``self.buffers`` under the
                 key ``"gaussian_<sigma>"``

        Same calculation done on CPU
        x = numpy.arange(size) - (size - 1.0) / 2.0
        gaussian = numpy.exp(-(x / sigma) ** 2 / 2.0).astype(numpy.float32)
        gaussian /= gaussian.sum(dtype=numpy.float32)
        """
        name = "gaussian_%s" % sigma
        size = kernel_size(sigma, True)
        wg_size = nextpower(size)

        logger.info("Allocating %s float for blur sigma: %s. wg=%s max_wg=%s",
                    size, sigma, wg_size, self.max_workgroup_size)
        wg1 = self.kernels["gaussian.gaussian"]
        if wg1 >= wg_size:
            gaussian_gpu = pyopencl.array.empty(self.queue,
                                                size,
                                                dtype=numpy.float32)
            # Global and local sizes are both wg_size.
            evt = self.programs["gaussian"].gaussian(
                self.queue,
                (wg_size, ),
                (wg_size, ),
                gaussian_gpu.data,  # __global     float     *data,
                numpy.float32(sigma),  # const        float     sigma,
                numpy.int32(size))  # const        int     SIZE
            if self.profile:
                self.events.append(("gaussian %s" % sigma, evt))
        else:
            # This branch is taken when wg1 < wg_size, so report wg_size (the
            # size the kernel would need) rather than the device maximum.
            logger.info(
                "Workgroup size error: gaussian wg: %s < required workgroup size: %s",
                wg1, wg_size)
            # common bug on OSX when running on CPU
            x = numpy.arange(size) - (size - 1.0) / 2.0
            gaus = numpy.exp(-(x / sigma) ** 2 / 2.0).astype(numpy.float32)
            gaus /= gaus.sum(dtype=numpy.float32)
            gaussian_gpu = pyopencl.array.to_device(self.queue, gaus)

        self.buffers[name] = gaussian_gpu
        return gaussian_gpu
예제 #5
0
    def _calc_memory(self):
        """
        Estimate the memory footprint of all buffers.

        The estimate, in bytes, is accumulated in ``self.memory``; this method
        only computes it and performs no check against the device capacity.
        As side effects ``self.kpsize`` and ``self.red_size`` are recomputed
        from the image shape.
        """
        # Just the context + kernel takes about 75MB on the GPU
        self.memory = 75 * 2 ** 20
        size_of_float = numpy.dtype(numpy.float32).itemsize
        size_of_input = numpy.dtype(self.dtype).itemsize
        # raw images:
        nb_pixels = self.shape[0] * self.shape[1]
        self.memory += nb_pixels * size_of_input  # initial_image (no raw_float)
        if self.RGB:
            # one of the three channels was already counted above
            self.memory += 2 * nb_pixels * size_of_input
        nr_blur = par.Scales + 3  # 3 blurs and 2 tmp
        nr_dogs = par.Scales + 2
        self.memory += nb_pixels * (nr_blur + nr_dogs) * size_of_float

        # Is the number of kp independent of the octave ?
        # Cast to a plain int: int64 causes problems with pyopencl.
        self.kpsize = int(self.shape[0] * self.shape[1] // self.PIX_PER_KP)
        # two arrays of float4 are needed to register the keypoints
        self.memory += self.kpsize * size_of_float * 4 * 2
        self.memory += self.kpsize * 128  # stores the descriptors: 128 unsigned chars
        self.memory += 4  # keypoint index Counter
        wg_float = min(self.max_workgroup_size, numpy.sqrt(self.shape[0] * self.shape[1]))
        self.red_size = nextpower(wg_float)
        self.memory += 4 * 2 * self.red_size  # temporary storage for reduction

        ########################################################################
        # Calculate space for gaussian kernels
        ########################################################################
        curSigma = 1.0 if par.DoubleImSize else 0.5
        if self._init_sigma > curSigma:
            sigma = math.sqrt(self._init_sigma ** 2 - curSigma ** 2)
            kernel_len = kernel_size(sigma, True)
            # Lazy %-arguments: the message is only built if DEBUG is enabled.
            logger.debug("pre-Allocating %s float for init blur", kernel_len)
            self.memory += kernel_len * size_of_float
        prevSigma = self._init_sigma
        # One blur kernel per scale, each sigma growing by sigmaRatio.
        for _ in range(par.Scales + 2):
            increase = prevSigma * math.sqrt(self.sigmaRatio ** 2 - 1.0)
            kernel_len = kernel_size(increase, True)
            logger.debug("pre-Allocating %s float for blur sigma: %s", kernel_len, increase)
            self.memory += kernel_len * size_of_float
            prevSigma *= self.sigmaRatio
예제 #6
0
    def _init_gaussian(self, sigma):
        """Create a buffer of the right size according to the width of the gaussian ...

        The gaussian is computed on the device when the workgroup size available
        for the ``gaussian.gaussian`` kernel is large enough; otherwise it is
        computed with numpy on the host and uploaded.

        :param  sigma: width of the gaussian; the buffer length is
                       ``kernel_size(sigma, True)`` (nominally 8*sigma + 1)
        :return: the device array, also stored in ``self.buffers`` under the
                 key ``"gaussian_<sigma>"``

        Same calculation done on CPU
        x = numpy.arange(size) - (size - 1.0) / 2.0
        gaussian = numpy.exp(-(x / sigma) ** 2 / 2.0).astype(numpy.float32)
        gaussian /= gaussian.sum(dtype=numpy.float32)
        """
        name = "gaussian_%s" % sigma
        size = kernel_size(sigma, True)
        wg_size = nextpower(size)

        logger.info("Allocating %s float for blur sigma: %s. wg=%s max_wg=%s",
                    size, sigma, wg_size, self.max_workgroup_size)
        wg1 = self.kernels["gaussian.gaussian"]
        if wg1 >= wg_size:
            gaussian_gpu = pyopencl.array.empty(self.queue, size, dtype=numpy.float32)
            # Global and local sizes are both wg_size.
            evt = self.programs["gaussian"].gaussian(self.queue, (wg_size,), (wg_size,),
                                                     gaussian_gpu.data,  # __global     float     *data,
                                                     numpy.float32(sigma),  # const        float     sigma,
                                                     numpy.int32(size))  # const        int     SIZE
            if self.profile:
                self.events.append(("gaussian %s" % sigma, evt))
        else:
            # This branch is taken when wg1 < wg_size, so report wg_size (the
            # size the kernel would need) rather than the device maximum.
            logger.info("Workgroup size error: gaussian wg: %s < required workgroup size: %s",
                        wg1, wg_size)
            # common bug on OSX when running on CPU
            x = numpy.arange(size) - (size - 1.0) / 2.0
            gaus = numpy.exp(-(x / sigma) ** 2 / 2.0).astype(numpy.float32)
            gaus /= gaus.sum(dtype=numpy.float32)
            gaussian_gpu = pyopencl.array.to_device(self.queue, gaus)

        self.buffers[name] = gaussian_gpu
        return gaussian_gpu