Example #1
0
    def test_combine(self):
        """
        Tests the combine (linear combination) kernel.

        Two random matrices and two random scalar coefficients (all cast to
        float32) are fed to the OpenCL ``combine`` kernel. The result read
        back from the device is compared with the ``my_combine`` reference
        and must agree to within 1e-4 (maximum absolute difference).
        """
        width = numpy.int32(157)
        height = numpy.int32(147)
        coeff1 = numpy.random.rand(1)[0].astype(numpy.float32)
        coeff2 = numpy.random.rand(1)[0].astype(numpy.float32)
        mat1 = numpy.random.rand(height, width).astype(numpy.float32)
        mat2 = numpy.random.rand(height, width).astype(numpy.float32)

        gpu_mat1 = pyopencl.array.to_device(queue, mat1)
        gpu_mat2 = pyopencl.array.to_device(queue, mat2)
        gpu_out = pyopencl.array.empty(queue, mat1.shape,
                                       dtype=numpy.float32, order="C")
        shape = calc_size((width, height), self.wg)

        t0 = time.time()
        k1 = self.program.combine(queue, shape, self.wg,
                                  gpu_mat1.data, coeff1,
                                  gpu_mat2.data, coeff2,
                                  gpu_out.data, numpy.int32(0),
                                  width, height)
        res = gpu_out.get()
        t1 = time.time()
        ref = my_combine(mat1, coeff1, mat2, coeff2)
        t2 = time.time()

        delta = abs(ref - res).max()
        logger.info("delta=%s", delta)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta < 1e-4, "delta=%s" % (delta))
        if PROFILE:
            # (t1 - t0) spans the kernel call plus get(); (t2 - t1) is the
            # reference implementation.
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Linear combination took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #2
0
    def test_combine(self):
        """
        Tests the combine (linear combination) kernel.

        Random float32 inputs (two matrices, two scalar coefficients) are
        processed by the OpenCL ``combine`` kernel and by the ``my_combine``
        reference; the maximum absolute difference must stay below 1e-4.
        """
        width = numpy.int32(157)
        height = numpy.int32(147)
        coeff1 = numpy.random.rand(1)[0].astype(numpy.float32)
        coeff2 = numpy.random.rand(1)[0].astype(numpy.float32)
        mat1 = numpy.random.rand(height, width).astype(numpy.float32)
        mat2 = numpy.random.rand(height, width).astype(numpy.float32)

        gpu_mat1 = pyopencl.array.to_device(queue, mat1)
        gpu_mat2 = pyopencl.array.to_device(queue, mat2)
        gpu_out = pyopencl.array.empty(queue, mat1.shape, dtype=numpy.float32, order="C")
        shape = calc_size((width, height), self.wg)

        t0 = time.time()
        k1 = self.program.combine(queue, shape, self.wg,
                                  gpu_mat1.data, coeff1, gpu_mat2.data, coeff2,
                                  gpu_out.data, numpy.int32(0),
                                  width, height)
        res = gpu_out.get()
        t1 = time.time()
        ref = my_combine(mat1, coeff1, mat2, coeff2)
        t2 = time.time()

        delta = abs(ref - res).max()
        logger.info("delta=%s", delta)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta < 1e-4, "delta=%s" % (delta))
        if PROFILE:
            # GPU figure covers the kernel call plus get(); CPU figure covers
            # the reference implementation only.
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Linear combination took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #3
0
    def setUp(self):
        """
        Prepare the test image, the device buffers and the OpenCL program.

        Sets ``self.input`` (float32, cropped to 507 x 209 and made
        contiguous), the device arrays ``self.gpu_in``, ``self.gpu_tmp`` and
        ``self.gpu_out``, the program built from ``convolution.cl`` located
        next to the ``sift`` module, the image dimensions ``self.IMAGE_W`` /
        ``self.IMAGE_H``, the work-group size ``self.wg`` and the matching
        ``self.shape`` computed by ``calc_size``.
        """
        # NOTE(review): scipy.misc.lena is not available in recent SciPy
        # releases -- confirm the SciPy version this suite targets.
        self.input = scipy.misc.lena().astype(numpy.float32)
        self.input = numpy.ascontiguousarray(self.input[0:507, 0:209])

        self.gpu_in = pyopencl.array.to_device(queue, self.input)
        self.gpu_tmp = pyopencl.array.empty(queue, self.input.shape,
                                            dtype=numpy.float32, order="C")
        self.gpu_out = pyopencl.array.empty(queue, self.input.shape,
                                            dtype=numpy.float32, order="C")

        kernel_path = os.path.join(
            os.path.dirname(os.path.abspath(sift.__file__)), "convolution.cl")
        # Read through a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(kernel_path) as kernel_file:
            kernel_src = kernel_file.read()
        self.program = pyopencl.Program(ctx, kernel_src).build()

        self.IMAGE_W = numpy.int32(self.input.shape[-1])
        self.IMAGE_H = numpy.int32(self.input.shape[0])
        self.wg = (256, 2)
        # calc_size receives the dimensions as (width, height).
        self.shape = calc_size((self.input.shape[1], self.input.shape[0]),
                               self.wg)
Example #4
0
    def test_local_maxmin(self):
        """
        Tests the local maximum/minimum detection kernel.

        Thresholds, image size and the DoG stack come from
        ``local_maxmin_setup()``. Columns 0, 1 and 2 of the kernel output are
        compared with the ``my_local_maxmin`` reference and must each agree
        to within 1e-4.

        Side effects: sets ``self.s``, ``self.gpu_dogs``, ``self.output``,
        ``self.counter``, ``self.shape``, ``self.keypoints1`` and
        ``self.actual_nb_keypoints``.
        """
        (border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, s,
         nb_keypoints, width, height, DOGS, g) = local_maxmin_setup()
        self.s = numpy.int32(s)  # 1, 2, 3 ... not 4 nor 0.
        self.gpu_dogs = pyopencl.array.to_device(queue, DOGS)
        self.output = pyopencl.array.empty(queue, (nb_keypoints, 4),
                                           dtype=numpy.float32, order="C")
        self.output.fill(-1.0, queue)  # memset for invalid keypoints
        self.counter = pyopencl.array.zeros(queue, (1,), dtype=numpy.int32, order="C")
        nb_keypoints = numpy.int32(nb_keypoints)
        # DOGS is 3D: its first and last axes are folded into one dimension.
        self.shape = calc_size((DOGS.shape[1], DOGS.shape[0] * DOGS.shape[2]), self.wg)

        t0 = time.time()
        k1 = self.program.local_maxmin(queue, self.shape, self.wg,
                                       self.gpu_dogs.data, self.output.data,
                                       border_dist, peakthresh, octsize,
                                       EdgeThresh0, EdgeThresh,
                                       self.counter.data, nb_keypoints, self.s,
                                       width, height)

        res = self.output.get()
        self.keypoints1 = self.output  # for further use
        self.actual_nb_keypoints = self.counter.get()[0]  # for further use

        t1 = time.time()
        ref, actual_nb_keypoints2 = my_local_maxmin(DOGS, peakthresh, border_dist, octsize,
                                                    EdgeThresh0, EdgeThresh, nb_keypoints,
                                                    self.s, width, height)
        t2 = time.time()

        # The order of the peaks is unknown on the GPU, so every column is
        # sorted independently before being compared.
        res_peaks = res[(res[:, 0].argsort(axis=0)), 0]
        ref_peaks = ref[(ref[:, 0].argsort(axis=0)), 0]
        res_r = res[(res[:, 1].argsort(axis=0)), 1]
        ref_r = ref[(ref[:, 1].argsort(axis=0)), 1]
        res_c = res[(res[:, 2].argsort(axis=0)), 2]
        ref_c = ref[(ref[:, 2].argsort(axis=0)), 2]
        delta_peaks = abs(ref_peaks - res_peaks).max()
        delta_r = abs(ref_r - res_r).max()
        delta_c = abs(ref_c - res_c).max()

        if PRINT_KEYPOINTS:
            print("keypoints after 2 steps of refinement: (s= %s, octsize=%s) %s"
                  % (self.s, octsize, self.actual_nb_keypoints))
            # Single-argument call form prints the same under Python 2 and 3.
            print(res[0:self.actual_nb_keypoints])

        # Log before asserting so the deltas are recorded even on failure.
        logger.info("delta_peaks=%s", delta_peaks)
        logger.info("delta_r=%s", delta_r)
        logger.info("delta_c=%s", delta_c)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta_peaks < 1e-4, "delta_peaks=%s" % (delta_peaks))
        self.assertTrue(delta_r < 1e-4, "delta_r=%s" % (delta_r))
        self.assertTrue(delta_c < 1e-4, "delta_c=%s" % (delta_c))

        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Local extrema search took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #5
0
    def test_compact(self):
        """
        Tests the "compact" kernel.

        Builds 10000 random keypoints, marks roughly a quarter of them as
        invalid with the row ``(-1, -1, i, -1)`` and stores the row index in
        column 2 of every row. The kernel output and its counter are compared
        with the ``my_compact`` reference: rows (sorted on column 2) must
        agree to within 1e-5 and both counters must be equal.
        """
        nbkeypoints = 10000  # constant value
        keypoints = numpy.random.rand(nbkeypoints, 4).astype(numpy.float32)
        nb_ones = 0  # number of rows flagged as invalid
        for i in range(0, nbkeypoints):
            if numpy.random.rand(1)[0] < 0.25:
                keypoints[i] = (-1, -1, i, -1)
                nb_ones += 1
            else:
                keypoints[i, 2] = i

        gpu_keypoints = pyopencl.array.to_device(queue, keypoints)
        output = pyopencl.array.empty(queue, (nbkeypoints, 4),
                                      dtype=numpy.float32, order="C")
        output.fill(-1.0, queue)
        counter = pyopencl.array.zeros(queue, (1,), dtype=numpy.int32, order="C")
        wg = (max(self.wg),)  # one-dimensional work-group
        shape = calc_size((keypoints.shape[0],), wg)
        nbkeypoints = numpy.int32(nbkeypoints)
        startkeypoints = numpy.int32(0)

        t0 = time.time()
        k1 = self.program.compact(queue, shape, wg, gpu_keypoints.data,
                                  output.data, counter.data, startkeypoints,
                                  nbkeypoints)
        res = output.get()
        if PRINT_KEYPOINTS:
            print(res)
        count = counter.get()[0]
        t1 = time.time()
        ref, count_ref = my_compact(keypoints, nbkeypoints)
        t2 = time.time()

        print("Kernel counter : %s / Python counter : %s / True value : %s" %
              (count, count_ref, nbkeypoints - nb_ones))

        # Column 2 carries the original row index: sorting on it aligns the
        # rows of both results.
        res_sort = res[res[:, 2].argsort(axis=0)]
        ref_sort = ref[ref[:, 2].argsort(axis=0)]
        if PRINT_KEYPOINTS:
            print("Delta matrix :")
            # The whole expression is the argument of print: calling .sum()
            # on the value returned by print() would fail under Python 3.
            print((abs(res_sort - ref_sort) > 1e-5).sum())
        delta = abs(res_sort - ref_sort).max()
        logger.info("delta=%s", delta)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta < 1e-5, "delta=%s" % (delta))
        # The message is shown on failure, so it has to describe a mismatch.
        self.assertEqual(count, count_ref,
                         "counters differ: kernel=%s, reference=%s" % (count, count_ref))
        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Compact operation took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #6
0
    def test_interpolation(self):
        """
        Tests the keypoints interpolation kernel.

        All inputs (thresholds, image size, DoG stack, scale index and the
        keypoints to refine) are obtained from ``interpolation_setup()``.
        The keypoints refined in place on the device are compared with the
        ones recomputed row by row with ``my_interp_keypoint``; rows whose
        column 1 equals -1 are ignored and the remaining ones must agree to
        within 1e-4.
        """
        (border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize,
         nb_keypoints, actual_nb_keypoints, width, height, DOGS, s,
         keypoints_prev, blur) = interpolation_setup()

        # actual_nb_keypoints is the number of keypoints returned by
        # "local_maxmin". After the interpolation it will be reduced, but it
        # can still be used as a boundary.
        shape = calc_size(keypoints_prev.shape, self.wg)
        gpu_dogs = pyopencl.array.to_device(queue, DOGS)
        gpu_keypoints1 = pyopencl.array.to_device(queue, keypoints_prev)
        start_keypoints = numpy.int32(0)
        actual_nb_keypoints = numpy.int32(actual_nb_keypoints)
        # warning: it must be the same in my_keypoints_interpolation
        InitSigma = numpy.float32(1.6)

        t0 = time.time()
        k1 = self.program.interp_keypoint(queue, shape, self.wg, gpu_dogs.data,
                                          gpu_keypoints1.data, start_keypoints,
                                          actual_nb_keypoints, peakthresh,
                                          InitSigma, width, height)
        res = gpu_keypoints1.get()

        t1 = time.time()
        # Work on a copy: the rows are overwritten while iterating.
        ref = numpy.copy(keypoints_prev)
        for i, k in enumerate(ref[:nb_keypoints, :]):
            ref[i] = my_interp_keypoint(DOGS, s, k[1], k[2], 5, peakthresh,
                                        width, height)
        t2 = time.time()

        # Only keypoints different from (-1, -1, -1, -1) are compared.
        res2 = res[res[:, 1] != -1]
        ref2 = ref[ref[:, 1] != -1]

        if PRINT_KEYPOINTS:
            print("[s=%s]Keypoints before interpolation: %s" %
                  (s, actual_nb_keypoints))
            print("[s=%s]Keypoints after interpolation : %s" %
                  (s, res2.shape[0]))
            # Single-argument call form prints the same under Python 2 and 3.
            print(res[0:actual_nb_keypoints])

        # Fail with a clear message, rather than a broadcasting error or a
        # silently broadcast comparison, when the number of kept keypoints differs.
        self.assertEqual(ref2.shape, res2.shape,
                         "valid keypoints differ: ref %s, res %s" % (ref2.shape, res2.shape))
        delta = abs(ref2 - res2).max()
        logger.info("delta=%s", delta)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta < 1e-4, "delta=%s" % (delta))

        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Keypoints interpolation took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #7
0
    def test_gradient(self):
        """
        Tests the gradient kernel (norm and orientation).

        The input matrix is ``g[1]`` as returned by ``local_maxmin_setup()``.
        Norm and orientation computed by the ``compute_gradient_orientation``
        kernel are compared with the ``my_gradient`` reference and must both
        agree to within 1e-4.

        Side effects: sets ``self.mat``, ``self.height``, ``self.width``,
        ``self.gpu_mat``, ``self.gpu_grad``, ``self.gpu_ori`` and
        ``self.shape``.
        """
        (border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, scale,
         nb_keypoints, width, height, DOGS, g) = local_maxmin_setup()
        self.mat = numpy.ascontiguousarray(g[1])
        self.height, self.width = numpy.int32(self.mat.shape)
        self.gpu_mat = pyopencl.array.to_device(queue, self.mat)
        self.gpu_grad = pyopencl.array.empty(queue, self.mat.shape,
                                             dtype=numpy.float32, order="C")
        self.gpu_ori = pyopencl.array.empty(queue, self.mat.shape,
                                            dtype=numpy.float32, order="C")
        self.shape = calc_size((self.width, self.height), self.wg)

        t0 = time.time()
        k1 = self.program.compute_gradient_orientation(
            queue, self.shape, self.wg, self.gpu_mat.data, self.gpu_grad.data,
            self.gpu_ori.data, self.width, self.height)
        res_norm = self.gpu_grad.get()
        res_ori = self.gpu_ori.get()
        t1 = time.time()
        ref_norm, ref_ori = my_gradient(self.mat)
        t2 = time.time()
        delta_norm = abs(ref_norm - res_norm).max()
        delta_ori = abs(ref_ori - res_ori).max()

        if PRINT_KEYPOINTS:
            rmin, cmin = 0, 0
            rmax, cmax = rmin + 6, cmin + 6

            # Single-argument call form prints the same under Python 2 and 3.
            print(res_norm[-rmax, cmin:cmax])
            print("")
            print(ref_norm[-rmax, cmin:cmax])
            fig = pylab.figure()
            sp1 = fig.add_subplot(121)
            sp1.imshow(res_norm, interpolation="nearest")
            sp2 = fig.add_subplot(122)
            sp2.imshow(ref_norm, interpolation="nearest")
            fig.show()
            # raw_input only exists in Python 2; fall back to input otherwise.
            try:
                wait_for_key = raw_input
            except NameError:
                wait_for_key = input
            wait_for_key("enter")

        logger.info("delta_norm=%s", delta_norm)
        logger.info("delta_ori=%s", delta_ori)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta_norm < 1e-4, "delta_norm=%s" % (delta_norm))
        self.assertTrue(delta_ori < 1e-4, "delta_ori=%s" % (delta_ori))

        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Gradient computation took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #8
0
    def test_interpolation(self):
        """
        Tests the keypoints interpolation kernel.

        Every input is taken from ``interpolation_setup()``. The device
        refines ``keypoints_prev`` in place; the reference refines a host
        copy row by row with ``my_interp_keypoint``. Rows whose column 1 is
        -1 are dropped on both sides and the remaining rows must agree to
        within 1e-4.
        """
        (border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize,
         nb_keypoints, actual_nb_keypoints, width, height, DOGS, s,
         keypoints_prev, blur) = interpolation_setup()

        # actual_nb_keypoints is the number of keypoints returned by
        # "local_maxmin". After the interpolation it will be reduced, but it
        # can still be used as a boundary.
        shape = calc_size(keypoints_prev.shape, self.wg)
        gpu_dogs = pyopencl.array.to_device(queue, DOGS)
        gpu_keypoints1 = pyopencl.array.to_device(queue, keypoints_prev)
        start_keypoints = numpy.int32(0)
        actual_nb_keypoints = numpy.int32(actual_nb_keypoints)
        # warning: it must be the same in my_keypoints_interpolation
        InitSigma = numpy.float32(1.6)

        t0 = time.time()
        k1 = self.program.interp_keypoint(queue, shape, self.wg,
                                          gpu_dogs.data, gpu_keypoints1.data,
                                          start_keypoints, actual_nb_keypoints,
                                          peakthresh, InitSigma, width, height)
        res = gpu_keypoints1.get()

        t1 = time.time()
        # The copy matters: rows of ref are overwritten during the loop.
        ref = numpy.copy(keypoints_prev)
        for i, k in enumerate(ref[:nb_keypoints, :]):
            ref[i] = my_interp_keypoint(DOGS, s, k[1], k[2], 5, peakthresh, width, height)
        t2 = time.time()

        # Only keypoints different from (-1, -1, -1, -1) are compared.
        res2 = res[res[:, 1] != -1]
        ref2 = ref[ref[:, 1] != -1]

        if PRINT_KEYPOINTS:
            print("[s=%s]Keypoints before interpolation: %s" % (s, actual_nb_keypoints))
            print("[s=%s]Keypoints after interpolation : %s" % (s, res2.shape[0]))
            # Single-argument call form prints the same under Python 2 and 3.
            print(res[0:actual_nb_keypoints])

        # A differing number of surviving keypoints must be reported as a
        # test failure, not as a broadcasting error or a broadcast comparison.
        self.assertEqual(ref2.shape, res2.shape,
                         "valid keypoints differ: ref %s, res %s" % (ref2.shape, res2.shape))
        delta = abs(ref2 - res2).max()
        logger.info("delta=%s", delta)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta < 1e-4, "delta=%s" % (delta))

        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Keypoints interpolation took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #9
0
    def test_compact(self):
        """
        Tests the "compact" kernel.

        10000 random keypoints are generated; about one in four is replaced
        by the invalid row ``(-1, -1, i, -1)`` and column 2 of every row holds
        its index. The kernel result is checked against ``my_compact``: rows
        sorted on column 2 must agree to within 1e-5 and the two counters
        must be equal.
        """
        nbkeypoints = 10000  # constant value
        keypoints = numpy.random.rand(nbkeypoints, 4).astype(numpy.float32)
        nb_ones = 0  # number of rows flagged as invalid
        for i in range(0, nbkeypoints):
            if numpy.random.rand(1)[0] < 0.25:
                keypoints[i] = (-1, -1, i, -1)
                nb_ones += 1
            else:
                keypoints[i, 2] = i

        gpu_keypoints = pyopencl.array.to_device(queue, keypoints)
        output = pyopencl.array.empty(queue, (nbkeypoints, 4), dtype=numpy.float32, order="C")
        output.fill(-1.0, queue)
        counter = pyopencl.array.zeros(queue, (1,), dtype=numpy.int32, order="C")
        wg = (max(self.wg),)  # one-dimensional work-group
        shape = calc_size((keypoints.shape[0],), wg)
        nbkeypoints = numpy.int32(nbkeypoints)
        startkeypoints = numpy.int32(0)

        t0 = time.time()
        k1 = self.program.compact(queue, shape, wg,
                                  gpu_keypoints.data, output.data, counter.data,
                                  startkeypoints, nbkeypoints)
        res = output.get()
        if PRINT_KEYPOINTS:
            print(res)
        count = counter.get()[0]
        t1 = time.time()
        ref, count_ref = my_compact(keypoints, nbkeypoints)
        t2 = time.time()

        print("Kernel counter : %s / Python counter : %s / True value : %s"
              % (count, count_ref, nbkeypoints - nb_ones))

        # Sorting on column 2 (the original row index) aligns both results.
        res_sort = res[res[:, 2].argsort(axis=0)]
        ref_sort = ref[ref[:, 2].argsort(axis=0)]
        if PRINT_KEYPOINTS:
            print("Delta matrix :")
            # Wrap the whole expression: print(...).sum() would call .sum()
            # on None under Python 3.
            print((abs(res_sort - ref_sort) > 1e-5).sum())
        delta = abs(res_sort - ref_sort).max()
        logger.info("delta=%s", delta)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta < 1e-5, "delta=%s" % (delta))
        # The message is displayed on failure, so it describes the mismatch.
        self.assertEqual(count, count_ref,
                         "counters differ: kernel=%s, reference=%s" % (count, count_ref))
        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Compact operation took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #10
0
    def setUp(self):
        """
        Prepare the test image, the device buffers and the OpenCL program.

        The image is converted to float32 and cropped to 507 x 209; three
        device arrays of that shape are created (``gpu_in`` holds the image,
        ``gpu_tmp`` and ``gpu_out`` are uninitialised). The program is built
        from ``convolution.cl`` found next to the ``sift`` module.
        """
        # NOTE(review): scipy.misc.lena is not available in recent SciPy
        # releases -- confirm the SciPy version this suite targets.
        self.input = scipy.misc.lena().astype(numpy.float32)
        self.input = numpy.ascontiguousarray(self.input[0:507, 0:209])

        self.gpu_in = pyopencl.array.to_device(queue, self.input)
        self.gpu_tmp = pyopencl.array.empty(queue, self.input.shape, dtype=numpy.float32, order="C")
        self.gpu_out = pyopencl.array.empty(queue, self.input.shape, dtype=numpy.float32, order="C")

        kernel_path = os.path.join(os.path.dirname(os.path.abspath(sift.__file__)), "convolution.cl")
        # Context manager: the source file is closed as soon as it is read.
        with open(kernel_path) as kernel_file:
            kernel_src = kernel_file.read()
        self.program = pyopencl.Program(ctx, kernel_src).build()

        self.IMAGE_W = numpy.int32(self.input.shape[-1])
        self.IMAGE_H = numpy.int32(self.input.shape[0])
        self.wg = (256, 2)
        # calc_size receives the dimensions as (width, height).
        self.shape = calc_size((self.input.shape[1], self.input.shape[0]), self.wg)
Example #11
0
    def test_gradient(self):
        """
        Tests the gradient kernel (norm and orientation).

        ``g[1]`` from ``local_maxmin_setup()`` is processed by the
        ``compute_gradient_orientation`` kernel and by the ``my_gradient``
        reference; norm and orientation must both agree to within 1e-4.

        Side effects: sets ``self.mat``, ``self.height``, ``self.width``,
        ``self.gpu_mat``, ``self.gpu_grad``, ``self.gpu_ori`` and
        ``self.shape``.
        """
        (border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, scale,
         nb_keypoints, width, height, DOGS, g) = local_maxmin_setup()
        self.mat = numpy.ascontiguousarray(g[1])
        self.height, self.width = numpy.int32(self.mat.shape)
        self.gpu_mat = pyopencl.array.to_device(queue, self.mat)
        self.gpu_grad = pyopencl.array.empty(queue, self.mat.shape, dtype=numpy.float32, order="C")
        self.gpu_ori = pyopencl.array.empty(queue, self.mat.shape, dtype=numpy.float32, order="C")
        self.shape = calc_size((self.width, self.height), self.wg)

        t0 = time.time()
        k1 = self.program.compute_gradient_orientation(queue, self.shape, self.wg,
                                                       self.gpu_mat.data,
                                                       self.gpu_grad.data,
                                                       self.gpu_ori.data,
                                                       self.width, self.height)
        res_norm = self.gpu_grad.get()
        res_ori = self.gpu_ori.get()
        t1 = time.time()
        ref_norm, ref_ori = my_gradient(self.mat)
        t2 = time.time()
        delta_norm = abs(ref_norm - res_norm).max()
        delta_ori = abs(ref_ori - res_ori).max()

        if PRINT_KEYPOINTS:
            rmin, cmin = 0, 0
            rmax, cmax = rmin + 6, cmin + 6

            # Single-argument call form prints the same under Python 2 and 3.
            print(res_norm[-rmax, cmin:cmax])
            print("")
            print(ref_norm[-rmax, cmin:cmax])
            fig = pylab.figure()
            sp1 = fig.add_subplot(121)
            sp1.imshow(res_norm, interpolation="nearest")
            sp2 = fig.add_subplot(122)
            sp2.imshow(ref_norm, interpolation="nearest")
            fig.show()
            # raw_input only exists in Python 2; fall back to input otherwise.
            try:
                wait_for_key = raw_input
            except NameError:
                wait_for_key = input
            wait_for_key("enter")

        logger.info("delta_norm=%s", delta_norm)
        logger.info("delta_ori=%s", delta_ori)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta_norm < 1e-4, "delta_norm=%s" % (delta_norm))
        self.assertTrue(delta_ori < 1e-4, "delta_ori=%s" % (delta_ori))

        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Gradient computation took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #12
0
    def setUp(self):
        """
        Prepare the test image, the OpenCL programs and the work buffers.

        Builds two programs from files located next to the ``sift`` module:
        ``self.program`` from ``preprocess.cl`` and ``self.reduction`` from
        ``reductions.cl``. Also sets the input image (cropped to 510 x 511),
        the device array ``self.gpudata``, the image dimensions, the
        work-group size, the binning factors and the buffers used by the
        max/min reduction.
        """
        # NOTE(review): scipy.misc.lena is not available in recent SciPy
        # releases -- confirm the SciPy version this suite targets.
        self.input = numpy.ascontiguousarray(scipy.misc.lena()[:510, :511])
        self.gpudata = pyopencl.array.empty(queue, self.input.shape, dtype=numpy.float32, order="C")

        sift_dir = os.path.dirname(os.path.abspath(sift.__file__))
        kernel_path = os.path.join(sift_dir, "preprocess.cl")
        reduct_path = os.path.join(sift_dir, "reductions.cl")
        # Context managers close both source files as soon as they are read.
        with open(kernel_path) as kernel_file:
            kernel_src = kernel_file.read()
        with open(reduct_path) as reduct_file:
            reduct_src = reduct_file.read()
        self.program = pyopencl.Program(ctx, kernel_src).build()
        self.reduction = pyopencl.Program(ctx, reduct_src).build()

        self.IMAGE_W = numpy.int32(self.input.shape[-1])
        self.IMAGE_H = numpy.int32(self.input.shape[0])
        self.wg = (32, 16)  # other candidates: (256, 2), (2, 256)
        self.shape = calc_size((self.IMAGE_W, self.IMAGE_H), self.wg)
        # Note: if wg < output size, weird results are expected!
        self.binning = (4, 2)
        self.red_size = 128  # reduction size
        self.twofivefive = pyopencl.array.to_device(queue, numpy.array([255], numpy.float32))
        # temporary buffer for max/min reduction
        self.buffers_max_min = pyopencl.array.empty(queue, (self.red_size, 2), dtype=numpy.float32)
        # One-element buffers; the shape is an explicit tuple, since "(1)" is
        # just the integer 1.
        self.buffers_min = pyopencl.array.empty(queue, (1,), dtype=numpy.float32)
        self.buffers_max = pyopencl.array.empty(queue, (1,), dtype=numpy.float32)
Example #13
0
    def test_bin(self):
        """
        Test binning kernel.

        ``self.input`` is converted to a contiguous float32 array and binned
        by ``self.binning`` on the device. The reference is ``binning(...)``
        divided by both binning factors; the maximum absolute difference
        must stay below 1e-6. When ``PROFILE`` is set, timings are logged
        and a four-panel figure is shown until the user presses enter.
        """
        lint = numpy.ascontiguousarray(self.input, numpy.float32)

        # Output size along each axis: input size / binning factor, rounded up.
        out_shape = tuple(int(math.ceil(float(i) / j))
                          for i, j in zip(self.input.shape, self.binning))
        t0 = time.time()
        inp_gpu = pyopencl.array.to_device(queue, lint)
        out_gpu = pyopencl.array.empty(queue, out_shape, dtype=numpy.float32, order="C")
        # Factors and sizes are passed to the kernel with index 1 before index 0.
        k1 = self.program.bin(queue, calc_size((out_shape[1], out_shape[0]), self.wg), self.wg,
                              inp_gpu.data, out_gpu.data,
                              numpy.int32(self.binning[1]), numpy.int32(self.binning[0]),
                              numpy.int32(lint.shape[1]), numpy.int32(lint.shape[0]),
                              numpy.int32(out_shape[1]), numpy.int32(out_shape[0]))
        res = out_gpu.get()
        t1 = time.time()
        ref = binning(lint, self.binning) / self.binning[0] / self.binning[1]
        t2 = time.time()
        delta = abs(ref - res).max()
        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Binning took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
            fig = pylab.figure()
            fig.suptitle('Binning by %s,%s' % self.binning)
            sp1 = fig.add_subplot(221)
            sp1.imshow(lint, interpolation="nearest")
            sp1.set_title("Input")
            sp2 = fig.add_subplot(222)
            sp2.imshow(ref, interpolation="nearest")
            sp2.set_title("Reference")
            sp3 = fig.add_subplot(223)
            sp3.imshow(ref - res, interpolation="nearest")
            sp3.set_title("Delta= %s" % delta)
            sp4 = fig.add_subplot(224)
            sp4.imshow(res, interpolation="nearest")
            sp4.set_title("GPU")
            fig.show()
            # raw_input only exists in Python 2; fall back to input otherwise.
            try:
                wait_for_key = raw_input
            except NameError:
                wait_for_key = input
            wait_for_key("enter")
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(delta < 1e-6, "delta=%s" % delta)
Example #14
0
    def test_transform(self):
        """
        Tests the transform kernel.

        An image is deformed with a fixed affine transformation (SciPy), a
        correcting transformation is estimated by
        ``self.matching_correction`` and applied to the deformed image both
        by the OpenCL ``transform`` kernel and by SciPy. The maximum
        difference between the OpenCL result and the input image is printed
        and the four images are displayed.
        """
        # NOTE(review): this test makes no assertion on the computed error --
        # confirm whether a tolerance check is intended.

        # original image
        image = scipy.misc.ascent().astype(numpy.float32)
        image = numpy.ascontiguousarray(image[0:512, 0:512])
        image_height, image_width = image.shape

        # transformation
        matrix = numpy.array([[1.0, -0.75], [0.7, 0.5]], dtype=numpy.float32)
        offset_value = numpy.array([250.0, -150.0], dtype=numpy.float32)

        # scipy.ndimage.affine_transform is the public name of the function
        # formerly reached through the deprecated scipy.ndimage.interpolation.
        image2 = scipy.ndimage.affine_transform(
            image, matrix, offset=offset_value, order=1, mode="constant")

        fill_value = numpy.float32(0.0)
        mode = numpy.int32(1)

        output_height, output_width = image_height * 2, image_width * 2
        image, image_height, image_width = self.image_reshape(
            image, output_height, output_width, image_height, image_width)
        # NOTE(review): shape[1] is passed before shape[0] here, unlike the
        # call above; harmless for a square image -- confirm the intent.
        image2, image2_height, image2_width = self.image_reshape(
            image2, output_height, output_width, image2.shape[1],
            image2.shape[0])
        print("Image : (%s, %s) -- Output: (%s, %s)" % (
            image_height, image_width, output_height, output_width))

        # perform correction by least square
        sol, MSE = self.matching_correction(image, image2)
        print(sol)

        # sol holds the 2x2 matrix in entries 0-1 and 3-4, the offsets in 2 and 5.
        correction_matrix = numpy.zeros((2, 2), dtype=numpy.float32)
        correction_matrix[0] = sol[0:2, 0]
        correction_matrix[1] = sol[3:5, 0]
        matrix_for_gpu = correction_matrix.reshape(4, 1)  # for float4 struct
        offset_value[0] = sol[2, 0]
        offset_value[1] = sol[5, 0]

        wg = 8, 8
        shape = calc_size((output_width, output_height), wg)
        gpu_image = pyopencl.array.to_device(queue, image2)
        gpu_output = pyopencl.array.empty(queue, (output_height, output_width),
                                          dtype=numpy.float32, order="C")
        gpu_matrix = pyopencl.array.to_device(queue, matrix_for_gpu)
        gpu_offset = pyopencl.array.to_device(queue, offset_value)
        image_height, image_width = numpy.int32((image_height, image_width))
        output_height, output_width = numpy.int32(
            (output_height, output_width))

        t0 = time.time()
        k1 = self.program.transform(queue, shape, wg, gpu_image.data,
                                    gpu_output.data, gpu_matrix.data,
                                    gpu_offset.data, image_width, image_height,
                                    output_width, output_height, fill_value,
                                    mode)
        res = gpu_output.get()
        t1 = time.time()

        ref = scipy.ndimage.affine_transform(
            image2,
            correction_matrix,
            offset=offset_value,
            output_shape=(output_height, output_width),
            order=1,
            mode="constant",
            cval=fill_value)
        t2 = time.time()

        delta = abs(res - image)
        delta_arg = delta.argmax()
        delta_max = delta.max()
        # argmax() returns a flat index. divmod keeps the row an integer on
        # every Python version, whereas "/" yields a float under Python 3.
        at_0, at_1 = divmod(delta_arg, output_width)
        print("Max error: %f at (%d, %d)" % (delta_max, at_0, at_1))
        print("minimal MSE according to least squares : %f" % MSE)

        SHOW_FIGURES = True
        if SHOW_FIGURES:
            fig = pylab.figure()
            sp1 = fig.add_subplot(221, title="Input image")
            sp1.imshow(image, interpolation="nearest")
            sp2 = fig.add_subplot(222, title="Image after deformation")
            sp2.imshow(image2, interpolation="nearest")
            sp2 = fig.add_subplot(223, title="Corrected image (OpenCL)")
            sp2.imshow(res, interpolation="nearest")
            sp2 = fig.add_subplot(224, title="Corrected image (Scipy)")
            sp2.imshow(ref, interpolation="nearest")
            fig.show()
            # raw_input only exists in Python 2; fall back to input otherwise.
            try:
                wait_for_key = raw_input
            except NameError:
                wait_for_key = input
            wait_for_key("enter")

        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Transformation took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #15
0
    def test_local_maxmin(self):
        """
        Tests the local maximum/minimum detection kernel.

        The OpenCL ``local_maxmin`` kernel is run on the data returned by
        ``local_maxmin_setup()`` and its output is compared with the output
        of the ``my_local_maxmin`` reference. The order in which the GPU
        writes the keypoints is unknown, so columns 0, 1 and 2 of both
        outputs are sorted independently before being compared.

        Side effects: sets ``self.s``, ``self.gpu_dogs``, ``self.output``,
        ``self.counter``, ``self.shape``, ``self.keypoints1`` and
        ``self.actual_nb_keypoints`` (kept "for further use").
        """
        # The last value returned by the setup helper is not needed here.
        (border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, s,
         nb_keypoints, width, height, DOGS, _) = local_maxmin_setup()

        self.s = numpy.int32(s)  # 1, 2, 3 ... not 4 nor 0.
        self.gpu_dogs = pyopencl.array.to_device(queue, DOGS)
        self.output = pyopencl.array.empty(queue, (nb_keypoints, 4),
                                           dtype=numpy.float32,
                                           order="C")
        self.output.fill(-1.0, queue)  # memset for invalid keypoints
        self.counter = pyopencl.array.zeros(queue, (1, ),
                                            dtype=numpy.int32,
                                            order="C")
        nb_keypoints = numpy.int32(nb_keypoints)
        # DOGS is a 3D array: the first and last axes are folded together
        # to build the 2D launch grid.
        self.shape = calc_size((DOGS.shape[1], DOGS.shape[0] * DOGS.shape[2]),
                               self.wg)

        t0 = time.time()
        k1 = self.program.local_maxmin(queue, self.shape, self.wg,
                                       self.gpu_dogs.data, self.output.data,
                                       border_dist, peakthresh, octsize,
                                       EdgeThresh0, EdgeThresh,
                                       self.counter.data, nb_keypoints, self.s,
                                       width, height)

        res = self.output.get()
        self.keypoints1 = self.output  # for further use
        self.actual_nb_keypoints = self.counter.get()[0]  # for further use

        t1 = time.time()
        # The second value returned by the reference is not checked here.
        ref, _ = my_local_maxmin(DOGS, peakthresh, border_dist, octsize,
                                 EdgeThresh0, EdgeThresh, nb_keypoints,
                                 self.s, width, height)
        t2 = time.time()

        # We have to sort the arrays, as the order of the peaks is unknown
        # on the GPU. Each column is sorted on its own.
        res_peaks = res[(res[:, 0].argsort(axis=0)), 0]
        ref_peaks = ref[(ref[:, 0].argsort(axis=0)), 0]
        res_r = res[(res[:, 1].argsort(axis=0)), 1]
        ref_r = ref[(ref[:, 1].argsort(axis=0)), 1]
        res_c = res[(res[:, 2].argsort(axis=0)), 2]
        ref_c = ref[(ref[:, 2].argsort(axis=0)), 2]
        delta_peaks = abs(ref_peaks - res_peaks).max()
        delta_r = abs(ref_r - res_r).max()
        delta_c = abs(ref_c - res_c).max()

        if PRINT_KEYPOINTS:
            print(
                "keypoints after 2 steps of refinement: (s= %s, octsize=%s) %s"
                % (self.s, octsize, self.actual_nb_keypoints))
            print(res[0:self.actual_nb_keypoints])

        # Log before asserting so the deltas are still reported on failure.
        logger.info("delta_peaks=%s", delta_peaks)
        logger.info("delta_r=%s", delta_r)
        logger.info("delta_c=%s", delta_c)

        self.assertTrue(delta_peaks < 1e-4, "delta_peaks=%s" % (delta_peaks))
        self.assertTrue(delta_r < 1e-4, "delta_r=%s" % (delta_r))
        self.assertTrue(delta_c < 1e-4, "delta_c=%s" % (delta_c))

        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Local extrema search took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))
Example #16
0
    def test_transform(self):
        """
        Tests the transform kernel.

        Steps:

        1. A 512x512 crop of the scipy "ascent" image is deformed with a
           fixed affine transformation.
        2. Both images are passed through ``self.image_reshape`` together
           with an output size twice as large in each direction.
        3. ``self.matching_correction`` returns a least-squares solution
           ``sol`` (and its MSE) from which a 2x2 correction matrix and a
           2-element offset are extracted.
        4. The correction is applied to the deformed image on the GPU
           (``transform`` kernel) and on the CPU (scipy ``affine_transform``).
        5. The maximum absolute difference between the GPU result and the
           original image is printed, and the images are optionally shown.

        NOTE: this test only prints its results; it makes no assertion.
        """
        # original image
        image = scipy.misc.ascent().astype(numpy.float32)
        image = numpy.ascontiguousarray(image[0:512, 0:512])
        image_height, image_width = image.shape

        # deformation applied to the original image
        matrix = numpy.array([[1.0, -0.75], [0.7, 0.5]], dtype=numpy.float32)
        offset_value = numpy.array([250.0, -150.0], dtype=numpy.float32)
        image2 = scipy.ndimage.interpolation.affine_transform(
            image, matrix, offset=offset_value, order=1, mode="constant")

        fill_value = numpy.float32(0.0)
        mode = numpy.int32(1)

        output_height, output_width = image_height * 2, image_width * 2
        image, image_height, image_width = self.image_reshape(
            image, output_height, output_width, image_height, image_width)
        # NOTE(review): shape[1] is passed here where the call above passes
        # the image height (shape[0]). Harmless while the image is square,
        # but the argument order should be confirmed against image_reshape.
        image2, _, _ = self.image_reshape(
            image2, output_height, output_width,
            image2.shape[1], image2.shape[0])
        print("Image : (%s, %s) -- Output: (%s, %s)"
              % (image_height, image_width, output_height, output_width))

        # perform correction by least square
        sol, MSE = self.matching_correction(image, image2)
        print(sol)

        # sol is read as a column of 6 values: rows 0-1 and 3-4 hold the two
        # matrix rows, rows 2 and 5 hold the two offset components.
        correction_matrix = numpy.zeros((2, 2), dtype=numpy.float32)
        correction_matrix[0] = sol[0:2, 0]
        correction_matrix[1] = sol[3:5, 0]
        matrix_for_gpu = correction_matrix.reshape(4, 1)  # for float4 struct
        offset_value[0] = sol[2, 0]
        offset_value[1] = sol[5, 0]

        wg = 8, 8
        shape = calc_size((output_width, output_height), wg)
        gpu_image = pyopencl.array.to_device(queue, image2)
        gpu_output = pyopencl.array.empty(queue,
                                          (output_height, output_width),
                                          dtype=numpy.float32,
                                          order="C")
        gpu_matrix = pyopencl.array.to_device(queue, matrix_for_gpu)
        gpu_offset = pyopencl.array.to_device(queue, offset_value)
        # the kernel receives the sizes as 32-bit integers
        image_height, image_width = numpy.int32((image_height, image_width))
        output_height, output_width = numpy.int32(
            (output_height, output_width))

        t0 = time.time()
        k1 = self.program.transform(queue, shape, wg,
                                    gpu_image.data, gpu_output.data,
                                    gpu_matrix.data, gpu_offset.data,
                                    image_width, image_height,
                                    output_width, output_height,
                                    fill_value, mode)
        res = gpu_output.get()
        t1 = time.time()

        ref = scipy.ndimage.interpolation.affine_transform(
            image2, correction_matrix,
            offset=offset_value,
            output_shape=(output_height, output_width),
            order=1, mode="constant", cval=fill_value)
        t2 = time.time()

        delta = abs(res - image)
        delta_arg = delta.argmax()
        delta_max = delta.max()
        # argmax() returns a flat index: convert it back to (row, column).
        # divmod keeps both values integral on Python 2 and Python 3.
        at_0, at_1 = divmod(delta_arg, output_width)
        print("Max error: %f at (%d, %d)" % (delta_max, at_0, at_1))
        print("minimal MSE according to least squares : %f" % MSE)

        SHOW_FIGURES = True
        if SHOW_FIGURES:
            fig = pylab.figure()
            sp1 = fig.add_subplot(221, title="Input image")
            sp1.imshow(image, interpolation="nearest")
            sp2 = fig.add_subplot(222, title="Image after deformation")
            sp2.imshow(image2, interpolation="nearest")
            sp3 = fig.add_subplot(223, title="Corrected image (OpenCL)")
            sp3.imshow(res, interpolation="nearest")
            sp4 = fig.add_subplot(224, title="Corrected image (Scipy)")
            sp4.imshow(ref, interpolation="nearest")
            fig.show()
            # raw_input() only exists on Python 2; use input() on Python 3.
            try:
                wait_for_user = raw_input
            except NameError:
                wait_for_user = input
            wait_for_user("enter")

        if PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.",
                        1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
            logger.info("Transformation took %.3fms",
                        1e-6 * (k1.profile.end - k1.profile.start))