def test_combine(self):
    """
    tests the combine (linear combination) kernel
    """
    width = numpy.int32(157)
    height = numpy.int32(147)
    coeff1 = numpy.random.rand(1)[0].astype(numpy.float32)
    coeff2 = numpy.random.rand(1)[0].astype(numpy.float32)
    mat1 = numpy.random.rand(height, width).astype(numpy.float32)
    mat2 = numpy.random.rand(height, width).astype(numpy.float32)
    gpu_mat1 = pyopencl.array.to_device(queue, mat1)
    gpu_mat2 = pyopencl.array.to_device(queue, mat2)
    gpu_out = pyopencl.array.empty(queue, mat1.shape, dtype=numpy.float32, order="C")
    shape = calc_size((width, height), self.wg)
    t0 = time.time()
    k1 = self.program.combine(queue, shape, self.wg,
                              gpu_mat1.data, coeff1, gpu_mat2.data, coeff2,
                              gpu_out.data, numpy.int32(0), width, height)
    res = gpu_out.get()
    t1 = time.time()
    ref = my_combine(mat1, coeff1, mat2, coeff2)
    t2 = time.time()
    delta = abs(ref - res).max()
    logger.info("delta=%s" % delta)
    self.assert_(delta < 1e-4, "delta=%s" % delta)
    if PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Linear combination took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
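
# Hedged sketch (not part of the original file): the CPU reference `my_combine`
# used above is defined in the test helpers. Given that the kernel under test
# computes a linear combination, a NumPy equivalent would plausibly be:
def my_combine_sketch(mat1, coeff1, mat2, coeff2):
    """Element-wise linear combination, matching what `combine` is expected to do."""
    return coeff1 * mat1 + coeff2 * mat2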
def setUp(self):
    self.input = scipy.misc.lena().astype(numpy.float32)
    self.input = numpy.ascontiguousarray(self.input[0:507, 0:209])
    self.gpu_in = pyopencl.array.to_device(queue, self.input)
    self.gpu_tmp = pyopencl.array.empty(queue, self.input.shape, dtype=numpy.float32, order="C")
    self.gpu_out = pyopencl.array.empty(queue, self.input.shape, dtype=numpy.float32, order="C")
    kernel_path = os.path.join(os.path.dirname(os.path.abspath(sift.__file__)), "convolution.cl")
    kernel_src = open(kernel_path).read()
    # compile_options = "-D NIMAGE=%i" % self.input.size
    # logger.info("Compiling file %s with options %s" % (kernel_path, compile_options))
    # self.program = pyopencl.Program(ctx, kernel_src).build(options=compile_options)
    self.program = pyopencl.Program(ctx, kernel_src).build()
    self.IMAGE_W = numpy.int32(self.input.shape[-1])
    self.IMAGE_H = numpy.int32(self.input.shape[0])
    self.wg = (256, 2)
    self.shape = calc_size((self.input.shape[1], self.input.shape[0]), self.wg)
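
# Hedged sketch (assumption, not from this file): `calc_size` comes from the
# sift helper module. OpenCL requires the global work size to be a multiple of
# the work-group size, so a plausible implementation rounds each dimension up
# (`math` is imported at module level, as in test_bin below):
def calc_size_sketch(shape, blocksize):
    """Round every dimension of `shape` up to the next multiple of `blocksize`."""
    return tuple(int(math.ceil(float(s) / b)) * b for s, b in zip(shape, blocksize))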
def test_local_maxmin(self):
    """
    tests the local maximum/minimum detection kernel
    """
    # local_maxmin_setup:
    border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, s, nb_keypoints, width, height, DOGS, g = local_maxmin_setup()
    self.s = numpy.int32(s)  # 1, 2, 3 ... not 4 nor 0.
    self.gpu_dogs = pyopencl.array.to_device(queue, DOGS)
    self.output = pyopencl.array.empty(queue, (nb_keypoints, 4), dtype=numpy.float32, order="C")
    self.output.fill(-1.0, queue)  # memset for invalid keypoints
    self.counter = pyopencl.array.zeros(queue, (1,), dtype=numpy.int32, order="C")
    nb_keypoints = numpy.int32(nb_keypoints)
    self.shape = calc_size((DOGS.shape[1], DOGS.shape[0] * DOGS.shape[2]), self.wg)  # DOGS is a 3D array!
    t0 = time.time()
    k1 = self.program.local_maxmin(queue, self.shape, self.wg,
                                   self.gpu_dogs.data, self.output.data,
                                   border_dist, peakthresh, octsize,
                                   EdgeThresh0, EdgeThresh,
                                   self.counter.data, nb_keypoints, self.s,
                                   width, height)
    res = self.output.get()
    self.keypoints1 = self.output  # for further use
    self.actual_nb_keypoints = self.counter.get()[0]  # for further use
    t1 = time.time()
    ref, actual_nb_keypoints2 = my_local_maxmin(DOGS, peakthresh, border_dist, octsize,
                                                EdgeThresh0, EdgeThresh, nb_keypoints,
                                                self.s, width, height)
    t2 = time.time()
    # the order of the peaks returned by the GPU is unknown, so sort each column before comparing
    res_peaks = res[(res[:, 0].argsort(axis=0)), 0]
    ref_peaks = ref[(ref[:, 0].argsort(axis=0)), 0]
    res_r = res[(res[:, 1].argsort(axis=0)), 1]
    ref_r = ref[(ref[:, 1].argsort(axis=0)), 1]
    res_c = res[(res[:, 2].argsort(axis=0)), 2]
    ref_c = ref[(ref[:, 2].argsort(axis=0)), 2]
    # res_s = res[(res[:, 3].argsort(axis=0)), 3]
    # ref_s = ref[(ref[:, 3].argsort(axis=0)), 3]
    delta_peaks = abs(ref_peaks - res_peaks).max()
    delta_r = abs(ref_r - res_r).max()
    delta_c = abs(ref_c - res_c).max()
    if PRINT_KEYPOINTS:
        print("keypoints after 2 steps of refinement: (s=%s, octsize=%s) %s" % (self.s, octsize, self.actual_nb_keypoints))
        # print("For ref: %s" % (ref_peaks[ref_peaks != -1].shape))
        print(res[0:self.actual_nb_keypoints])  # [0:74]
        # print(ref[0:32])
    self.assert_(delta_peaks < 1e-4, "delta_peaks=%s" % delta_peaks)
    self.assert_(delta_r < 1e-4, "delta_r=%s" % delta_r)
    self.assert_(delta_c < 1e-4, "delta_c=%s" % delta_c)
    logger.info("delta_peaks=%s" % delta_peaks)
    logger.info("delta_r=%s" % delta_r)
    logger.info("delta_c=%s" % delta_c)
    if PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Local extrema search took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
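
# Hedged sketch (assumption): `my_local_maxmin` is the CPU reference from the
# test helpers. Its core test is the classic 26-neighbour extremum check on the
# DoG stack; the edge-response rejection (EdgeThresh) is omitted here, and DOGS
# is assumed indexed as [scale, row, column] with the point at least one sample
# away from every border (the kernel enforces border_dist):
def is_local_extremum_sketch(DOGS, s, r, c, peakthresh):
    """True if |DOGS[s, r, c]| exceeds peakthresh and is a maximum or minimum
    of its 3x3x3 neighbourhood across scales s-1, s, s+1 (ties count)."""
    v = DOGS[s, r, c]
    if abs(v) < peakthresh:
        return False
    block = DOGS[s - 1:s + 2, r - 1:r + 2, c - 1:c + 2]
    return v >= block.max() or v <= block.min()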
def test_compact(self):
    """
    tests the "compact" kernel
    """
    nbkeypoints = 10000  # constant value
    keypoints = numpy.random.rand(nbkeypoints, 4).astype(numpy.float32)
    nb_ones = 0
    for i in range(0, nbkeypoints):
        if numpy.random.rand(1)[0] < 0.25:
            keypoints[i] = (-1, -1, i, -1)
            nb_ones += 1
        else:
            keypoints[i, 2] = i
    gpu_keypoints = pyopencl.array.to_device(queue, keypoints)
    output = pyopencl.array.empty(queue, (nbkeypoints, 4), dtype=numpy.float32, order="C")
    output.fill(-1.0, queue)
    counter = pyopencl.array.zeros(queue, (1,), dtype=numpy.int32, order="C")
    wg = max(self.wg),
    shape = calc_size((keypoints.shape[0],), wg)
    nbkeypoints = numpy.int32(nbkeypoints)
    startkeypoints = numpy.int32(0)
    t0 = time.time()
    k1 = self.program.compact(queue, shape, wg,
                              gpu_keypoints.data, output.data, counter.data,
                              startkeypoints, nbkeypoints)
    res = output.get()
    if PRINT_KEYPOINTS:
        print(res)
    count = counter.get()[0]
    t1 = time.time()
    ref, count_ref = my_compact(keypoints, nbkeypoints)
    t2 = time.time()
    print("Kernel counter: %s / Python counter: %s / True value: %s" % (count, count_ref, nbkeypoints - nb_ones))
    res_sort_arg = res[:, 2].argsort(axis=0)
    res_sort = res[res_sort_arg]
    ref_sort_arg = ref[:, 2].argsort(axis=0)
    ref_sort = ref[ref_sort_arg]
    if PRINT_KEYPOINTS:
        print("Delta matrix:")
        print((abs(res_sort - ref_sort) > 1e-5).sum())
    delta = abs(res_sort - ref_sort).max()
    self.assert_(delta < 1e-5, "delta=%s" % delta)
    self.assertEqual(count, count_ref, "GPU and Python counters differ")
    logger.info("delta=%s" % delta)
    if PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Compact operation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
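
# Hedged sketch (assumption): `my_compact` is the CPU reference from the test
# helpers. Based on the setup above, invalid keypoints are the rows set to
# (-1, -1, i, -1), and compaction gathers the valid rows to the front while the
# rest stays filled with -1:
def my_compact_sketch(keypoints, nbkeypoints):
    """Return (compacted keypoint array, number of valid keypoints)."""
    valid = keypoints[:, 1] != -1  # column 1 is -1 only for invalidated rows
    output = -numpy.ones((nbkeypoints, 4), dtype=numpy.float32)
    count = int(valid.sum())
    output[:count] = keypoints[valid]
    return output, count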
def test_interpolation(self):
    """
    tests the keypoints interpolation kernel
    Requires the following: "self.keypoints1", "self.actual_nb_keypoints",
    "self.gpu_dog_prev", "self.gpu_dog", "self.gpu_dog_next", "self.s",
    "self.width", "self.height", "self.peakthresh"
    """
    # interpolation_setup:
    border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, nb_keypoints, actual_nb_keypoints, width, height, DOGS, s, keypoints_prev, blur = interpolation_setup()
    # actual_nb_keypoints is the number of keypoints returned by "local_maxmin".
    # After the interpolation it will be reduced, but we can still use it as a boundary.
    shape = calc_size(keypoints_prev.shape, self.wg)
    gpu_dogs = pyopencl.array.to_device(queue, DOGS)
    gpu_keypoints1 = pyopencl.array.to_device(queue, keypoints_prev)
    # actual_nb_keypoints = numpy.int32(len((keypoints_prev[:, 0])[keypoints_prev[:, 1] != -1]))
    start_keypoints = numpy.int32(0)
    actual_nb_keypoints = numpy.int32(actual_nb_keypoints)
    InitSigma = numpy.float32(1.6)  # warning: it must be the same in my_keypoints_interpolation
    t0 = time.time()
    k1 = self.program.interp_keypoint(queue, shape, self.wg,
                                      gpu_dogs.data, gpu_keypoints1.data,
                                      start_keypoints, actual_nb_keypoints,
                                      peakthresh, InitSigma, width, height)
    res = gpu_keypoints1.get()
    t1 = time.time()
    ref = numpy.copy(keypoints_prev)  # important here
    for i, k in enumerate(ref[:nb_keypoints, :]):
        ref[i] = my_interp_keypoint(DOGS, s, k[1], k[2], 5, peakthresh, width, height)
    t2 = time.time()
    # compare only the keypoints different from (-1, -1, -1, -1)
    res2 = res[res[:, 1] != -1]
    ref2 = ref[ref[:, 1] != -1]
    if PRINT_KEYPOINTS:
        print("[s=%s] Keypoints before interpolation: %s" % (s, actual_nb_keypoints))
        # print(keypoints_prev[0:10, :])
        print("[s=%s] Keypoints after interpolation: %s" % (s, res2.shape[0]))
        print(res[0:actual_nb_keypoints])  # [0:10, :]
        # print("Ref:")
        # print(ref[0:32, :])
    delta = abs(ref2 - res2).max()
    self.assert_(delta < 1e-4, "delta=%s" % delta)
    logger.info("delta=%s" % delta)
    if PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Keypoints interpolation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
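
# Hedged sketch (assumption): `my_interp_keypoint` presumably follows Lowe's
# sub-pixel refinement, fitting a 3D quadratic to the DoG around the candidate
# and solving H * offset = -g. The convergence loop and rejection tests of the
# real reference are not reproduced; DOGS is assumed indexed [scale, row, col]:
def interp_offset_sketch(DOGS, s, r, c):
    """One Newton step on the 3D quadratic fit of the DoG at (s, r, c)."""
    D = DOGS.astype(numpy.float64)
    # gradient by central differences
    g = 0.5 * numpy.array([D[s + 1, r, c] - D[s - 1, r, c],
                           D[s, r + 1, c] - D[s, r - 1, c],
                           D[s, r, c + 1] - D[s, r, c - 1]])
    # Hessian by central differences
    H = numpy.empty((3, 3))
    H[0, 0] = D[s + 1, r, c] + D[s - 1, r, c] - 2 * D[s, r, c]
    H[1, 1] = D[s, r + 1, c] + D[s, r - 1, c] - 2 * D[s, r, c]
    H[2, 2] = D[s, r, c + 1] + D[s, r, c - 1] - 2 * D[s, r, c]
    H[0, 1] = H[1, 0] = 0.25 * (D[s + 1, r + 1, c] - D[s + 1, r - 1, c] - D[s - 1, r + 1, c] + D[s - 1, r - 1, c])
    H[0, 2] = H[2, 0] = 0.25 * (D[s + 1, r, c + 1] - D[s + 1, r, c - 1] - D[s - 1, r, c + 1] + D[s - 1, r, c - 1])
    H[1, 2] = H[2, 1] = 0.25 * (D[s, r + 1, c + 1] - D[s, r + 1, c - 1] - D[s, r - 1, c + 1] + D[s, r - 1, c - 1])
    return numpy.linalg.solve(H, -g)  # (ds, dr, dc) sub-pixel offset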
def test_gradient(self):
    """
    tests the gradient kernel (norm and orientation)
    """
    border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, scale, nb_keypoints, width, height, DOGS, g = local_maxmin_setup()
    self.mat = numpy.ascontiguousarray(g[1])
    self.height, self.width = numpy.int32(self.mat.shape)
    self.gpu_mat = pyopencl.array.to_device(queue, self.mat)
    self.gpu_grad = pyopencl.array.empty(queue, self.mat.shape, dtype=numpy.float32, order="C")
    self.gpu_ori = pyopencl.array.empty(queue, self.mat.shape, dtype=numpy.float32, order="C")
    self.shape = calc_size((self.width, self.height), self.wg)
    t0 = time.time()
    k1 = self.program.compute_gradient_orientation(queue, self.shape, self.wg,
                                                   self.gpu_mat.data,
                                                   self.gpu_grad.data,
                                                   self.gpu_ori.data,
                                                   self.width, self.height)
    res_norm = self.gpu_grad.get()
    res_ori = self.gpu_ori.get()
    t1 = time.time()
    ref_norm, ref_ori = my_gradient(self.mat)
    t2 = time.time()
    delta_norm = abs(ref_norm - res_norm).max()
    delta_ori = abs(ref_ori - res_ori).max()
    if PRINT_KEYPOINTS:
        rmin, cmin = 0, 0
        rmax, cmax = rmin + 6, cmin + 6
        print(res_norm[-rmax, cmin:cmax])
        print("")
        print(ref_norm[-rmax, cmin:cmax])
        fig = pylab.figure()
        sp1 = fig.add_subplot(121)
        sp1.imshow(res_norm, interpolation="nearest")
        sp2 = fig.add_subplot(122)
        sp2.imshow(ref_norm, interpolation="nearest")
        fig.show()
        raw_input("enter")
    self.assert_(delta_norm < 1e-4, "delta_norm=%s" % delta_norm)
    self.assert_(delta_ori < 1e-4, "delta_ori=%s" % delta_ori)
    logger.info("delta_norm=%s" % delta_norm)
    logger.info("delta_ori=%s" % delta_ori)
    if PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Gradient computation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
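
# Hedged sketch (assumption): `my_gradient` is the CPU reference from the test
# helpers. SIFT typically uses central differences, a Euclidean norm and an
# arctan2 orientation; the exact difference scheme, border handling and angle
# convention of the real reference are assumptions here:
def my_gradient_sketch(mat):
    """Return (gradient norm, gradient orientation) of a 2D image."""
    dr, dc = numpy.gradient(mat.astype(numpy.float64))  # central differences
    norm = numpy.sqrt(dr ** 2 + dc ** 2)
    ori = numpy.arctan2(dr, dc)  # orientation convention is an assumption
    return norm.astype(numpy.float32), ori.astype(numpy.float32)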
def setUp(self):
    self.input = numpy.ascontiguousarray(scipy.misc.lena()[:510, :511])
    self.gpudata = pyopencl.array.empty(queue, self.input.shape, dtype=numpy.float32, order="C")
    kernel_path = os.path.join(os.path.dirname(os.path.abspath(sift.__file__)), "preprocess.cl")
    reduct_path = os.path.join(os.path.dirname(os.path.abspath(sift.__file__)), "reductions.cl")
    kernel_src = open(kernel_path).read()
    reduct_src = open(reduct_path).read()
    self.program = pyopencl.Program(ctx, kernel_src).build()
    self.reduction = pyopencl.Program(ctx, reduct_src).build()
    self.IMAGE_W = numpy.int32(self.input.shape[-1])
    self.IMAGE_H = numpy.int32(self.input.shape[0])
    self.wg = (32, 16)  # other candidates: (256, 2), (2, 256)
    self.shape = calc_size((self.IMAGE_W, self.IMAGE_H), self.wg)
    # print(self.shape)
    self.binning = (4, 2)  # nota: if wg < output size, weird results are expected!
    # self.binning = (2, 2)
    self.red_size = 128  # reduction size
    self.twofivefive = pyopencl.array.to_device(queue, numpy.array([255], numpy.float32))
    # temporary buffer for max/min reduction
    self.buffers_max_min = pyopencl.array.empty(queue, (self.red_size, 2), dtype=numpy.float32)
    self.buffers_min = pyopencl.array.empty(queue, (1,), dtype=numpy.float32)
    self.buffers_max = pyopencl.array.empty(queue, (1,), dtype=numpy.float32)
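
# Hedged sketch (assumption): the 255 constant and the max/min reduction buffers
# above suggest the preprocess kernels rescale the image to the [0, 255] range
# after a two-stage max/min reduction. A NumPy equivalent of that normalization
# would plausibly be:
def normalize_image_sketch(img):
    """Linearly rescale an image so its values span [0, 255]."""
    mini, maxi = img.min(), img.max()
    return (255.0 * (img - mini) / (maxi - mini)).astype(numpy.float32)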
def test_bin(self):
    """
    Test binning kernel
    """
    lint = numpy.ascontiguousarray(self.input, numpy.float32)
    out_shape = tuple(int(math.ceil(float(i) / j)) for i, j in zip(self.input.shape, self.binning))
    t0 = time.time()
    inp_gpu = pyopencl.array.to_device(queue, lint)
    out_gpu = pyopencl.array.empty(queue, out_shape, dtype=numpy.float32, order="C")
    k1 = self.program.bin(queue, calc_size((out_shape[1], out_shape[0]), self.wg), self.wg,
                          inp_gpu.data, out_gpu.data,
                          numpy.int32(self.binning[1]), numpy.int32(self.binning[0]),
                          numpy.int32(lint.shape[1]), numpy.int32(lint.shape[0]),
                          numpy.int32(out_shape[1]), numpy.int32(out_shape[0]))
    res = out_gpu.get()
    t1 = time.time()
    ref = binning(lint, self.binning) / self.binning[0] / self.binning[1]
    t2 = time.time()
    # print(ref.shape, res.shape)
    delta = abs(ref - res).max()
    if PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Binning took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
    fig = pylab.figure()
    fig.suptitle('Binning by %s,%s' % self.binning)
    sp1 = fig.add_subplot(221)
    sp1.imshow(lint, interpolation="nearest")
    sp1.set_title("Input")
    sp2 = fig.add_subplot(222)
    sp2.imshow(ref, interpolation="nearest")
    sp2.set_title("Reference")
    sp3 = fig.add_subplot(223)
    sp3.imshow(ref - res, interpolation="nearest")
    sp3.set_title("Delta= %s" % delta)
    sp4 = fig.add_subplot(224)
    sp4.imshow(res, interpolation="nearest")
    sp4.set_title("GPU")
    fig.show()
    raw_input("enter")
    self.assert_(delta < 1e-6, "delta=%s" % delta)
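
# Hedged sketch (assumption): `binning` comes from the test helpers; since the
# reference above divides its result by binning[0] * binning[1], it presumably
# sums over each bin. For shapes that are exact multiples of the bin size, a
# NumPy equivalent is:
def binning_sketch(img, bin_size):
    """Sum the image over (bin_size[0] x bin_size[1]) tiles."""
    h, w = img.shape
    by, bx = bin_size
    return img.reshape(h // by, by, w // bx, bx).sum(axis=(1, 3))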
def test_transform(self):
    '''
    tests transform kernel
    '''
    # original image
    image = scipy.misc.ascent().astype(numpy.float32)
    image = numpy.ascontiguousarray(image[0:512, 0:512])
    image_height, image_width = image.shape
    # transformation
    angle = 1.9  # numpy.pi/5.0
    # matrix = numpy.array([[numpy.cos(angle), -numpy.sin(angle)], [numpy.sin(angle), numpy.cos(angle)]], dtype=numpy.float32)
    # offset_value = numpy.array([1000.0, 100.0], dtype=numpy.float32)
    # matrix = numpy.array([[0.9, 0.2], [-0.4, 0.9]], dtype=numpy.float32)
    # offset_value = numpy.array([-20.0, 256.0], dtype=numpy.float32)
    matrix = numpy.array([[1.0, -0.75], [0.7, 0.5]], dtype=numpy.float32)
    offset_value = numpy.array([250.0, -150.0], dtype=numpy.float32)
    image2 = scipy.ndimage.interpolation.affine_transform(image, matrix, offset=offset_value, order=1, mode="constant")
    fill_value = numpy.float32(0.0)
    mode = numpy.int32(1)
    output_height, output_width = image_height * 2, image_width * 2
    image, image_height, image_width = self.image_reshape(image, output_height, output_width, image_height, image_width)
    # note: width/height argument order differs from the call above; harmless here since image2 is square
    image2, image2_height, image2_width = self.image_reshape(image2, output_height, output_width, image2.shape[1], image2.shape[0])
    print("Image: (%s, %s) -- Output: (%s, %s)" % (image_height, image_width, output_height, output_width))
    # perform correction by least squares
    sol, MSE = self.matching_correction(image, image2)
    print(sol)
    correction_matrix = numpy.zeros((2, 2), dtype=numpy.float32)
    correction_matrix[0] = sol[0:2, 0]
    correction_matrix[1] = sol[3:5, 0]
    matrix_for_gpu = correction_matrix.reshape(4, 1)  # for float4 struct
    offset_value[0] = sol[2, 0]
    offset_value[1] = sol[5, 0]
    wg = 8, 8
    shape = calc_size((output_width, output_height), wg)
    gpu_image = pyopencl.array.to_device(queue, image2)
    gpu_output = pyopencl.array.empty(queue, (output_height, output_width), dtype=numpy.float32, order="C")
    gpu_matrix = pyopencl.array.to_device(queue, matrix_for_gpu)
    gpu_offset = pyopencl.array.to_device(queue, offset_value)
    image_height, image_width = numpy.int32((image_height, image_width))
    output_height, output_width = numpy.int32((output_height, output_width))
    t0 = time.time()
    k1 = self.program.transform(queue, shape, wg,
                                gpu_image.data, gpu_output.data,
                                gpu_matrix.data, gpu_offset.data,
                                image_width, image_height,
                                output_width, output_height,
                                fill_value, mode)
    res = gpu_output.get()
    t1 = time.time()
    # print(res[0, 0])
    ref = scipy.ndimage.interpolation.affine_transform(image2, correction_matrix,
                                                       offset=offset_value,
                                                       output_shape=(output_height, output_width),
                                                       order=1, mode="constant", cval=fill_value)
    t2 = time.time()
    delta = abs(res - image)
    delta_arg = delta.argmax()
    delta_max = delta.max()
    # delta_mse_res = ((res - image) ** 2).sum() / image.size
    # delta_mse_ref = ((ref - image) ** 2).sum() / image.size
    at_0, at_1 = delta_arg // output_width, delta_arg % output_width
    print("Max error: %f at (%d, %d)" % (delta_max, at_0, at_1))
    # print("Mean Squared Error Res/Original: %f" % delta_mse_res)
    # print("Mean Squared Error Ref/Original: %f" % delta_mse_ref)
    print("minimal MSE according to least squares: %f" % MSE)
    # print(res[at_0, at_1])
    # print(ref[at_0, at_1])
    SHOW_FIGURES = True
    if SHOW_FIGURES:
        fig = pylab.figure()
        sp1 = fig.add_subplot(221, title="Input image")
        sp1.imshow(image, interpolation="nearest")
        sp2 = fig.add_subplot(222, title="Image after deformation")
        sp2.imshow(image2, interpolation="nearest")
        sp3 = fig.add_subplot(223, title="Corrected image (OpenCL)")
        sp3.imshow(res, interpolation="nearest")
        sp4 = fig.add_subplot(224, title="Corrected image (Scipy)")
        sp4.imshow(ref, interpolation="nearest")
        # sp5 = fig.add_subplot(223, title="delta (max = %f)" % delta_max)
        # sh5 = sp5.imshow(delta[:, :], interpolation="nearest")
        # cbar = fig.colorbar(sh5)
        fig.show()
        raw_input("enter")
    if PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Transformation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
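
# Hedged sketch (assumption): `matching_correction` solves for the 6 affine
# parameters mapping keypoints of `image` onto keypoints of `image2` by least
# squares. The layout of `sol` above (rows 0:2 and 3:5 forming the 2x2 matrix,
# rows 2 and 5 the offset) suggests a system like the following, built from N
# matched coordinate pairs:
def affine_lstsq_sketch(pts_src, pts_dst):
    """pts_src, pts_dst: (N, 2) arrays of matched (row, col) coordinates.
    Returns the (6, 1) parameter vector and the residual mean squared error."""
    N = pts_src.shape[0]
    A = numpy.zeros((2 * N, 6))
    A[0::2, 0:2] = pts_src   # rows for the first output coordinate
    A[0::2, 2] = 1.0
    A[1::2, 3:5] = pts_src   # rows for the second output coordinate
    A[1::2, 5] = 1.0
    b = pts_dst.reshape(2 * N, 1)
    sol, residues, rank, sv = numpy.linalg.lstsq(A, b)
    mse = ((A.dot(sol) - b) ** 2).mean()
    return sol, mse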