def task1(d_image_complex, d_response_complex):
    """Apply a filter to an image in place via CUDA FFT convolution.

    The work is done in three steps, all on the arrays passed in:

    1. forward-transform both arrays with ``fft_inplace``;
    2. multiply them elementwise with ``vmult``, writing the product over
       ``d_image_complex`` through the ``out`` keyword;
    3. inverse-transform ``d_image_complex`` with ``ifft_inplace``.

    Args:
        d_image_complex: complex image array; overwritten with the
            filtered result.
        d_response_complex: complex filter response, the same length as
            ``d_image_complex``. It is also handed to ``fft_inplace``, so
            presumably it holds its own transform afterwards rather than
            its original values.

    Returns:
        None. The filtered image is left in ``d_image_complex``.
    """
    # Move both operands into the frequency domain.
    for d_ary in (d_image_complex, d_response_complex):
        fft_inplace(d_ary)

    # Pointwise product in the frequency domain, stored over the image.
    vmult(d_image_complex, d_response_complex, out=d_image_complex)

    # Back to the spatial domain; the filter is now applied to the image.
    ifft_inplace(d_image_complex)
def cuFFT_v2(ary, out=None, stream=0):
    """Forward FFT of ``ary`` with ``np.fft.fftshift`` applied before and after.

    The input is shifted with ``np.fft.fftshift``, transformed through the
    ``fft`` module (``fft.fft`` or ``fft.fft_inplace``), and the spectrum is
    shifted again with ``np.fft.fftshift``.

    Args:
        ary: input array. It is converted to ``numpy.complex64`` when needed
            and is never modified by this function.
        out: optional ``numpy.complex64`` array that receives the result.
            When given, the shifted spectrum is written into it and the same
            array is returned. Previously only the return value was shifted
            while ``out`` kept the unshifted transform.
        stream: stream argument forwarded unchanged to the ``fft`` call.

    Returns:
        The shifted spectrum as ``numpy.complex64``: ``out`` itself when it
        was supplied, otherwise a newly allocated array.

    Raises:
        TypeError: if ``out`` is supplied and its dtype is not
            ``numpy.complex64``. ``TypeError`` derives from ``Exception``,
            so handlers written for the earlier bare ``Exception`` still
            catch it.
    """
    if out is None:
        if ary.dtype.type is not np.complex64:
            ary = ary.astype(np.complex64)
        # fftshift returns a new array, so the in-place transform below
        # never touches the caller's input.
        spectrum = np.fft.fftshift(ary)
        fft.fft_inplace(spectrum, stream=stream)
        return np.fft.fftshift(spectrum)

    # Validate the destination before doing any work.
    if out.dtype.type is not np.complex64:
        raise TypeError('Output must be type numpy.complex64')

    fft.fft(np.fft.fftshift(ary.astype(np.complex64)), out=out, stream=stream)
    # fftshift does not operate in place: copy the shifted data back so the
    # caller's ``out`` and the return value agree.
    out[...] = np.fft.fftshift(out)
    return out
def main():
    """Convolve an image with a 5x5 filter on CPU and GPU, then plot both.

    The image is read from the path in ``sys.argv[1]``; when no argument is
    given, ``misc.face(gray=True)`` is used instead. The CPU result comes
    from ``fftconvolve(..., mode="same")``. The GPU result comes from
    forward FFTs of the image and of the zero-padded filter, an elementwise
    product via ``vmult``, and an inverse FFT. Wall-clock times for both
    paths are printed and the two outputs are shown side by side.
    """
    # Build the 5x5 filter.
    kernel_text = """
        -4 -1  0 -1 -4
        -1  2  3  2 -1
         0  3  4  3  0
        -1  2  3  2 -1
        -4 -1  0 -1 -4
    """
    laplacian = np.array(kernel_text.split(), dtype=np.float32).reshape(5, 5)

    # Build the image.
    # NOTE(review): if ndimage/misc are scipy.ndimage/scipy.misc, imread and
    # face are no longer available in recent SciPy releases - confirm the
    # SciPy version this script is expected to run against.
    try:
        filename = sys.argv[1]
        image = ndimage.imread(filename, flatten=True).astype(np.float32)
    except IndexError:
        image = misc.face(gray=True).astype(np.float32)

    print("Image size: %s" % (image.shape,))

    # Filter response padded with zeros to the image size, with the kernel
    # placed in the top-left corner.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU reference.
    cpu_start = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode="same")
    cpu_stop = timer()
    print("CPU: %.2fs" % (cpu_stop - cpu_start))

    # GPU launch configuration (only reported here).
    threadperblock = (32, 8)
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print("kernel config: %s x %s" % (blockpergrid, threadperblock))

    # Create a plan up front to trigger cuFFT initialization, which takes
    # significant time for a small dataset and should stay out of the timing.
    FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Timed GPU section: host conversion, transfer, FFT convolution, copy back.
    gpu_start = timer()
    d_image_complex = cuda.to_device(image.astype(np.complex64))
    d_response_complex = cuda.to_device(response.astype(np.complex64))

    for d_ary in (d_image_complex, d_response_complex):
        fft_inplace(d_ary)
    vmult(d_image_complex, d_response_complex, out=d_image_complex)
    ifft_inplace(d_image_complex)

    # Keep the real part and divide by the number of elements (presumably
    # because the inverse transform is unnormalized - confirm).
    cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)
    gpu_stop = timer()
    print("GPU: %.2fs" % (gpu_stop - gpu_start))

    # Plot the results side by side.
    for slot, (label, picture) in enumerate(
        (("CPU", cvimage_cpu), ("GPU", cvimage_gpu)), start=1
    ):
        plt.subplot(1, 2, slot)
        plt.title(label)
        plt.imshow(picture, cmap=plt.cm.gray)
        plt.axis("off")
    plt.show()
def main():
    """Time a 5x5 filter convolution on CPU and GPU and display both images.

    Input image: the file named by ``sys.argv[1]``, or
    ``misc.face(gray=True)`` when no command-line argument is present.

    CPU path: ``fftconvolve(image, laplacian, mode='same')``.
    GPU path: in-place forward FFTs of the image and of the zero-padded
    filter, elementwise multiplication through ``vmult``, then an in-place
    inverse FFT. Elapsed times are printed for each path and the outputs
    are plotted next to each other.
    """
    # --- Filter -----------------------------------------------------------
    coefficients = '''
        -4 -1  0 -1 -4
        -1  2  3  2 -1
         0  3  4  3  0
        -1  2  3  2 -1
        -4 -1  0 -1 -4
    '''.split()
    laplacian = np.array(coefficients, dtype=np.float32).reshape(5, 5)

    # --- Image ------------------------------------------------------------
    # NOTE(review): if ndimage/misc are scipy.ndimage/scipy.misc, imread and
    # face are no longer available in recent SciPy releases - confirm the
    # SciPy version this script is expected to run against.
    try:
        filename = sys.argv[1]
        image = ndimage.imread(filename, flatten=True).astype(np.float32)
    except IndexError:
        image = misc.face(gray=True).astype(np.float32)
    shape = image.shape
    print("Image size: %s" % (shape, ))

    # Zero array shaped like the image with the filter in its top-left corner.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # --- CPU --------------------------------------------------------------
    t_begin = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    t_end = timer()
    print('CPU: %.2fs' % (t_end - t_begin))

    # --- GPU --------------------------------------------------------------
    threadperblock = 32, 8
    reversed_shape = tuple(reversed(shape))
    blockpergrid = best_grid_size(reversed_shape, threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Building a plan here triggers cuFFT initialization, which is slow for
    # a small dataset; doing it before the timer keeps it out of the figure.
    FFTPlan(shape=shape, itype=np.complex64, otype=np.complex64)

    t_begin = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    d_image_complex = cuda.to_device(image_complex)
    d_response_complex = cuda.to_device(response_complex)

    fft_inplace(d_image_complex)
    fft_inplace(d_response_complex)
    vmult(d_image_complex, d_response_complex, out=d_image_complex)
    ifft_inplace(d_image_complex)

    # Real part divided by the element count (presumably compensating for an
    # unnormalized inverse transform - confirm).
    host_result = d_image_complex.copy_to_host()
    cvimage_gpu = host_result.real / np.prod(shape)
    t_end = timer()
    print('GPU: %.2fs' % (t_end - t_begin))

    # --- Plot -------------------------------------------------------------
    panels = (('CPU', cvimage_cpu), ('GPU', cvimage_gpu))
    for position, (caption, pixels) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.title(caption)
        plt.imshow(pixels, cmap=plt.cm.gray)
        plt.axis('off')
    plt.show()