Example #1
def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10,26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            lc_kernel.prepared_call(x._grid, x._block,
                a.gpudata, x.gpudata,
                b.gpudata, y.gpudata,
                z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
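Example #1 (and its reformatted twin below) relies on a helper, get_lin_comb_kernel_no_tex, that is not shown in this listing. As a rough sketch of the same z = a*x + b*y technique, assuming nothing about the original helper, an equivalent kernel can be built with pycuda.elementwise.ElementwiseKernel:

import numpy
import pycuda.autoinit  # noqa: F401  (creates a CUDA context on import)
import pycuda.gpuarray as gpuarray
from pycuda.curandom import rand
from pycuda.elementwise import ElementwiseKernel

# z[i] = a*x[i] + b*y[i], with a and b passed as plain float scalars
lin_comb = ElementwiseKernel(
    "float a, float *x, float b, float *y, float *z",
    "z[i] = a*x[i] + b*y[i]",
    "linear_combination")

x = rand(1 << 20, dtype=numpy.float32)
y = rand(1 << 20, dtype=numpy.float32)
z = gpuarray.empty_like(x)
lin_comb(numpy.float32(5), x, numpy.float32(7), y, z)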
Example #2
def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex(
        ((True, dtype, dtype), (True, dtype, dtype)), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            lc_kernel.prepared_call(x._grid, x._block, a.gpudata, x.gpudata,
                                    b.gpudata, y.gpudata, z.gpudata,
                                    x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Example #3
 def run(self):
     print('Starting grid: \n', self.grid_gpu)
     i = 0
     while i < N_ITERS:
         self.diffusion(
             # input
             self.grid_gpu,
             # output
             self.new_grid,
             # random numbers
             self.randoms,
             # x coordinates
             self.random_x_coordinates,
             # y coordinates
             self.random_y_coordinates,
             # grid of n_blocks x n_blocks
             grid=(self.n_blocks, self.n_blocks, 1),
             # block of n_threads x n_threads
             block=(self.n_threads, self.n_threads, 1),
         )
         self.grid_gpu, self.new_grid = self.new_grid, self.grid_gpu
         self.randoms = curandom.rand((self.size, self.size))
         self.random_x_coordinates = ((curandom.rand(
             (self.size, self.size))) * self.size).astype(np.int32)
         self.random_y_coordinates = ((curandom.rand(
             (self.size, self.size))) * self.size).astype(np.int32)
         i += 1
         print('\nGrid after iteration {}: \n{}'.format(i, self.grid_gpu))
     print('\nFinal grid: \n', self.grid_gpu)
Example #4
def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block,
                x.gpudata, y.gpudata, z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Example #5
def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel(
        ((True, dtype, dtype), (True, dtype, dtype)), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block, x.gpudata, y.gpudata,
                                    z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Example #6
    def run(self, size):
        import numpy as np
        from pycuda import curandom

        a = curandom.rand(size, dtype=np.float64)
        b = curandom.rand(size, dtype=np.float64)

        with CUDATimer() as timer:
            self.op(a, b)

        return timer.elapsed_time()
Example #7
  def run(self, size):
    import numpy as np
    from pycuda import curandom

    a = curandom.rand(size, dtype = np.float64)
    b = curandom.rand(size, dtype = np.float64)

    with CUDATimer() as timer:
      self.op(a, b)

    return timer.elapsed_time()
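The two run() methods above use a CUDATimer context manager that this listing does not define. A minimal sketch of such a timer, assuming it simply wraps CUDA events (the real class in the source project may differ):

import pycuda.autoinit  # noqa: F401  (creates a CUDA context on import)
import pycuda.driver as drv

class CUDATimer:
    """Times the enclosed GPU work with CUDA events; result in milliseconds."""

    def __enter__(self):
        self._start = drv.Event()
        self._stop = drv.Event()
        self._start.record()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._stop.record()
        self._stop.synchronize()  # block until the GPU reaches the stop event
        return False

    def elapsed_time(self):
        # time_till() returns the time between the two recorded events in ms
        return self._start.time_till(self._stop)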
Example #8
def swipe():
    randomNumbers_d = curandom.rand((nData))
    stepNumber = np.int32(0)
    #saveEnergy = np.int32(0)
    tex_spins.set_array(spinsInArray_d)
    isingKernel(stepNumber,
                np.int32(nWidth),
                np.int32(nHeight),
                beta,
                spinsOut_d,
                randomNumbers_d,
                grid=grid2D_ising,
                block=block2D)
    copy2D_dtod(aligned=True)

    stepNumber = np.int32(1)
    #saveEnergy = np.int32(0)
    tex_spins.set_array(spinsInArray_d)
    isingKernel(stepNumber,
                np.int32(nWidth),
                np.int32(nHeight),
                beta,
                spinsOut_d,
                randomNumbers_d,
                grid=grid2D_ising,
                block=block2D)
    copy2D_dtod(aligned=True)
Example #9
def test_gpuarray_to_garray():

    x = curnd.rand((3,3), dtype=np.float32)
    x = x + 2
    gx = common.gpu.gpuarray_to_garray(x)

    print "x:"
    print x

    print "gpuarray_to_garray(x):"
    print gx
Example #10
def run_benchmark():
    from pycuda.curandom import rand

    powers = numpy.arange(10, 13, 2**(-6))
    sizes = [int(size) for size in numpy.unique(2**powers // 16 * 16)]
    bandwidths = []
    times = []

    for size in sizes:

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2

        for i in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for i in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        elapsed_seconds = stop.time_since(start) * 1e-3
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print("Sizes for which bandwidth was low:", slow_sizes)
    print("Ditto, mod 64:", [s % 64 for s in slow_sizes])
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.xlabel('matrix size')
    plt.ylabel('bandwidth')
    plt.semilogx(sizes, bandwidths)
    plt.savefig("transpose-bw.png")
    plt.clf()
    plt.xlabel('matrix size')
    plt.ylabel('time')
    plt.loglog(sizes, times)
    plt.savefig("transpose-times.png")
Example #11
def run_benchmark():
    from pycuda.curandom import rand

    powers = numpy.arange(10, 13, 2**(-6))
    sizes = [int(size) for size in numpy.unique(2**powers // 16 * 16)]
    bandwidths = []
    times = []

    for size in sizes:

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2

        for i in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for i in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        elapsed_seconds = stop.time_since(start)*1e-3
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print("Sizes for which bandwidth was low:", slow_sizes)
    print("Ditto, mod 64:", [s % 64 for s in slow_sizes])
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.xlabel('matrix size')
    plt.ylabel('bandwidth')
    plt.semilogx(sizes, bandwidths)
    plt.savefig("transpose-bw.png")
    plt.clf()
    plt.xlabel('matrix size')
    plt.ylabel('time')
    plt.loglog(sizes, times)
    plt.savefig("transpose-times.png")
Example #12
def swipe():
  randomNumbers_d = curandom.rand((nData))
  stepNumber = np.int32(0)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), beta, 
	       spinsOut_d, randomNumbers_d, grid=grid2D_ising, block=block2D )
  copy2D_dtod(aligned=True) 

  stepNumber = np.int32(1)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), beta,
	       spinsOut_d, randomNumbers_d, grid=grid2D_ising, block=block2D )
  copy2D_dtod(aligned=True)
Example #13
def run_benchmark():
    from pycuda.curandom import rand

    sizes = []
    bandwidths = []
    times = []
    for i in numpy.arange(10, 13, 2**(-6)):
        size = int(((2**i) // 16) * 16)

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2

        for i in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for i in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        elapsed_seconds = stop.time_since(start) * 1e-3
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        sizes.append(size)
        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print(slow_sizes)
    print([s % 64 for s in slow_sizes])
    from matplotlib.pyplot import semilogx, loglog, show, savefig, clf
    semilogx(sizes, bandwidths)
    savefig("transpose-bw.png")
    clf()
    loglog(sizes, times)
    savefig("transpose-times.png")
Example #14
def swipe():
  randomNumbers_d = curandom.rand((nData))
  stepNumber = np.int32(0)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  surf_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), beta, 
	       spinsOut_d, randomNumbers_d, 
	       plotDataFloat_d, np.float32(upVal), np.float32(downVal), grid=grid3D_ising, block=block3D )
  #copy3D_dtod() 

  stepNumber = np.int32(1)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  surf_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), beta,
	       spinsOut_d, randomNumbers_d, 
	       plotDataFloat_d, np.float32(upVal), np.float32(downVal), grid=grid3D_ising, block=block3D )
Example #15
def check_transpose():
    from pycuda.curandom import rand

    for i in numpy.arange(10, 13, 0.125):
        size = int(((2**i) // 32) * 32)
        print(size)

        source = rand((size, size), dtype=numpy.float32)

        result = transpose(source)

        err = source.get().T - result.get()
        err_norm = la.norm(err)

        source.gpudata.free()
        result.gpudata.free()

        assert err_norm == 0, (size, err_norm)
Example #16
def swipe():
  randomNumbers_d = curandom.rand((nData))
  stepNumber = np.int32(0)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  surf_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), beta, 
	       spinsOut_d, randomNumbers_d, 
	       plotDataFloat_d, np.float32(upVal), np.float32(downVal), grid=grid3D_ising, block=block3D )
  #copy3D_dtod() 

  stepNumber = np.int32(1)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  surf_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), beta,
	       spinsOut_d, randomNumbers_d, 
	       plotDataFloat_d, np.float32(upVal), np.float32(downVal), grid=grid3D_ising, block=block3D )
Example #17
def check_transpose():
    from pycuda.curandom import rand

    for i in numpy.arange(10, 13, 0.125):
        size = int(((2**i) // 32) * 32)
        print(size)

        source = rand((size, size), dtype=numpy.float32)

        result = transpose(source)

        err = source.get().T - result.get()
        err_norm = la.norm(err)

        source.gpudata.free()
        result.gpudata.free()

        assert err_norm == 0, (size, err_norm)
Example #18
    def initialisation(self, x_init):

        y  = curand.rand(x_init.shape)
        y  -= x_init + 0.5 # this is only a fix, remove this line if possible
        if self.options.compute_both:
            fx, gx = self.objective.compute_both(x_init)
            fy, gy = self.objective.compute_both(y)
            
            if fx < fy:
                self.x      = x_init
                self.oldx   = y
                self.g      = gx
                self.oldg   = gy
                self.obj    = fx
                self.oldobj = fy
              
            else:
                self.x      = y
                self.oldx   = x_init
                self.g      = gy
                self.oldg   = gx
                self.obj    = fy
                self.oldobj = fx
        else:
            fx = self.objective.compute_obj(x_init)
            fy = self.objective.compute_obj(y)
    
            if fx < fy:
                self.x      = x_init
                self.oldx   = y
                self.g      = self.objective.compute_grad(x_init)
                self.oldg   = self.objective.compute_grad(y)
                self.obj    = fx
                self.oldobj = fy
                
            else:
                self.x      = y
                self.oldx   = x_init
                self.g      = self.objective.compute_grad(y)
                self.oldg   = self.objective.compute_grad(x_init)
                self.obj    = fy
                self.oldobj = fx
Example #19
    def initialisation(self, x_init):

        y = curand.rand(x_init.shape)
        y -= x_init + 0.5  # this is only a fix, remove this line if possible
        if self.options.compute_both:
            fx, gx = self.objective.compute_both(x_init)
            fy, gy = self.objective.compute_both(y)

            if fx < fy:
                self.x = x_init
                self.oldx = y
                self.g = gx
                self.oldg = gy
                self.obj = fx
                self.oldobj = fy

            else:
                self.x = y
                self.oldx = x_init
                self.g = gy
                self.oldg = gx
                self.obj = fy
                self.oldobj = fx
        else:
            fx = self.objective.compute_obj(x_init)
            fy = self.objective.compute_obj(y)

            if fx < fy:
                self.x = x_init
                self.oldx = y
                self.g = self.objective.compute_grad(x_init)
                self.oldg = self.objective.compute_grad(y)
                self.obj = fx
                self.oldobj = fy

            else:
                self.x = y
                self.oldx = x_init
                self.g = self.objective.compute_grad(y)
                self.oldg = self.objective.compute_grad(x_init)
                self.obj = fy
                self.oldobj = fx
Example #20
def run_benchmark():
    from pycuda.curandom import rand

    sizes = []
    bandwidths = []
    times = []
    for i in numpy.arange(10, 13, 2**(-6)):
        size = int(((2**i) // 16) * 16)

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2

        for i in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for i in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        elapsed_seconds = stop.time_since(start)*1e-3
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        sizes.append(size)
        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print(slow_sizes)
    print([s % 64 for s in slow_sizes])
Example #21
def replot():
  global xMin, xMax, yMin, yMax
  global jMin, jMax, iMin, iMax
  global random_d
  jMin, jMax = animation2D.jMin, animation2D.jMax
  iMin, iMax = animation2D.iMin, animation2D.iMax
  xMin += (xMax-xMin)*(float(jMin)/nWidth)
  xMax -= (xMax-xMin)*(float(nWidth-jMax)/nWidth)
  yMin += (yMax-yMin)*(float(iMin)/nHeight)
  yMax -= (yMax-yMin)*(float(nHeight-iMax)/nHeight)
  print "Reploting: ( {0} , {1} , {2} , {3} )".format(xMin, xMax, yMin, yMax)
  start, end = cuda.Event(), cuda.Event()
  start.record()
  random_d = curandom.rand((nData), dtype=npPrcsn)
  mappingLogisticKernel( np.int32(nWidth), np.int32(nHeight), npPrcsn(xMin), npPrcsn(xMax), npPrcsn(yMin), npPrcsn(yMax), random_d, graphPoints_d, grid=mapGrid, block=mapBlock )
  normalize( graphPoints_d )
  end.record()
  end.synchronize()
  print " Map Calculated in: %f secs\n" %( start.time_till(end)*1e-3)
  animation2D.windowTitle = "ploting [ ( {0} , {1} ), ( {2} , {3} ) ]".format(xMin, xMax, yMin, yMax)
  animation2D.jMin, animation2D.jMax = 10000, -1
  animation2D.iMin, animation2D.iMax = 10000, -1
  maskFunc()
Example #22
def run_tests(timer, scale_factor):
    """PyCUDA port of time_test3.pro"""
    #nofileio = True
    
    # Initialize linear algebra extensions to PyCUDA
    scikits.cuda.linalg.init()

    #initialize time
    timer.reset()   

    #
    # khughitt (2011/04/04): Non-CUDA tests from above will go here...
    #
    
    #
    # Begin CUDA tests
    #
    siz = int(384 * math.sqrt(scale_factor))

    # a = curandom.rand((siz,siz), dtype=np.int32)
    a = curandom.rand((siz,siz))

    timer.reset()

    #Test 17 - Transpose byte array, TRANSPOSE function
    for i in range(100):
        b = scikits.cuda.linalg.transpose(a, pycuda.autoinit.device)
    timer.log('Transpose %d^2 byte, TRANSPOSE function x 100' % siz)
    
    n = 2**(17 * scale_factor)
    a  = gpuarray.arange(n, dtype=np.float32)
    timer.reset()
    
    #Test 20 - Forward and inverse FFT
    b = scikits.cuda.fft.fft(a)
    b = scikits.cuda.fft.ifft(b)
    timer.log('%d point forward plus inverse FFT' % n)
Example #23
def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []
    
    for power in range(10, 25): # 24
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)

        if power > 20:
            count = 100
        else:
            count = 1000

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cuda operation which fills the array with random numbers
        for i in range(count):
            curandom.rand((size, ))
            
        #stop timer
        end.record()
        end.synchronize()
        
        #calculate used time
        secs = start.time_till(end)*1e-3

        times.append(secs/count)
        flops.append(size)

        #cpu operation which fills the array with random data
        a = numpy.array((size,), dtype=numpy.float32)

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cpu operation which fills the array with random data        
        for i in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        #stop timer
        end.record()
        end.synchronize()
        
        #calculate used time
        secs = start.time_till(end)*1e-3

        #add results to variable
        timesCPU.append(secs/count)
        flopsCPU.append(size)
            
            
    #calculate pseudo flops
    flops = [f/t for f, t in zip(flops,times)]
    flopsCPU = [f/t for f, t in zip(flopsCPU,timesCPU)]

    #print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU","Size/Time CPU","GPU vs CPU speedup"))
    for s, t, f,tCpu,fCpu in zip(sizes, times, flops,timesCPU,flopsCPU):
        tbl.add_row((s,t,f,tCpu,fCpu,f/fCpu))
    print(tbl)
Example #24
                    dt = dt1
            del gpyfft_plan
        gbps = d.nbytes * nb * ndim * 2 * 2 / dt / 1024**3
        #print("%4d %4dx%4d 2D FFT+iFFT dt=%6.2f ms %7.2f Gbytes/s [gpyfft[clFFT]]  [nb=%4d]" %
        #      (nz, n, n, dt / nb * 1000, gbps, nb))
        results["gpyfft[clFFT]"].append(gbps)
        results["gpyfft[clFFT]-dt"].append(dt)

    if has_pyvkfft_opencl or has_gpyfft:
        d.data.release()
        del d
        gc.collect()

    # CUDA backends
    if has_pyvkfft_cuda:
        d = curandom.rand(shape=sh, dtype=np.float32).astype(dtype)

    if has_pyvkfft_cuda:
        dt = 0
        try:
            app = cuVkFFTApp(d.shape, d.dtype, ndim=ndim)
            for i in range(nb_repeat):
                cu_ctx.synchronize()
                t0 = timeit.default_timer()
                for i in range(nb):
                    d = app.ifft(d)
                    d = app.fft(d)
                cu_ctx.synchronize()
                dt1 = timeit.default_timer() - t0
                if dt == 0:
                    dt = dt1
Example #25
    def initialize_kernel(self):
        self.kernel_code = """

			// Ignore edge rows and columns
			// Assuming the matrix is large, the effect of this is small
			__global__ void diffuse(float* grid, float* new_grid, float* randoms)
			{{

				unsigned int grid_size = {};
				float prob = {};

				unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;			// column element of index
				unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;			// row element of index
				unsigned int thread_id = y * grid_size + x; 					// thread index in array

				unsigned int edge = (x == 0) || (x == grid_size - 1) || (y == 0) || (y == grid_size - 1);

				if (grid[thread_id] == 1) {{
					new_grid[thread_id] = 1;									// current cell
					if (!edge) {{
						/*
						if (randoms[thread_id - grid_size] < prob) {{
							new_grid[thread_id - grid_size] = 1;				// above
						}}
						if (randoms[thread_id - grid_size - 1] < prob) {{
							new_grid[thread_id - grid_size - 1] = 1;			// above and left
						}}
						if (randoms[thread_id - grid_size + 1] < prob) {{
							new_grid[thread_id - grid_size + 1] = 1;			// above and right
						}}
						if (randoms[thread_id + grid_size] < prob) {{
							new_grid[thread_id + grid_size] = 1;				// below
						}}
						*/
						if (randoms[thread_id + grid_size - 1] < prob) {{
							new_grid[thread_id + grid_size - 1] = 1;			// below and left
						}}
						if (randoms[thread_id + grid_size + 1] < prob) {{
							new_grid[thread_id + grid_size + 1] = 1;			// below and right
						}}
						if (randoms[thread_id - 1] < prob) {{
							new_grid[thread_id - 1] = 1;						// left
						}}
						if (randoms[thread_id + 1] < prob) {{
							new_grid[thread_id + 1] = 1;						// right
						}}
					}}
				}}
			}}
		"""

        # Transfer CPU memory to GPU memory
        self.grid_gpu = gpuarray.to_gpu(self.grid)
        self.new_grid = gpuarray.empty((self.size, self.size), np.float32)

        self.kernel = self.kernel_code.format(self.size, self.prob)

        # Compile kernel code
        self.mod = SourceModule(self.kernel)

        # Get kernel function from compiled module
        self.diffusion = self.mod.get_function('diffuse')

        # random numbers indicating probability of diffusion to a given cell
        self.randoms = curandom.rand((self.size, self.size))
Example #26
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import time

import pycuda.gpuarray as gpuarray
import pycuda.curandom as curandom

n = 16 * 1024 * 1204
U1 = curandom.rand(n)
U2 = curandom.rand(n)
counter = gpuarray.zeros(n, dtype='f')

start_time = time.time()

counter = gpuarray.sum((U1 * U1 + U2 * U2) <= 1.0)

print "PI_gpu = ", 4.0 * counter / n
print "Time elapsed GPUArrays: ", time.time() - start_time, "s"

# Sequential part

U1 = numpy.random.rand(n).astype('f')
U2 = numpy.random.rand(n).astype('f')

start_time = time.time()

counter_cpu = numpy.sum((numpy.power(U1, 2) + numpy.power(U2, 2)) <= 1.0)

print "PI_cpu = ", 4.0 * counter_cpu / n
print "Time elapsed CPU: ", time.time() - start_time, "s"
Example #27
    def initialize_kernel(self):
        self.kernel_code = """

			#include <stdlib.h>
			#include <math.h>

			// Ignore edge rows and columns
			__global__ void diffuse(float* grid, float* new_grid, float* randoms, int* x_coords, int* y_coords)
			{{

				unsigned int grid_size = {};
				float prob = {};

				unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;				// column element of index
				unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;				// row element of index
				unsigned int thread_id = y * grid_size + x; 						// thread index in array

				if (grid[thread_id] == 1) {{
					new_grid[thread_id] = 1;										// current cell
					if (randoms[thread_id] < prob) {{

						// row and col before distance decay
						unsigned int random_x = x_coords[thread_id];
						unsigned int random_y = y_coords[thread_id];

						float diff = prob - randoms[thread_id];

						// distance decay occurring in x and y directions
						// amount of decay dictated by random coordinate, diffusion threshold, and random value
						float decay_x = floor(abs(((float)random_x - x) / prob * diff));
						float decay_y = floor(abs(((float)random_y - y) / prob * diff));

						// apply decay in appropriate direction
						unsigned int spread_x = random_x;
						if (random_x > x) {{
							spread_x -= decay_x;
						}}
						else if (random_x < x) {{
							spread_x += decay_x;
						}}

						// apply decay in appropriate direction
						unsigned int spread_y = random_y;
						if (random_y > y) {{
							spread_y -= decay_y;
						}}
						else if (random_y < y) {{
							spread_y += decay_y;
						}}

						/*
						printf("Initial y: %u\\t"
							"Inintial x: %u\\t"
							"Random y: %u\\t"
							"Random x: %u\\t"
							"Y decay: %f\\t"
							"Decay x: %f\\t"
							"New y: %u\\t"
							"New x: %u\\n",
							y, x, random_y, random_x, decay_y, decay_x, spread_y, spread_x);
						*/

						unsigned int spread_index = spread_y * grid_size + spread_x;
						new_grid[spread_index] = 1;
					}}
				}}
			}}
		"""

        # Transfer CPU memory to GPU memory
        self.grid_gpu = gpuarray.to_gpu(self.grid)
        self.new_grid = gpuarray.empty((self.size, self.size), np.float32)

        self.kernel = self.kernel_code.format(self.size, self.prob)

        # Compile kernel code
        self.mod = SourceModule(self.kernel)

        # Get kernel function from compiled module
        self.diffusion = self.mod.get_function('diffuse')

        # random numbers indicating probability of diffusion to a given cell
        self.randoms = curandom.rand((self.size, self.size))
        self.random_x_coordinates = ((curandom.rand(
            (self.size, self.size))) * self.size).astype(np.int32)
        self.random_y_coordinates = ((curandom.rand(
            (self.size, self.size))) * self.size).astype(np.int32)
Example #28
def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size, ), dtype=numpy.float32)

        if power > 20:
            count = 100
        else:
            count = 1000

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cuda operation which fills the array with random numbers
        for i in range(count):
            curandom.rand((size, ))

        #stop timer
        end.record()
        end.synchronize()

        #calculate used time
        secs = start.time_till(end) * 1e-3

        times.append(secs / count)
        flops.append(size)

        #cpu operation which fills the array with random data
        a = numpy.array((size, ), dtype=numpy.float32)

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cpu operation which fills the array with random data
        for i in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        #stop timer
        end.record()
        end.synchronize()

        #calculate used time
        secs = start.time_till(end) * 1e-3

        #add results to variable
        timesCPU.append(secs / count)
        flopsCPU.append(size)

    #calculate pseudo flops
    flops = [f / t for f, t in zip(flops, times)]
    flopsCPU = [f / t for f, t in zip(flopsCPU, timesCPU)]

    #print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU",
                 "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f / fCpu))
    print(tbl)
Example #29
cudaCodeStringRaw = cudaCodeFile.read() 
cudaCodeString = (cudaCodeStringRaw %{"HEIGHT":mapBlock[0], "B_HEIGHT":block2D[1], "B_WIDTH":block2D[0] }).replace("cudaP", precision)
cudaCode = SourceModule(cudaCodeString)
mappingLogisticKernel = cudaCode.get_function('mappingLogistic_kernel')
maskKernel = cudaCode.get_function('mask_kernel')
plotKernel = cudaCode.get_function('plot_kernel')
########################################################################
from pycuda.elementwise import ElementwiseKernel
########################################################################
linearDouble = ElementwiseKernel(arguments="cudaP a, cudaP b, cudaP *input, cudaP *output".replace( 'cudaP', precision),
				operation = "output[i] = a*input[i] + b ")

#Initialize all gpu data
print "Initializing Data"
initialMemory = getFreeMemory( show=True )  
random_d = curandom.rand((nData), dtype=npPrcsn) 
graphPoints_d= gpuarray.to_gpu( np.zeros([nData], dtype=npPrcsn) ) 	
#For plotting
maskPoints_h = np.ones(nData).astype(np.int32)
maskPoints_d = gpuarray.to_gpu( maskPoints_h )
plotData_d = gpuarray.to_gpu( np.zeros([nData], dtype=npPrcsn) )
finalMemory = getFreeMemory( show=False )
print " Total Global Memory Used: {0} Mbytes".format(float(initialMemory-finalMemory)/1e6) 

def replot():
  global xMin, xMax, yMin, yMax
  global jMin, jMax, iMin, iMax
  global random_d
  jMin, jMax = animation2D.jMin, animation2D.jMax
  iMin, iMax = animation2D.iMin, animation2D.iMax
  xMin += (xMax-xMin)*(float(jMin)/nWidth)
Example #30
    def initialize_kernel(self):
        self.kernel_code = """

			// Ignore edge rows and columns
			__global__ void local_diffuse(float* grid, float* new_grid, float* randoms)
			{{

				unsigned int grid_size = {};
				float prob = {};

				unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;			// column element of index
				unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;			// row element of index
				unsigned int thread_id = y * grid_size + x; 					// thread index in array

				unsigned int edge = (x == 0) || (x == grid_size - 1) || (y == 0) || (y == grid_size - 1);

				if (grid[thread_id] == 1) {{
					new_grid[thread_id] = 1;									// current cell
					if (!edge) {{
						if (randoms[thread_id - grid_size] < prob) {{
							new_grid[thread_id - grid_size] = 1;				// above
						}}
						if (randoms[thread_id - grid_size - 1] < prob) {{
							new_grid[thread_id - grid_size - 1] = 1;			// above and left
						}}
						if (randoms[thread_id - grid_size + 1] < prob) {{
							new_grid[thread_id - grid_size + 1] = 1;			// above and right
						}}
						if (randoms[thread_id + grid_size] < prob) {{
							new_grid[thread_id + grid_size] = 1;				// below
						}}
						if (randoms[thread_id + grid_size - 1] < prob) {{
							new_grid[thread_id + grid_size - 1] = 1;			// below and left
						}}
						if (randoms[thread_id + grid_size + 1] < prob) {{
							new_grid[thread_id + grid_size + 1] = 1;			// below and right
						}}
						if (randoms[thread_id - 1] < prob) {{
							new_grid[thread_id - 1] = 1;						// left
						}}
						if (randoms[thread_id + 1] < prob) {{
							new_grid[thread_id + 1] = 1;						// right
						}}
					}}
				}}
			}}

			// Ignore edge rows and columns
			__global__ void non_local_diffuse(float* grid, float* new_grid, float* randoms, int* x_coords, int* y_coords)
			{{

				unsigned int grid_size = {};
				float prob = {};

				unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;			// column element of index
				unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;			// row element of index
				unsigned int thread_id = y * grid_size + x; 					// thread index in array

				if (grid[thread_id] == 1) {{
					new_grid[thread_id] = 1;									// current cell
					if (randoms[thread_id] < prob) {{
						unsigned int spread_index = y_coords[thread_id] * grid_size + x_coords[thread_id];
						new_grid[spread_index] = 1;
					}}
				}}
			}}
		"""

        # Below this will be in split
        # Split transfers data to GPU memory
        # Random not part of split

        # grid_a = initialize_grid(MATRIX_SIZE, BLOCK_SIZE, P_LOCAL, P_NON_LOCAL)
        # grid_b = empty grid
        # grid_a, grid_b <
        # local_diffuse(grid_a, grid_b)

        # Transfer CPU memory to GPU memory
        self.grid_gpu = gpuarray.to_gpu(self.grid)
        self.new_grid = gpuarray.empty((self.size, self.size), np.float32)

        self.kernel = self.kernel_code.format(self.size, self.p_local,
                                              self.size, self.p_non_local)

        # Compile kernel code
        self.mod = SourceModule(self.kernel)

        self.local_diffusion = self.mod.get_function('local_diffuse')
        self.non_local_diffusion = self.mod.get_function('non_local_diffuse')

        self.randoms = curandom.rand((self.size, self.size))
        self.random_x_coordinates = ((curandom.rand(
            (self.size, self.size))) * self.size).astype(np.int32)
        self.random_y_coordinates = ((curandom.rand(
            (self.size, self.size))) * self.size).astype(np.int32)
Example #31
'''
@Project: deep-learning-with-keras-notebooks
@Package 
@author: ly
@date Date: 2020-01-02 17:21
@Description: 
@URL: https://wiki.tiker.net/PyCuda/Examples/PlotRandomData
@version: V1.0
'''
import pycuda.autoinit
import pycuda.curandom as curandom

size = 5000
a = curandom.rand((size, )).get()

from matplotlib.pyplot import *
subplot(211)
plot(a)
grid(True)
ylabel('plot - gpu')

subplot(212)
hist(a, 100)
grid(True)
ylabel('Histogram - gpu')

show()
Example #32
# simple module to show the plotting of random data

import pycuda.autoinit
import pycuda.curandom as curandom

size = 1000
a = curandom.rand((size,)).get()

from matplotlib.pylab import *

subplot(211)
plot(a)
grid(True)
ylabel("plot - gpu")

subplot(212)
hist(a, 100)
grid(True)
ylabel("histogram - gpu")

# and save it
savefig("plot-random-data")
Example #33
        # Gradient check
        if (self.options.use_tolg):
            nr = cua.max(cua.fabs(self.grad)).get()
            if (nr < self.options.tolg):
                self.term_reason = '|| grad ||_inf < opt.tolg'
                return

        # No condition met, so return false
        self.term_reason = 0

if __name__ == '__main__':

    case = 2
    if case == 1:
        A = curand.rand((10000, 1000))
        xt = curand.rand((1000, 1))
        b = cua.dot(A, xt)

        x_init = cua.empty_like(xt)
        x_init.fill(0.1)

        # Set up objective
        objective = MVM_Objective(A, b)

        # Default optimization options
        opt = Solopt()

        pbb = PBB(objective, x_init, opt)

    elif case == 2:
Example #34
def random_normal(loc=0.0, scale=1.0, size=None):
    u1 = curandom.rand(size, dtype=numpy.float64)
    u2 = curandom.rand(size, dtype=numpy.float64)
    z1 = cumath.sqrt(-2.*cumath.log(u1))*cumath.cos(2.*numpy.pi*u2)
    return CUDAArray(scale*z1+loc)
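The random_normal() helper above wraps its result in a CUDAArray class that is not defined in this listing. A self-contained variant of the same Box-Muller transform, assumed here to return a plain GPUArray instead:

import numpy as np
import pycuda.autoinit  # noqa: F401
from pycuda import cumath, curandom

def gpu_random_normal(size, loc=0.0, scale=1.0):
    # Box-Muller: turn two uniform samples into one standard-normal sample
    u1 = curandom.rand(size, dtype=np.float64)
    u2 = curandom.rand(size, dtype=np.float64)
    z = cumath.sqrt(-2.0 * cumath.log(u1)) * cumath.cos(2.0 * np.pi * u2)
    return scale * z + loc

samples = gpu_random_normal(100000)
host = samples.get()
print(host.mean(), host.std())  # approximately 0.0 and 1.0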
Example #35
        if temp > 0.1: temp -= 0.1
    beta = np.float32(1. / temp)
    animation2D.windowTitle = "Ising Model 2D  spins={0}x{1}   T={2:.1f}".format(
        nHeight, nWidth, float(temp))


########################################################################
########################################################################
#Initialize all gpu data
print "\nInitializing Data"
initialMemory = getFreeMemory(show=True)
#Set initial random distribution
spins_h = (2 * np.random.random_integers(0, 1, [nHeight, nWidth]) - 1).astype(
    np.int32)
spinsOut_d = gpuarray.to_gpu(spins_h)
randomNumbers_d = curandom.rand((nData))
#For texture version
spinsInArray_d, copy2D_dtod = gpuArray2DtocudaArray(spinsOut_d)
#For shared version
finalMemory = getFreeMemory(show=False)
print " Total Global Memory Used: {0} Mbytes\n".format(
    float(initialMemory - finalMemory) / 1e6)
########################################################################
########################################################################

#configure animation2D functions and plotData
animation2D.stepFunc = stepFunction
animation2D.specialKeys = specialKeyboardFunc
animation2D.plotData_d = spinsOut_d
animation2D.maxVar = np.float32(2)
animation2D.minVar = np.float32(-20)
Example #36
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import pycuda.curandom as curandom
a = curandom.rand((5,3))
print('a:\n{0}\n{1}\n{2}\n'.format(a, a.dtype, type(a)))
b = curandom.seed_getter_uniform(5)
print('seed_getter_uniform:\n{0}\n{1}\n'.format(b, b.dtype))
c = curandom.seed_getter_unique(5)
print('seed_getter_unique:\n{0}\n{1}\n'.format(c, c.dtype))
generator = curandom.XORWOWRandomNumberGenerator(curandom.seed_getter_unique, 1000)
d = gpuarray.empty((5,3), dtype = 'float32')
generator.fill_uniform(d)
print('d:\n{0}\n{1}\n{2}\n'.format(d, d.dtype, type(d)))
e = generator.gen_uniform((5,3), dtype = 'float32')
print('e:\n{0}\n{1}\n{2}\n'.format(e, e.dtype, type(e)))



Example #37
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import time

import pycuda.gpuarray as gpuarray
import pycuda.curandom as curandom


n = 16*1024*1204
U1 = curandom.rand(n)
U2 = curandom.rand(n)
counter = gpuarray.zeros(n, dtype='f')

start_time = time.time()

counter = gpuarray.sum( (U1*U1 + U2*U2) <= 1.0 )

print "PI_gpu = ", 4.0*counter/n
print "Time elapsed GPUArrays: ", time.time() - start_time, "s"

# Sequential part

U1 = numpy.random.rand(n).astype('f')
U2 = numpy.random.rand(n).astype('f')

start_time = time.time()

counter_cpu = numpy.sum( (numpy.power(U1,2) + numpy.power(U2,2)) <= 1.0 )

print "PI_cpu = ", 4.0*counter_cpu/n
Example #38
  if key== animation2D.GLUT_KEY_UP:
    temp += 0.1
  if key== animation2D.GLUT_KEY_DOWN:
    if temp > 0.1: temp -= 0.1
  beta = np.float32(1./temp)
  animation2D.windowTitle = "Ising Model 2D  spins={0}x{1}   T={2:.1f}".format(nHeight, nWidth, float(temp))
  
########################################################################
########################################################################
#Initialize all gpu data
print "\nInitializing Data"
initialMemory = getFreeMemory( show=True )  
#Set initial random distribution
spins_h = (2*np.random.random_integers(0,1,[nHeight, nWidth]) - 1 ).astype(np.int32)
spinsOut_d = gpuarray.to_gpu( spins_h )
randomNumbers_d = curandom.rand((nData))
#For texture version
spinsInArray_d, copy2D_dtod = gpuArray2DtocudaArray( spinsOut_d )
#For shared version
finalMemory = getFreeMemory( show=False )
print " Total Global Memory Used: {0} Mbytes\n".format(float(initialMemory-finalMemory)/1e6) 
########################################################################
########################################################################


#configure animation2D functions and plotData
animation2D.stepFunc = stepFunction
animation2D.specialKeys = specialKeyboardFunc
animation2D.plotData_d = spinsOut_d
animation2D.maxVar = np.float32(2)
animation2D.minVar = np.float32(-20)
Example #39
import pycuda.autoinit
import pycuda.curandom as curandom
import matplotlib.pyplot as plt
import numpy as np

N = 1000
# --- Generating random data directly on the gpu
d_a = curandom.rand(N, dtype = np.float32, stream = None)

plt.plot(d_a.get())
plt.xlabel('Realization index')
plt.ylabel('Random numbers')

save_file = 0
if save_file:
    plt.savefig('test')
else:
    plt.show()
Example #40
         
        # Gradient check
        if (self.options.use_tolg):
            nr = cua.max(cua.fabs(self.grad)).get();
            if (nr < self.options.tolg):
                self.term_reason = '|| grad ||_inf < opt.tolg'
                return
         
        # No condition met, so return false
        self.term_reason = 0;        

if __name__ == '__main__':

    case = 2
    if case == 1:
        A  = curand.rand((10000,1000))
        xt = curand.rand((1000,1))
        b  = cua.dot(A, xt)
         
        x_init = cua.empty_like(xt)
        x_init.fill(0.1)
         
        # Set up objective
        objective = MVM_Objective(A,b)
         
        # Default optimization options
        opt = Solopt()
         
        pbb = PBB(objective, x_init, opt); 

    elif case == 2: