Code Example #1
File: GpuScalarMult.py  Project: sluo1989/computing
import numpy
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray


def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block,
                x.gpudata, y.gpudata, z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Code Example #2
File: GpuScalarMult.py  Project: sluo1989/computing
import numpy
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray


# get_lin_comb_kernel_no_tex is defined elsewhere in this project
def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10,26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            lc_kernel.prepared_call(x._grid, x._block,
                a.gpudata, x.gpudata,
                b.gpudata, y.gpudata,
                z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
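Both benchmarks time the same linear combination, z[i] = a*x[i] + b*y[i]; the first passes the scalars through textures, the second as plain kernel arguments. For reference, here is a minimal sketch of the same operation built with pycuda.elementwise.ElementwiseKernel (this kernel is an illustration, not part of the project above):

import numpy
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.curandom import rand
from pycuda.elementwise import ElementwiseKernel

# z[i] = a*x[i] + b*y[i], with the scalars passed directly as kernel arguments
lin_comb = ElementwiseKernel(
    "float a, float *x, float b, float *y, float *z",
    "z[i] = a*x[i] + b*y[i]",
    "lin_comb")

x = rand(1 << 20, dtype=numpy.float32)
y = rand(1 << 20, dtype=numpy.float32)
z = gpuarray.empty_like(x)
lin_comb(numpy.float32(5), x, numpy.float32(7), y, z)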
Code Example #3
  def run(self, size):
    import numpy as np
    from pycuda import curandom

    a = curandom.rand(size, dtype=np.float64)
    b = curandom.rand(size, dtype=np.float64)

    with CUDATimer() as timer:
      self.op(a, b)

    return timer.elapsed_time()
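CUDATimer is not defined in this snippet; below is a minimal sketch of a compatible context manager built on pycuda.driver events. The class name and its elapsed_time method are inferred from the usage above, so treat this as an assumption rather than the project's actual implementation:

import pycuda.driver as drv

class CUDATimer(object):
    # times GPU work between two CUDA events
    def __enter__(self):
        self.start = drv.Event()
        self.stop = drv.Event()
        self.start.record()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop.record()
        self.stop.synchronize()
        return False

    def elapsed_time(self):
        # Event.time_since reports milliseconds
        return self.stop.time_since(self.start)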
Code Example #4
File: test_common_gpu.py  Project: surban/ml
def test_gpuarray_to_garray():

    x = curnd.rand((3,3), dtype=np.float32)
    x = x + 2
    gx = common.gpu.gpuarray_to_garray(x)

    print "x:"
    print x

    print "gpuarray_to_garray(x):"
    print gx
Code Example #5
File: mmult.py  Project: brandonbm00/rapidpe_gpu
def run_benchmark():
    from pycuda.curandom import rand

    powers = numpy.arange(10, 13, 2**(-6))
    sizes = [int(size) for size in numpy.unique(2**powers // 16 * 16)]
    bandwidths = []
    times = []

    for size in sizes:

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2

        for i in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for i in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        elapsed_seconds = stop.time_since(start)*1e-3
        # each transpose reads and writes the full array: 2*nbytes per pass, count passes
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print("Sizes for which bandwidth was low:", slow_sizes)
    print("Ditto, mod 64:", [s % 64 for s in slow_sizes])
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.xlabel('matrix size')
    plt.ylabel('bandwidth')
    plt.semilogx(sizes, bandwidths)
    plt.savefig("transpose-bw.png")
    plt.clf()
    plt.xlabel('matrix size')
    plt.ylabel('time')
    plt.loglog(sizes, times)
    plt.savefig("transpose-times.png")
Code Example #6
File: ising2D.py  Project: bvillasen/isingModel
def swipe():
  randomNumbers_d = curandom.rand((nData))
  stepNumber = np.int32(0)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), beta,
               spinsOut_d, randomNumbers_d, grid=grid2D_ising, block=block2D )
  copy2D_dtod(aligned=True)

  stepNumber = np.int32(1)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), beta,
               spinsOut_d, randomNumbers_d, grid=grid2D_ising, block=block2D )
  copy2D_dtod(aligned=True)
Code Example #7
File: MatrixTranspose.py  Project: bbkiwi/SpyderWork
def check_transpose():
    from pycuda.curandom import rand

    for i in numpy.arange(10, 13, 0.125):
        size = int(((2**i) // 32) * 32)
        print(size)

        source = rand((size, size), dtype=numpy.float32)

        result = transpose(source)

        err = source.get().T - result.get()
        err_norm = la.norm(err)

        source.gpudata.free()
        result.gpudata.free()

        # transposition only moves values without arithmetic, so the match should be exact
        assert err_norm == 0, (size, err_norm)
Code Example #8
File: ising3D.py  Project: bvillasen/isingModel
def swipe():
  randomNumbers_d = curandom.rand((nData))
  stepNumber = np.int32(0)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  surf_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), beta,
               spinsOut_d, randomNumbers_d,
               plotDataFloat_d, np.float32(upVal), np.float32(downVal), grid=grid3D_ising, block=block3D )
  #copy3D_dtod()

  stepNumber = np.int32(1)
  #saveEnergy = np.int32(0)
  tex_spins.set_array( spinsInArray_d )
  surf_spins.set_array( spinsInArray_d )
  isingKernel( stepNumber, np.int32(nWidth), np.int32(nHeight), np.int32(nDepth), beta,
               spinsOut_d, randomNumbers_d,
               plotDataFloat_d, np.float32(upVal), np.float32(downVal), grid=grid3D_ising, block=block3D )
Code Example #9
    def initialisation(self, x_init):

        y  = curand.rand(x_init.shape)
        y  -= x_init + 0.5 # this is only a fix, remove this line if possible
        if self.options.compute_both:
            fx, gx = self.objective.compute_both(x_init)
            fy, gy = self.objective.compute_both(y)
            
            if fx < fy:
                self.x      = x_init
                self.oldx   = y
                self.g      = gx
                self.oldg   = gy
                self.obj    = fx
                self.oldobj = fy
              
            else:
                self.x      = y
                self.oldx   = x_init
                self.g      = gy
                self.oldg   = gx
                self.obj    = fy
                self.oldobj = fx
        else:
            fx = self.objective.compute_obj(x_init)
            fy = self.objective.compute_obj(y)
    
            if fx < fy:
                self.x      = x_init
                self.oldx   = y
                self.g      = self.objective.compute_grad(x_init)
                self.oldg   = self.objective.compute_grad(y)
                self.obj    = fx
                self.oldobj = fy
                
            else:
                self.x      = y
                self.oldx   = x_init
                self.g      = self.objective.compute_grad(y)
                self.oldg   = self.objective.compute_grad(x_init)
                self.obj    = fy
                self.oldobj = fx
Code Example #10
File: MatrixTranspose.py  Project: bbkiwi/SpyderWork
def run_benchmark():
    from pycuda.curandom import rand

    sizes = []
    bandwidths = []
    times = []
    for i in numpy.arange(10, 13, 2**(-6)):
        size = int(((2**i) // 16) * 16)

        source = rand((size, size), dtype=numpy.float32)
        target = gpuarray.empty((size, size), dtype=source.dtype)

        start = pycuda.driver.Event()
        stop = pycuda.driver.Event()

        warmup = 2

        for i in range(warmup):
            _transpose(target, source)

        count = 10

        cuda.Context.synchronize()
        start.record()

        for i in range(count):
            _transpose(target, source)

        stop.record()
        stop.synchronize()

        elapsed_seconds = stop.time_since(start)*1e-3
        mem_bw = source.nbytes / elapsed_seconds * 2 * count

        sizes.append(size)
        bandwidths.append(mem_bw)
        times.append(elapsed_seconds)

    slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9]
    print(slow_sizes)
    print([s % 64 for s in slow_sizes])
Code Example #11
File: chaosGlass.py  Project: bvillasen/chaosGlass
def replot():
  global xMin, xMax, yMin, yMax
  global jMin, jMax, iMin, iMax
  global random_d
  jMin, jMax = animation2D.jMin, animation2D.jMax
  iMin, iMax = animation2D.iMin, animation2D.iMax
  xMin += (xMax-xMin)*(float(jMin)/nWidth)
  xMax -= (xMax-xMin)*(float(nWidth-jMax)/nWidth)
  yMin += (yMax-yMin)*(float(iMin)/nHeight)
  yMax -= (yMax-yMin)*(float(nHeight-iMax)/nHeight)
  print "Reploting: ( {0} , {1} , {2} , {3} )".format(xMin, xMax, yMin, yMax)
  start, end = cuda.Event(), cuda.Event()
  start.record()
  random_d = curandom.rand((nData), dtype=npPrcsn)
  mappingLogisticKernel( np.int32(nWidth), np.int32(nHeight), npPrcsn(xMin), npPrcsn(xMax), npPrcsn(yMin), npPrcsn(yMax), random_d, graphPoints_d, grid=mapGrid, block=mapBlock )
  normalize( graphPoints_d )
  end.record()
  end.synchronize()
  print " Map Calculated in: %f secs\n" %( start.time_till(end)*1e-3)
  animation2D.windowTitle = "ploting [ ( {0} , {1} ), ( {2} , {3} ) ]".format(xMin, xMax, yMin, yMax)
  animation2D.jMin, animation2D.jMax = 10000, -1
  animation2D.iMin, animation2D.iMax = 10000, -1
  maskFunc()
Code Example #12
def random_normal(loc=0.0, scale=1.0, size=None):
    u1 = curandom.rand(size, dtype=numpy.float64)
    u2 = curandom.rand(size, dtype=numpy.float64)
    z1 = cumath.sqrt(-2.*cumath.log(u1))*cumath.cos(2.*numpy.pi*u2)
    return CUDAArray(scale*z1+loc)
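This is the Box–Muller transform: for independent uniforms u1, u2 on (0, 1), z1 = sqrt(-2 ln u1) cos(2π u2) is a standard normal. The same pair actually yields a second independent normal via sin; here is a sketch that keeps both halves, assuming the same curandom, cumath, and CUDAArray names as the snippet above:

def random_normal_pair(loc=0.0, scale=1.0, size=None):
    # Box-Muller: each (u1, u2) pair yields two independent standard normals
    u1 = curandom.rand(size, dtype=numpy.float64)
    u2 = curandom.rand(size, dtype=numpy.float64)
    r = cumath.sqrt(-2.*cumath.log(u1))
    z1 = r*cumath.cos(2.*numpy.pi*u2)
    z2 = r*cumath.sin(2.*numpy.pi*u2)
    return CUDAArray(scale*z1+loc), CUDAArray(scale*z2+loc)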
Code Example #13
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import time

import pycuda.gpuarray as gpuarray
import pycuda.curandom as curandom


n = 16*1024*1024
U1 = curandom.rand(n)
U2 = curandom.rand(n)

start_time = time.time()

# fraction of uniform points that fall inside the unit quarter circle
counter = gpuarray.sum((U1*U1 + U2*U2) <= 1.0).get()

print("PI_gpu = ", 4.0*counter/n)
print("Time elapsed GPUArrays: ", time.time() - start_time, "s")

# Sequential part

U1 = numpy.random.rand(n).astype('f')
U2 = numpy.random.rand(n).astype('f')

start_time = time.time()

counter_cpu = numpy.sum( (numpy.power(U1,2) + numpy.power(U2,2)) <= 1.0 )

print "PI_cpu = ", 4.0*counter_cpu/n
Code Example #14
        # Gradient check
        if (self.options.use_tolg):
            nr = cua.max(cua.fabs(self.grad)).get()
            if (nr < self.options.tolg):
                self.term_reason = '|| grad ||_inf < opt.tolg'
                return
         
        # No condition met, so no termination reason is set
        self.term_reason = 0

if __name__ == '__main__':

    case = 2
    if case == 1:
        A  = curand.rand((10000,1000))
        xt = curand.rand((1000,1))
        b  = cua.dot(A, xt)
         
        x_init = cua.empty_like(xt)
        x_init.fill(0.1)
         
        # Set up objective
        objective = MVM_Objective(A,b)
         
        # Default optimization options
        opt = Solopt()
         
        pbb = PBB(objective, x_init, opt)

    elif case == 2:
Code Example #15
File: PlotRandomData.py  Project: bbkiwi/SpyderWork
# simple module to show the plotting of random data

import pycuda.autoinit
import pycuda.curandom as curandom

size = 1000
a = curandom.rand((size,)).get()

from matplotlib.pylab import *

subplot(211)
plot(a)
grid(True)
ylabel("plot - gpu")

subplot(212)
hist(a, 100)
grid(True)
ylabel("histogram - gpu")

# and save it
savefig("plot-random-data")
Code Example #16
def main():
    import numpy
    import pycuda.autoinit
    import pycuda.driver as drv
    import pycuda.curandom as curandom
    import pycuda.gpuarray as gpuarray
    from pytools import Table

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []
    
    for power in range(10, 25):
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)

        if power > 20:
            count = 100
        else:
            count = 1000

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cuda operation which fills the array with random numbers
        for i in range(count):
            curandom.rand((size, ))
            
        #stop timer
        end.record()
        end.synchronize()
        
        #calculate used time
        secs = start.time_till(end)*1e-3

        times.append(secs/count)
        flops.append(size)

        #cpu array that will hold the random data
        a = numpy.empty((size,), dtype=numpy.float32)

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cpu operation which fills the array with random data        
        for i in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        #stop timer
        end.record()
        end.synchronize()
        
        #calculate used time
        secs = start.time_till(end)*1e-3

        #add results to variable
        timesCPU.append(secs/count)
        flopsCPU.append(size)
            
            
    #calculate pseudo flops
    flops = [f/t for f, t in zip(flops,times)]
    flopsCPU = [f/t for f, t in zip(flopsCPU,timesCPU)]

    #print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU","Size/Time CPU","GPU vs CPU speedup"))
    for s, t, f,tCpu,fCpu in zip(sizes, times, flops,timesCPU,flopsCPU):
        tbl.add_row((s,t,f,tCpu,fCpu,f/fCpu))
    print(tbl)
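Note that curandom.rand is PyCUDA's quick-and-dirty generator and sets up a fresh array on every call, which is part of what this benchmark measures. PyCUDA also ships CURAND-backed generator classes that keep persistent state; a hedged sketch reusing one generator across iterations (assumes a CURAND-enabled PyCUDA build):

import numpy
import pycuda.autoinit
from pycuda.curandom import XORWOWRandomNumberGenerator

gen = XORWOWRandomNumberGenerator()
for _ in range(100):
    # fills a fresh device array with uniforms from the persistent generator
    a = gen.gen_uniform((1 << 20,), dtype=numpy.float32)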
Code Example #17
File: ising2D.py  Project: bvillasen/isingModel
  if key == animation2D.GLUT_KEY_UP:
    temp += 0.1
  if key == animation2D.GLUT_KEY_DOWN:
    if temp > 0.1: temp -= 0.1
  beta = np.float32(1./temp)
  animation2D.windowTitle = "Ising Model 2D  spins={0}x{1}   T={2:.1f}".format(nHeight, nWidth, float(temp))
  
########################################################################
########################################################################
#Initialize all gpu data
print "\nInitializing Data"
initialMemory = getFreeMemory( show=True )  
#Set initial random distribution
spins_h = (2*np.random.random_integers(0,1,[nHeight, nWidth]) - 1 ).astype(np.int32)
spinsOut_d = gpuarray.to_gpu( spins_h )
randomNumbers_d = curandom.rand((nData))
#For texture version
spinsInArray_d, copy2D_dtod = gpuArray2DtocudaArray( spinsOut_d )
#For shared version
finalMemory = getFreeMemory( show=False )
print " Total Global Memory Used: {0} Mbytes\n".format(float(initialMemory-finalMemory)/1e6) 
########################################################################
########################################################################


#configure animation2D functions and plotData
animation2D.stepFunc = stepFunction
animation2D.specialKeys = specialKeyboardFunc
animation2D.plotData_d = spinsOut_d
animation2D.maxVar = np.float32(2)
animation2D.minVar = np.float32(-20)
Code Example #18
File: chaosGlass.py  Project: bvillasen/chaosGlass
cudaCodeStringRaw = cudaCodeFile.read() 
cudaCodeString = (cudaCodeStringRaw %{"HEIGHT":mapBlock[0], "B_HEIGHT":block2D[1], "B_WIDTH":block2D[0] }).replace("cudaP", precision)
cudaCode = SourceModule(cudaCodeString)
mappingLogisticKernel = cudaCode.get_function('mappingLogistic_kernel')
maskKernel = cudaCode.get_function('mask_kernel')
plotKernel = cudaCode.get_function('plot_kernel')
########################################################################
from pycuda.elementwise import ElementwiseKernel
########################################################################
linearDouble = ElementwiseKernel(
    arguments="cudaP a, cudaP b, cudaP *input, cudaP *output".replace('cudaP', precision),
    operation="output[i] = a*input[i] + b")

#Initialize all gpu data
print "Initializing Data"
initialMemory = getFreeMemory( show=True )  
random_d = curandom.rand((nData), dtype=npPrcsn) 
graphPoints_d= gpuarray.to_gpu( np.zeros([nData], dtype=npPrcsn) ) 	
#For plotting
maskPoints_h = np.ones(nData).astype(np.int32)
maskPoints_d = gpuarray.to_gpu( maskPoints_h )
plotData_d = gpuarray.to_gpu( np.zeros([nData], dtype=npPrcsn) )
finalMemory = getFreeMemory( show=False )
print " Total Global Memory Used: {0} Mbytes".format(float(initialMemory-finalMemory)/1e6) 

def replot():
  global xMin, xMax, yMin, yMax
  global jMin, jMax, iMin, iMax
  global random_d
  jMin, jMax = animation2D.jMin, animation2D.jMax
  iMin, iMax = animation2D.iMin, animation2D.iMax
  xMin += (xMax-xMin)*(float(jMin)/nWidth)