Example #1
0
def main(N=1024, L=100):
    """Time gpuSGEMM through the CUDA runtime API and compare with the CPU.

    The sizes are derived from the argument N: M = N, K = N / 2, and the N
    used for the product is twice the value passed in.  A holds M*K floats,
    B holds K*N floats and C holds M*N floats.  The kernel is launched L
    times, the result is copied back to the host, the same product is
    computed L times with sgemm on the host, and timings, GFlop/s figures
    and the two values returned by checkError are printed.

    N -- base size the matrix dimensions are derived from
    L -- number of times the product is repeated on each side
    """
    M = N
    K = N >> 1
    N = N << 1
    # 2*M*N*K floating-point operations per product, L products, in GFlop.
    flops = (2.e-9 * M * N) * float(K * L)
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L,
                                                               flops))

    t0 = time()

    sizeA = M * K
    sizeB = K * N
    sizeC = M * N

    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    # NOTE(review): getMemory presumably allocates device memory and, when
    # handed a host array, also uploads its contents -- confirm against its
    # definition.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    blockDim = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
    # Explicit floor division: the grid extents have to be integers whatever
    # division semantics are in effect.  Sizes that are not multiples of
    # BLOCK_SIZE are truncated, exactly as before.
    gridDim = dim3(N // BLOCK_SIZE, M // BLOCK_SIZE, 1)
    sharedMem = S4 * 2 * BLOCK_SIZE * BLOCK_SIZE
    # tt accumulates setup + kernel time; t0 is reused for each phase.
    tt = t0 = time() - t0
    print("Overhead runtime API: %.3f sec\n" % t0)

    t0 = time()
    cudaThreadSynchronize()
    for _ in range(L):
        cudaConfigureCall(gridDim, blockDim, sharedMem, 0)
        gpuSGEMM(d_C, d_A, d_B, K, N)
    # Wait for the queued launches before stopping the clock.
    cudaThreadSynchronize()
    t0 = time() - t0
    tt += t0

    h_C = (c_float * sizeC)()
    cudaMemcpy(h_C, d_C, S4 * sizeC, cudaMemcpyDeviceToHost)

    cudaThreadSynchronize()

    cudaFree(d_A)
    cudaFree(d_B)
    cudaFree(d_C)

    cudaThreadExit()
    print("Processing time: %.3g (%.3g) sec" % (t0, tt))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt))

    # Host-side reference result, computed the same number of times.
    ref = (c_float * sizeC)()

    t1 = time()
    for _ in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print("\nProcessing time: %.3g sec" % t1)
    print("Gigaflops CPU  : %.2f" % (flops / t1))
    print("Speedup GPU/CPU: %.2f" % (t1 / t0))

    err, mxe = checkError(ref, h_C)
    print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))
Example #2
0
    kr = int(sys.argv[1])
    dims = tuple([int(x) for x in sys.argv[2].split(",")])
except IndexError:
    sys.exit()

# A negative kr selects the complex transform; from here on kr holds its
# magnitude.
doComplex = kr < 0
if doComplex:
    kr = -kr

# Total number of points: the product of all requested dimensions.
size = reduce(lambda acc, n: acc * n, dims)
# The complex case allocates twice as many floats as the real case.
r = (c_float * (size * 2 if doComplex else size))()
arrayInit(r)

sz = 1.e6 / float(size)

# Starting points for the CPU-time and wall-clock measurements.
fftw_start = time.clock()
wall_start = time.time()

xr = .5 / float(kr)

if doComplex:
    text = "complex"
    rcfftx = xfft.ccfft
    crfftx = xfft.icfft
else:
    text = "   real"
    rcfftx = xfft.rcfft
Example #3
0
def main(N=1024, L=100):
    """Time cublasSgemm against the CPU sgemm and print the comparison.

    The sizes are derived from the argument N: M = N, K = N / 2, and the N
    used for the product is twice the value passed in.  cublasSgemm is
    called L times on the device copies, sgemm is called L times on the
    host arrays, and timings, GFlop/s figures and the two values returned
    by checkError are printed.

    N -- base size the matrix dimensions are derived from
    L -- number of times the product is repeated on each side
    """
    M = N
    K = N >> 1
    N = N << 1
    flops = (2.e-9 * M * N) * float(K * L)
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n"
          % (M, N, K, L, flops))
    # Element counts of A, B and C, and the alpha/beta factors for SGEMM.
    na = M * K
    nb = K * N
    nc = M * N
    alfa = 1.
    beta = 0.

    start = time()
    cublasInit()

    h_A = (c_float * na)()
    h_B = (c_float * nb)()
    h_C = (c_float * nc)()
    g_C = (c_float * nc)()  # receives the device result for the comparison

    arrayInit(h_A, na)
    arrayInit(h_B, nb)
    arrayInit(h_C, nc)

    d_A = c_void_p()
    d_B = c_void_p()
    d_C = c_void_p()

    elem = sizeof(c_float)
    # Allocate all three device buffers first, then upload the host data.
    for count, dev in ((na, d_A), (nb, d_B), (nc, d_C)):
        cublasAlloc(count, elem, byref(dev))
    for count, host, dev in ((na, h_A, d_A), (nb, h_B, d_B), (nc, h_C, d_C)):
        cublasSetVector(count, elem, host, 1, dev, 1)
    overhead = time() - start
    print("Overhead CUBLAS: %.3f sec\n" % overhead)

    start = time()
    for _ in range(L):
        cublasSgemm('n', 'n', M, N, K, alfa, d_A, M, d_B, K, beta, d_C, M)
    # Wait for the queued calls before stopping the clock.
    cudaThreadSynchronize()
    gpu_time = time() - start
    total = overhead + gpu_time

    print("Processing time: %.3g (%.3g) sec" % (gpu_time, total))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops / gpu_time, flops / total))

    start = time()
    for _ in range(L):
        sgemm(h_C, h_A, h_B, M, N, K)
    cpu_time = time() - start
    print("\nProcessing time: %.3g sec" % cpu_time)
    print("Gigaflops CPU  : %.2f" % (flops / cpu_time))
    print("Speedup GPU/CPU: %.2f" % (cpu_time / gpu_time))

    cublasGetVector(nc, elem, d_C, 1, g_C, 1)
    err, mxe = checkError(h_C, g_C, nc)
    print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))

    for dev in (d_A, d_B, d_C):
        cublasFree(dev)

    cublasShutdown()
Example #4
0
def main(check=False, doComplex=False, dims=(128, )):
    """Run a forward + inverse FFT on the GPU and report time and error.

    The input array is transformed and transformed back through gf, copied
    to the host, scaled by 1/size and compared with the original using
    checkError.  With check set, the same round trip is done on the CPU
    through xf and compared against both the input and the GPU result.

    check     -- also run and time the CPU round trip
    doComplex -- use one C2C plan instead of an R2C/C2R pair
    dims      -- sequence of 1 to 3 dimension sizes (the label table below
                 has no entry for more than three)
    """
    print("+------------------------+")
    print("| Fast Fourier Transform |")
    print("| using CUDA runtime API |")
    print("+------------------------+\n")
    dims = tuple(dims)
    ndim = len(dims)
    v = ("", "NX = %d", "NX = %d NY = %d", "NX = %d NY = %d NZ = %d")
    # SC: number of points.  SR: floats in the buffer between the R2C and
    # C2R transforms, with dims[-1] // 2 + 1 pairs of floats along the last
    # dimension.  Floor division is spelled out because the result is used
    # as an element count.
    SC = reduce(lambda x, y: x * y, dims)
    SR = reduce(lambda x, y: x * y, dims[:-1], 1)
    SR *= 2 * (dims[-1] // 2 + 1)

    print(v[ndim] % dims)
    print("< doComplex: %s >\n" % doComplex)

    # rz and flops are computed from the point count, before SC is doubled
    # for the complex case.
    rz = 1. / float(SC)
    flops = 2. * 5. * SC * log(SC) / log(2.) * 1.e-9
    if doComplex:
        SC *= 2
    S4 = sizeof(c_float)

    h_A = (c_float * SC)()
    g_A = (c_float * SC)()
    arrayInit(h_A)

    d_A = getMemory(h_A)
    # Hard-wired switch: with True the output buffers are allocated here;
    # the False paths below are kept for reference but never run.
    allocate = True

    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)

    if doComplex:
        plan = gf.makePlan(dims, CUFFT_C2C)
    else:
        plan1 = gf.makePlan(dims, CUFFT_R2C)
        plan2 = gf.makePlan(dims, CUFFT_C2R)

    t0 = time()
    x0 = ReadTimestampCounter()
    cudaThreadSynchronize()

    if doComplex:
        d_B = gf.ccfft(plan, d_A, None, d_B)
        d_A = gf.icfft(plan, d_B, None, d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1, d_A, None, d_B)
            d_A = gf.crfft(plan2, d_B, None, d_A)
        else:
            d_B = gf.rcfft(plan1, d_A, SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2, d_B, SR)

    cudaThreadSynchronize()
    t0 = time() - t0
    x1 = ReadTimestampCounter()
    # NOTE(review): the factor presumably converts counter ticks to
    # microseconds for a 2.8 GHz clock -- confirm for the target machine.
    fc = 1.e-3 / 2.8
    print("RDTSC: %.0f µs" % ((x1 - x0) * fc))

    cudaMemcpy(g_A, d_A, S4 * SC, cudaMemcpyDeviceToHost)

    cudaFree(d_A)
    cudaFree(d_B)

    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)

    cudaThreadExit()
    scale(g_A, rz)

    print("\nProcessing time: %.3g sec" % t0)
    print("Gigaflops GPU  : %.2f" % (flops / t0))
    gflops = (flops / t0, )

    print("\nError CPU initial vs GPU")
    err, mxe = checkError(h_A, g_A)
    # Keep the GPU-vs-input figures for the summary line; err and mxe are
    # overwritten by the CPU checks below.
    stats = err, mxe
    print("Avg and max rel error = %.2e %.2e\n" % (err, mxe))

    if check:
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A, dims)
            h_B = xf.icfft(h_B, dims)
        else:
            h_B = xf.rcfft(h_A, dims)
            h_B = xf.crfft(h_B, dims)
        t1 = time() - t1
        print("Processing time: %.3g sec" % t1)
        print("Gigaflops CPU  : %.2f" % (flops / t1))
        print("Speedup GPU/CPU: %.2f" % (t1 / t0))

        print("\nError CPU final vs CPU initial")
        err, mxe = checkError(h_B, h_A)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))

        print("\nError CPU final vs GPU")
        err, mxe = checkError(h_B, g_A)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))
    # Speedup for the summary line; -1 marks "CPU run skipped".
    f = (t1 / t0, ) if check else (-1., )
    fmt = "\n## " + " ".join(len(dims) * ["%3d"]) + " : %.1f %.1f: %.2e %.2e"
    print(fmt % (dims + gflops + f + stats))
Example #5
0
def main(N=1024, L=100):
    """Time gpuSGEMM through the CUDA driver API and compare with the CPU.

    The sizes are derived from the argument N: M = N, K = N / 2, and the N
    used for the product is twice the value passed in.  A holds M*K floats,
    B holds K*N floats and C holds M*N floats.  The kernel is loaded from
    "gpuFunctions.cubin", launched L times, the result is copied back to
    the host, the same product is computed L times with sgemm on the host,
    and timings, GFlop/s figures and the two values returned by checkError
    are printed.

    N -- base size the matrix dimensions are derived from
    L -- number of times the product is repeated on each side
    """
    M = N
    K = N >> 1
    N = N << 1
    # 2*M*N*K floating-point operations per product, L products, in GFlop.
    flops = (2.e-9 * M * N) * float(K * L)
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L,
                                                               flops))

    t0 = time()
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    gpuSGEMM = device.getFunction("gpuSGEMM")

    sizeA = M * K
    sizeB = K * N
    sizeC = M * N

    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    # NOTE(review): getMemory presumably allocates device memory and, when
    # handed a host array, also uploads its contents -- confirm against its
    # definition.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    cuFuncSetBlockShape(gpuSGEMM, BLOCK_SIZE, BLOCK_SIZE, 1)
    cuFuncSetSharedSize(gpuSGEMM, 2 * BLOCK_SIZE * BLOCK_SIZE * S4)
    # Kernel arguments are laid out at 4-byte offsets, 20 bytes in total.
    # NOTE(review): this assumes device pointers fit in 4 bytes -- confirm
    # before running against a 64-bit device address space.
    cuParamSeti(gpuSGEMM, 0, d_C)
    cuParamSeti(gpuSGEMM, 4, d_A)
    cuParamSeti(gpuSGEMM, 8, d_B)
    cuParamSeti(gpuSGEMM, 12, K)
    cuParamSeti(gpuSGEMM, 16, N)
    cuParamSetSize(gpuSGEMM, 20)
    # tt accumulates setup + kernel time; t0 is reused for each phase.
    tt = t0 = time() - t0
    print("Overhead driver API: %.3f sec\n" % t0)

    # Explicit floor division: the grid extents have to be integers whatever
    # division semantics are in effect.  Sizes that are not multiples of
    # BLOCK_SIZE are truncated, exactly as before.
    gridX = N // BLOCK_SIZE
    gridY = M // BLOCK_SIZE

    t0 = time()
    cuCtxSynchronize()
    for _ in range(L):
        cuLaunchGrid(gpuSGEMM, gridX, gridY)
    # Wait for the queued launches before stopping the clock.
    cuCtxSynchronize()
    t0 = time() - t0
    tt += t0

    h_C = (c_float * sizeC)()
    cuMemcpyDtoH(h_C, d_C, S4 * sizeC)
    cuCtxSynchronize()

    cuMemFree(d_A)
    cuMemFree(d_B)
    cuMemFree(d_C)
    cuCtxDetach(device.context)

    print("Processing time: %.3g (%.3g) sec" % (t0, tt))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt))

    # Host-side reference result, computed the same number of times.
    ref = (c_float * sizeC)()

    t1 = time()
    for _ in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print("\nProcessing time: %.3g sec" % t1)
    print("Gigaflops CPU  : %.2f" % (flops / t1))
    print("Speedup GPU/CPU: %.2f" % (t1 / t0))

    err, mxe = checkError(ref, h_C)
    print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))
Example #6
0
def main(check=False, doComplex=False, dims=(128,)):
    """Run a forward + inverse FFT on the GPU and report time and error.

    The input array is transformed and transformed back through gf, copied
    to the host, scaled by 1/size and compared with the original using
    checkError.  With check set, the same round trip is done on the CPU
    through xf and compared against both the input and the GPU result.

    check     -- also run and time the CPU round trip
    doComplex -- use one C2C plan instead of an R2C/C2R pair
    dims      -- sequence of 1 to 3 dimension sizes (the label table below
                 has no entry for more than three)
    """
    print("+------------------------+")
    print("| Fast Fourier Transform |")
    print("| using CUDA runtime API |")
    print("+------------------------+\n")
    dims = tuple(dims)
    ndim = len(dims)
    v = ("", "NX = %d", "NX = %d NY = %d", "NX = %d NY = %d NZ = %d")
    # SC: number of points.  SR: floats in the buffer between the R2C and
    # C2R transforms, with dims[-1] // 2 + 1 pairs of floats along the last
    # dimension.  Floor division is spelled out because the result is used
    # as an element count.
    SC = reduce(lambda x, y: x * y, dims)
    SR = reduce(lambda x, y: x * y, dims[:-1], 1)
    SR *= 2 * (dims[-1] // 2 + 1)

    print(v[ndim] % dims)
    print("< doComplex: %s >\n" % doComplex)

    # rz and flops are computed from the point count, before SC is doubled
    # for the complex case.
    rz = 1. / float(SC)
    flops = 2. * 5. * SC * log(SC) / log(2.) * 1.e-9
    if doComplex:
        SC *= 2
    S4 = sizeof(c_float)

    h_A = (c_float * SC)()
    g_A = (c_float * SC)()
    arrayInit(h_A)

    d_A = getMemory(h_A)
    # Hard-wired switch: with True the output buffers are allocated here;
    # the False paths below are kept for reference but never run.
    allocate = True

    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)

    if doComplex:
        plan = gf.makePlan(dims, CUFFT_C2C)
    else:
        plan1 = gf.makePlan(dims, CUFFT_R2C)
        plan2 = gf.makePlan(dims, CUFFT_C2R)

    t0 = time()
    x0 = ReadTimestampCounter()
    cudaThreadSynchronize()

    if doComplex:
        d_B = gf.ccfft(plan, d_A, None, d_B)
        d_A = gf.icfft(plan, d_B, None, d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1, d_A, None, d_B)
            d_A = gf.crfft(plan2, d_B, None, d_A)
        else:
            d_B = gf.rcfft(plan1, d_A, SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2, d_B, SR)

    cudaThreadSynchronize()
    t0 = time() - t0
    x1 = ReadTimestampCounter()
    # NOTE(review): the factor presumably converts counter ticks to
    # microseconds for a 2.8 GHz clock -- confirm for the target machine.
    fc = 1.e-3 / 2.8
    print("RDTSC: %.0f µs" % ((x1 - x0) * fc))

    cudaMemcpy(g_A, d_A, S4 * SC, cudaMemcpyDeviceToHost)

    cudaFree(d_A)
    cudaFree(d_B)

    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)

    cudaThreadExit()
    scale(g_A, rz)

    print("\nProcessing time: %.3g sec" % t0)
    print("Gigaflops GPU  : %.2f" % (flops / t0))
    gflops = (flops / t0,)

    print("\nError CPU initial vs GPU")
    err, mxe = checkError(h_A, g_A)
    # Keep the GPU-vs-input figures for the summary line; err and mxe are
    # overwritten by the CPU checks below.
    stats = err, mxe
    print("Avg and max rel error = %.2e %.2e\n" % (err, mxe))

    if check:
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A, dims)
            h_B = xf.icfft(h_B, dims)
        else:
            h_B = xf.rcfft(h_A, dims)
            h_B = xf.crfft(h_B, dims)
        t1 = time() - t1
        print("Processing time: %.3g sec" % t1)
        print("Gigaflops CPU  : %.2f" % (flops / t1))
        print("Speedup GPU/CPU: %.2f" % (t1 / t0))

        print("\nError CPU final vs CPU initial")
        err, mxe = checkError(h_B, h_A)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))

        print("\nError CPU final vs GPU")
        err, mxe = checkError(h_B, g_A)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))
    # Speedup for the summary line; -1 marks "CPU run skipped".
    f = (t1 / t0,) if check else (-1.,)
    fmt = "\n## " + " ".join(len(dims) * ["%3d"]) + " : %.1f %.1f: %.2e %.2e"
    print(fmt % (dims + gflops + f + stats))
Example #7
0
def main(N=1024, L=100):
    """Time gpuSGEMM through the CUDA runtime API and compare with the CPU.

    The sizes are derived from the argument N: M = N, K = N / 2, and the N
    used for the product is twice the value passed in.  A holds M*K floats,
    B holds K*N floats and C holds M*N floats.  The kernel is launched L
    times, the result is copied back to the host, the same product is
    computed L times with sgemm on the host, and timings, GFlop/s figures
    and the two values returned by checkError are printed.

    N -- base size the matrix dimensions are derived from
    L -- number of times the product is repeated on each side
    """
    M = N
    K = N >> 1
    N = N << 1
    # 2*M*N*K floating-point operations per product, L products, in GFlop.
    flops = (2.e-9 * M * N) * float(K * L)
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n"
          % (M, N, K, L, flops))

    t0 = time()

    sizeA = M * K
    sizeB = K * N
    sizeC = M * N

    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    # NOTE(review): getMemory presumably allocates device memory and, when
    # handed a host array, also uploads its contents -- confirm against its
    # definition.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    blockDim = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
    # Explicit floor division: the grid extents have to be integers whatever
    # division semantics are in effect.  Sizes that are not multiples of
    # BLOCK_SIZE are truncated, exactly as before.
    gridDim = dim3(N // BLOCK_SIZE, M // BLOCK_SIZE, 1)
    sharedMem = S4 * 2 * BLOCK_SIZE * BLOCK_SIZE
    # tt accumulates setup + kernel time; t0 is reused for each phase.
    tt = t0 = time() - t0
    print("Overhead runtime API: %.3f sec\n" % t0)

    t0 = time()
    cudaThreadSynchronize()
    for _ in range(L):
        cudaConfigureCall(gridDim, blockDim, sharedMem, 0)
        gpuSGEMM(d_C, d_A, d_B, K, N)
    # Wait for the queued launches before stopping the clock.
    cudaThreadSynchronize()
    t0 = time() - t0
    tt += t0

    h_C = (c_float * sizeC)()
    cudaMemcpy(h_C, d_C, S4 * sizeC, cudaMemcpyDeviceToHost)

    cudaThreadSynchronize()

    cudaFree(d_A)
    cudaFree(d_B)
    cudaFree(d_C)

    cudaThreadExit()
    print("Processing time: %.3g (%.3g) sec" % (t0, tt))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt))

    # Host-side reference result, computed the same number of times.
    ref = (c_float * sizeC)()

    t1 = time()
    for _ in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print("\nProcessing time: %.3g sec" % t1)
    print("Gigaflops CPU  : %.2f" % (flops / t1))
    print("Speedup GPU/CPU: %.2f" % (t1 / t0))

    err, mxe = checkError(ref, h_C)
    print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))
Example #8
0
    kr = int(sys.argv[1])
    dims = tuple([int(x) for x in sys.argv[2].split(",")])
except IndexError:
    sys.exit()

# A negative kr selects the complex transform; from here on kr holds its
# magnitude.
doComplex = kr < 0
if doComplex:
    kr = -kr

# Total number of points: the product of all requested dimensions.
size = reduce(lambda acc, n: acc * n, dims)
# The complex case allocates twice as many floats as the real case.
r = (c_float * (size * 2 if doComplex else size))()
arrayInit(r)

sz = 1.e6 / float(size)

# Starting points for the CPU-time and wall-clock measurements.
fftw_start = time.clock()
wall_start = time.time()

xr = .5 / float(kr)

if doComplex:
    text = "complex"
    rcfftx = xfft.ccfft
    crfftx = xfft.icfft
else:
    text = "   real"
Example #9
0
def main(N=1024, L=100):
    """Time gpuSGEMM through the CUDA driver API and compare with the CPU.

    The sizes are derived from the argument N: M = N, K = N / 2, and the N
    used for the product is twice the value passed in.  A holds M*K floats,
    B holds K*N floats and C holds M*N floats.  The kernel is loaded from
    "gpuFunctions.cubin", launched L times, the result is copied back to
    the host, the same product is computed L times with sgemm on the host,
    and timings, GFlop/s figures and the two values returned by checkError
    are printed.

    N -- base size the matrix dimensions are derived from
    L -- number of times the product is repeated on each side
    """
    M = N
    K = N >> 1
    N = N << 1
    # 2*M*N*K floating-point operations per product, L products, in GFlop.
    flops = (2.e-9 * M * N) * float(K * L)
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n"
          % (M, N, K, L, flops))

    t0 = time()
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    gpuSGEMM = device.getFunction("gpuSGEMM")

    sizeA = M * K
    sizeB = K * N
    sizeC = M * N

    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    # NOTE(review): getMemory presumably allocates device memory and, when
    # handed a host array, also uploads its contents -- confirm against its
    # definition.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    cuFuncSetBlockShape(gpuSGEMM, BLOCK_SIZE, BLOCK_SIZE, 1)
    cuFuncSetSharedSize(gpuSGEMM, 2 * BLOCK_SIZE * BLOCK_SIZE * S4)
    # Kernel arguments are laid out at 4-byte offsets, 20 bytes in total.
    # NOTE(review): this assumes device pointers fit in 4 bytes -- confirm
    # before running against a 64-bit device address space.
    cuParamSeti(gpuSGEMM, 0, d_C)
    cuParamSeti(gpuSGEMM, 4, d_A)
    cuParamSeti(gpuSGEMM, 8, d_B)
    cuParamSeti(gpuSGEMM, 12, K)
    cuParamSeti(gpuSGEMM, 16, N)
    cuParamSetSize(gpuSGEMM, 20)
    # tt accumulates setup + kernel time; t0 is reused for each phase.
    tt = t0 = time() - t0
    print("Overhead driver API: %.3f sec\n" % t0)

    # Explicit floor division: the grid extents have to be integers whatever
    # division semantics are in effect.  Sizes that are not multiples of
    # BLOCK_SIZE are truncated, exactly as before.
    gridX = N // BLOCK_SIZE
    gridY = M // BLOCK_SIZE

    t0 = time()
    cuCtxSynchronize()
    for _ in range(L):
        cuLaunchGrid(gpuSGEMM, gridX, gridY)
    # Wait for the queued launches before stopping the clock.
    cuCtxSynchronize()
    t0 = time() - t0
    tt += t0

    h_C = (c_float * sizeC)()
    cuMemcpyDtoH(h_C, d_C, S4 * sizeC)
    cuCtxSynchronize()

    cuMemFree(d_A)
    cuMemFree(d_B)
    cuMemFree(d_C)
    cuCtxDetach(device.context)

    print("Processing time: %.3g (%.3g) sec" % (t0, tt))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt))

    # Host-side reference result, computed the same number of times.
    ref = (c_float * sizeC)()

    t1 = time()
    for _ in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print("\nProcessing time: %.3g sec" % t1)
    print("Gigaflops CPU  : %.2f" % (flops / t1))
    print("Speedup GPU/CPU: %.2f" % (t1 / t0))

    err, mxe = checkError(ref, h_C)
    print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))