Example #1
0
def eps_l_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Accumulate a sum of two-GEMM products into ``out``, one stream per term.

    For every index ``s`` into ``A1`` two double-complex cuBLAS GEMMs are
    queued on ``streams[s]`` (written in cuBLAS's column-major convention):

        tmp[s]  = x * A1[s]^H
        tmp2[s] = A2[s] * tmp[s]

    Once every stream has been synchronized, ``out`` is zeroed and each
    ``tmp2[s]`` is added to it with ``cublasZaxpy`` on stream 0.

    NOTE(review): if the buffers hold row-major data, the above corresponds
    to ``out = sum_s A1[s]^H x A2[s]`` -- confirm against the callers.

    Args:
        x: device array; its ``gpudata`` is the left operand of the first GEMM.
        A1: sequence of device arrays; ``A1[0].shape[0]`` provides the
            dimension ``D`` used for every size and leading-dimension argument.
        A2: sequence of device arrays, indexed like ``A1``.
        out: device array receiving the sum; its previous contents are lost.
        tmp: sequence of scratch device arrays, one per entry of ``A1``.
        tmp2: sequence of scratch device arrays, one per entry of ``A1``.
        streams: sequence of stream objects exposing ``handle`` and
            ``synchronize()``; indexed with the same indices as ``A1``.
        handle: cuBLAS handle; it is bound to stream 0 when this returns.

    Returns:
        ``out``.
    """
    D = A1[0].shape[0]

    # ``range`` instead of ``xrange`` so the function runs on Python 3 as well
    # as Python 2; the index count here is just the number of operators.
    for s in range(len(A1)):
        # Route the two dependent GEMMs for term ``s`` through its own stream.
        cb.cublasSetStream(handle, streams[s].handle)
        cb.cublasZgemm(handle, "N", "C", D, D, D, 1.0, x.gpudata, D, A1[s].gpudata, D, 0.0, tmp[s].gpudata, D)
        cb.cublasZgemm(handle, "N", "N", D, D, D, 1.0, A2[s].gpudata, D, tmp[s].gpudata, D, 0.0, tmp2[s].gpudata, D)

    # Every tmp2[s] must be complete before it is summed below.
    for s in streams:
        s.synchronize()

    # Reduce on stream 0: out = sum_s tmp2[s], treating each buffer as a flat
    # vector of D * D elements.
    cb.cublasSetStream(handle, 0)
    out.fill(0)
    for s in range(len(A1)):
        cb.cublasZaxpy(handle, D * D, 1.0, tmp2[s].gpudata, 1, out.gpudata, 1)

    return out
Example #2
0
def eps_l_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Sum per-term GEMM products into ``out`` using one stream per term.

    For each index ``k`` into ``A1``, two double-complex GEMMs are issued on
    ``streams[k]`` (expressed in cuBLAS's column-major convention):

        tmp[k]  = x * A1[k]^H
        tmp2[k] = A2[k] * tmp[k]

    All streams are then synchronized, ``out`` is cleared, and the ``tmp2``
    buffers are added into it one by one with ``cublasZaxpy`` on stream 0.

    NOTE(review): for row-major buffers this would read
    ``out = sum_k A1[k]^H x A2[k]`` -- confirm against the callers.

    Args:
        x: device array used as the left operand of the first GEMM.
        A1: sequence of device arrays; ``A1[0].shape[0]`` sets ``D``, which
            is used for all sizes and leading dimensions.
        A2: sequence of device arrays, indexed like ``A1``.
        out: device array that is overwritten with the sum.
        tmp: per-term scratch device arrays (first GEMM result).
        tmp2: per-term scratch device arrays (second GEMM result).
        streams: stream objects providing ``handle`` and ``synchronize()``,
            indexed with the same indices as ``A1``.
        handle: cuBLAS handle; bound to stream 0 on return.

    Returns:
        ``out``.
    """
    D = A1[0].shape[0]
    n_terms = len(A1)
    n_elems = D * D

    for k in range(n_terms):
        # Both GEMMs of term k are queued on that term's stream.
        cb.cublasSetStream(handle, streams[k].handle)
        cb.cublasZgemm(
            handle, 'N', 'C', D, D, D,
            1., x.gpudata, D,
            A1[k].gpudata, D,
            0., tmp[k].gpudata, D)
        cb.cublasZgemm(
            handle, 'N', 'N', D, D, D,
            1., A2[k].gpudata, D,
            tmp[k].gpudata, D,
            0., tmp2[k].gpudata, D)

    # Wait for every stream so all tmp2 buffers are final.
    for stream in streams:
        stream.synchronize()

    # Accumulate the partial results on stream 0.
    cb.cublasSetStream(handle, 0)
    out.fill(0)
    for k in range(n_terms):
        cb.cublasZaxpy(handle, n_elems, 1., tmp2[k].gpudata, 1,
                       out.gpudata, 1)

    return out
Example #3
0
def eps_r_noop_strm(x, A1, A2, out, tmp, tmp2, streams, handle):
    """Sum per-term GEMM products into ``out`` using one stream per term.

    For each index ``k`` into ``A1``, two double-complex GEMMs are issued on
    ``streams[k]`` (expressed in cuBLAS's column-major convention):

        tmp[k]  = x * A1[k]
        tmp2[k] = A2[k]^H * tmp[k]

    All streams are then synchronized, ``out`` is cleared, and the ``tmp2``
    buffers are added into it one by one with ``cublasZaxpy`` on stream 0.

    NOTE(review): for row-major buffers this would read
    ``out = sum_k A1[k] x A2[k]^H`` -- confirm against the callers.

    Args:
        x: device array used as the left operand of the first GEMM.
        A1: sequence of device arrays; ``A1[0].shape[0]`` sets ``D``.
        A2: sequence of device arrays, indexed like ``A1``.
        out: device array that is overwritten with the sum.
        tmp: per-term scratch device arrays (first GEMM result).
        tmp2: per-term scratch device arrays (second GEMM result).
        streams: stream objects providing ``handle`` and ``synchronize()``,
            indexed with the same indices as ``A1``.
        handle: cuBLAS handle; bound to stream 0 on return.

    Returns:
        ``out``.
    """
    D = A1[0].shape[0]
    # The second dimension is taken to be equal to D here; it is kept under
    # its own name so the size arguments below show which role each plays.
    D_other = D
    n_terms = len(A1)
    n_elems = D_other * D_other

    for k in range(n_terms):
        # Both GEMMs of term k are queued on that term's stream.
        cb.cublasSetStream(handle, streams[k].handle)
        cb.cublasZgemm(
            handle, 'N', 'N', D, D_other, D,
            1., x.gpudata, D,
            A1[k].gpudata, D,
            0., tmp[k].gpudata, D)
        cb.cublasZgemm(
            handle, 'C', 'N', D_other, D_other, D,
            1., A2[k].gpudata, D,
            tmp[k].gpudata, D,
            0., tmp2[k].gpudata, D_other)

    # Wait for every stream so all tmp2 buffers are final.
    for stream in streams:
        stream.synchronize()

    # Accumulate the partial results on stream 0 with axpy over the flattened
    # buffers (a cublasZgeam-based accumulation was left disabled here before).
    cb.cublasSetStream(handle, 0)
    out.fill(0)
    for k in range(n_terms):
        cb.cublasZaxpy(handle, n_elems, 1., tmp2[k].gpudata, 1,
                       out.gpudata, 1)

    return out