コード例 #1
0
ファイル: cuda.py プロジェクト: kazunori279/chainer
def shutdown():
    """Tear down the module-wide CUDA state.

    :mod:`atexit` invokes this function automatically.  It may also be called
    by hand, any number of times: once the state has been released, later
    calls return immediately.

    """
    global _contexts, _cublas_handles, _pid, _pools

    if os.getpid() != _pid:
        # Nothing has been initialized under this process id.
        return

    # Destroy every cuBLAS handle, then forget the table that held them.
    for handle in six.itervalues(_cublas_handles):
        cublas.cublasDestroy(handle)
    _cublas_handles = {}

    cumisc.shutdown()

    _pools = {}

    # Contexts are detached only after the handles and pools are gone.
    for context in six.itervalues(_contexts):
        context.detach()
    _contexts = {}

    # A pid of None marks the module as uninitialized again.
    _pid = None
コード例 #2
0
ファイル: main.py プロジェクト: dmmiron/fast_gpu_net
def classify(image_names, model_file_name, output_names):
    """
    Classify a set of images using the given model.

    Parameters
    ----------
    image_names : iterable of strings
        names of the input images
    model_file_name : string
        name of the file containing the model
    output_names : iterable of strings
        names of the output images

    Notes
    -----
    image_names and output_names should have the same length and indices match. i.e. image_names[idx] -> output_names[idx]
    The cuBLAS handle created here is destroyed even when loading the model or
    classifying an image raises.
    """
    handle = cublas.cublasCreate()
    try:
        model = serial.load(model_file_name)
        for image_name, output_name in zip(image_names, output_names):
            image = load_image(image_name)
            output = classify_image(image, model, handle)
            # The classifier output is scaled by 255 and rounded to integers
            # before being written out.
            save_image(np.int32(np.round(output*255)), output_name)
    finally:
        cublas.cublasDestroy(handle)
コード例 #3
0
ファイル: cuda.py プロジェクト: ALEXGUOQ/chainer
def shutdown():
    """Finalizes CUDA global state.

    This function is automatically called by :mod:`atexit`. Multiple calls are
    allowed, so user can manually call this function if necessary.

    """
    global _contexts, _cublas_handles, _pid, _pools

    pid = os.getpid()
    if _pid != pid:  # not initialized
        return

    # ``values()`` exists on both Python 2 and Python 3 dictionaries, whereas
    # ``itervalues()`` is Python 2 only.
    for cublas_handle in _cublas_handles.values():
        cublas.cublasDestroy(cublas_handle)
    _cublas_handles = {}

    cumisc.shutdown()

    _pools = {}

    for ctx in _contexts.values():
        ctx.detach()
    _contexts = {}
    _pid = None  # mark as uninitialized
コード例 #4
0
ファイル: build_delaunay.py プロジェクト: Castronova/cuda
def transpose(a):
    '''
    Transpose a 2-D matrix on the GPU with cuBLAS ``geam`` and print the
    elapsed wall-clock time of the call.

    https://github.com/lebedov/scikit-cuda/issues/33
    pip install --upgrade --no-deps git+https://github.com/lebedov/scikits.cuda.git

    :param a: 2-D host array; it is converted to a C-contiguous float64 array
        before upload because ``cublasDgeam`` works on doubles
    :return: GPU array of shape ``(a.shape[1], a.shape[0])`` holding ``a.T``
    '''
    import time
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import scikits.cuda.cublas as cublas

    # N = 1000
    # a = np.random.rand(N, N)
    a = np.ascontiguousarray(a, dtype=np.double)
    rows = a.shape[0]
    cols = a.shape[1]
    a_gpu = gpuarray.to_gpu(a)
    a_trans_gpu = gpuarray.zeros((cols, rows), dtype=np.double)
    alpha = 1.0
    beta = 0.0

    handle = cublas.cublasCreate()
    try:
        start = time.time()
        # cuBLAS is column-major: the row-major (rows, cols) input is seen as
        # a cols x rows matrix with leading dimension ``cols``, and the
        # row-major (cols, rows) result as a rows x cols matrix with leading
        # dimension ``rows``.  The previous call used ``rows`` everywhere,
        # which is only correct for square input.
        cublas.cublasDgeam(handle, 't', 'n', rows, cols,
                           alpha, a_gpu.gpudata, cols,
                           beta, a_gpu.gpudata, rows,
                           a_trans_gpu.gpudata, rows)
        print(time.time() - start)
        # assert np.allclose(a_trans_gpu.get(), a.T)
    finally:
        cublas.cublasDestroy(handle)

    return a_trans_gpu
コード例 #5
0
ファイル: cuda_alternatives.py プロジェクト: amilsted/evoMPS
def calc_x(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh):
    """Upload the operands to the GPU, run ``calc_x_G`` and return a host copy.

    ``rp1``, ``lm2``, ``Am1``, ``Ap1`` and ``Cm1`` may be ``None`` and are then
    forwarded unchanged.  A temporary cuBLAS handle is created for the call to
    ``calc_x_G`` and is destroyed even if that call raises.

    Returns ``x.get()``, where ``x`` is the value returned by ``calc_x_G``.
    """
    if rp1 is not None:
        rp1 = garr.to_gpu(sp.asarray(rp1))
    if lm2 is not None:
        lm2 = garr.to_gpu(sp.asarray(lm2))

    lm1_s = garr.to_gpu(sp.asarray(lm1_s))
    lm1_si = garr.to_gpu(sp.asarray(lm1_si))

    r_s = garr.to_gpu(sp.asarray(r_s))
    r_si = garr.to_gpu(sp.asarray(r_si))

    A = list(map(garr.to_gpu, A))
    if Am1 is not None:
        Am1 = list(map(garr.to_gpu, Am1))
    if Ap1 is not None:
        Ap1 = list(map(garr.to_gpu, Ap1))

    Vsh = list(map(garr.to_gpu, Vsh))

    if Cm1 is not None:
        # Uploaded with its two leading indices swapped: entry [s][t] holds
        # Cm1[t, s].
        # NOTE(review): t runs over shape[1] but indexes the first axis, so
        # this only lines up when shape[0] == shape[1] -- confirm.
        Cm1 = [[garr.to_gpu(Cm1[t, s]) for t in range(Cm1.shape[1])] for s in range(Cm1.shape[0])]

    # NOTE(review): the branch is taken when either of C / Kp1 is given but
    # then uses both -- presumably callers pass both or neither; confirm.
    if C is not None or Kp1 is not None:
        C = [[garr.to_gpu(C[s, t]) for t in range(C.shape[1])] for s in range(C.shape[0])]
        Kp1 = garr.to_gpu(Kp1)

    handle = cb.cublasCreate()
    try:
        x = calc_x_G(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh, handle=handle)
    finally:
        cb.cublasDestroy(handle)

    return x.get()
コード例 #6
0
ファイル: main.py プロジェクト: dmmiron/fast_gpu_net
def classify(image_names, model_file_name, output_names):
    """
    Classify a set of images using the given model.

    Parameters
    ----------
    image_names : iterable of strings
        names of the input images
    model_file_name : string
        name of the file containing the model
    output_names : iterable of strings
        names of the output images

    Notes
    -----
    image_names and output_names should have the same length and indices match. i.e. image_names[idx] -> output_names[idx]
    This network copies the weights to the gpu once to classify all the images as it should. This can be used as a model
    to make the same change to the fully connected network.
    The cuBLAS handle is created only for the classification loop and is
    destroyed even if classifying or saving an image raises.
    """
    model = serial.load(model_file_name)

    layers = model.layers
    softmax = layers[-1]
    # Every layer except the last is treated as a convolution layer whose
    # parameters are (kernel, bias).
    conv_params = [layer.get_params() for layer in layers[:-1]]

    #This can be simplified
    # Target shapes: the evaluated kernels with axis 0 rolled to position 2.
    kdims = [np.ascontiguousarray(np.rollaxis(np.array(params[0].eval()), 0, 3)).shape
             for params in conv_params]
    # The kernel data itself comes from a (3, 0, 1, 2) dimshuffle, reshaped
    # to the shapes computed above.
    kernels = [params[0].dimshuffle(3, 0, 1, 2).eval().reshape(kdim)
               for params, kdim in zip(conv_params, kdims)]

    biases = [np.array(params[1].eval()) for params in conv_params]
    max_sizes = [layer.pool_shape + [layer.num_pieces] for layer in layers[:-1]]

    soft_weights = softmax.get_params()[1].reshape((3, 3, 32, 2)).dimshuffle(3, 2, 0, 1).eval()
    soft_weights = np.ascontiguousarray(np.reshape(soft_weights, (2, 288)).transpose())
    soft_bias = softmax.get_params()[0].get_value()[::1]

    window = layers[0].input_space.shape

    handle = cublas.cublasCreate()
    try:
        for image_name, output_name in zip(image_names, output_names):
            image = load_image(image_name)
            output = classify_image(image, model, kernels, biases, max_sizes, soft_weights, soft_bias, window, handle)
            # NOTE(review): np.int8 cannot hold values above 127, so scaled
            # outputs above that wrap around -- confirm this dtype is intended.
            save_image(np.int8(np.round(output*255)), output_name)
    finally:
        cublas.cublasDestroy(handle)
コード例 #7
0
def calc_x(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh):
    """Upload the operands to the GPU, run ``calc_x_G`` and return a host copy.

    ``rp1``, ``lm2``, ``Am1``, ``Ap1`` and ``Cm1`` may be ``None`` and are then
    forwarded unchanged.  A temporary cuBLAS handle is created for the call to
    ``calc_x_G`` and is destroyed even if that call raises.

    Returns ``x.get()``, where ``x`` is the value returned by ``calc_x_G``.
    """
    if rp1 is not None:
        rp1 = garr.to_gpu(sp.asarray(rp1))
    if lm2 is not None:
        lm2 = garr.to_gpu(sp.asarray(lm2))

    lm1_s = garr.to_gpu(sp.asarray(lm1_s))
    lm1_si = garr.to_gpu(sp.asarray(lm1_si))

    r_s = garr.to_gpu(sp.asarray(r_s))
    r_si = garr.to_gpu(sp.asarray(r_si))

    A = list(map(garr.to_gpu, A))
    if Am1 is not None:
        Am1 = list(map(garr.to_gpu, Am1))
    if Ap1 is not None:
        Ap1 = list(map(garr.to_gpu, Ap1))

    Vsh = list(map(garr.to_gpu, Vsh))

    if Cm1 is not None:
        # Uploaded with its two leading indices swapped: entry [s][t] holds
        # Cm1[t, s].
        # NOTE(review): t runs over shape[1] but indexes the first axis, so
        # this only lines up when shape[0] == shape[1] -- confirm.
        Cm1 = [[garr.to_gpu(Cm1[t, s]) for t in range(Cm1.shape[1])]
               for s in range(Cm1.shape[0])]

    # NOTE(review): the branch is taken when either of C / Kp1 is given but
    # then uses both -- presumably callers pass both or neither; confirm.
    if C is not None or Kp1 is not None:
        C = [[garr.to_gpu(C[s, t]) for t in range(C.shape[1])]
             for s in range(C.shape[0])]
        Kp1 = garr.to_gpu(Kp1)

    handle = cb.cublasCreate()
    try:
        x = calc_x_G(Kp1,
                     C,
                     Cm1,
                     rp1,
                     lm2,
                     Am1,
                     A,
                     Ap1,
                     lm1_s,
                     lm1_si,
                     r_s,
                     r_si,
                     Vsh,
                     handle=handle)
    finally:
        cb.cublasDestroy(handle)

    return x.get()
コード例 #8
0
ファイル: main.py プロジェクト: dmmiron/fast_gpu_net
def main():
    """
    For testing and timing. 
    """
    handle = cublas.cublasCreate() 
    image = np.float32((np.random.rand(1024, 1024) - .5) * 2)
    model = serial.load(model_file_name)
    layers = model.layers
    
    patch_dims = (39, 39)
    #There is a bug that occurs if running with too long a batch_rows_l
    #Most likely a memory allocation issue that is not being reported correctly
    batch_rows_l = [8] 
    batchsizes = map(lambda x: x*(1024-39+1), batch_rows_l)
    pixels = [(x, y) for x in range(1024-39+1) for y in range(1024-39+1)]
    
    #Uncomment to use pylearn2 to classify to check result
    p_output = pylearn2_computation(model, image, patch_dims, batchsizes[0], pixels)
    p_output = np.transpose(p_output)
    num_trials = 1
    for batchsize, batch_rows in zip(batchsizes, batch_rows_l):
        st = time.time()
        for trial in range(num_trials):
            output = gpu_computation(image, patch_dims, batchsize, batch_rows, layers, pixels, handle)
            output = output.get()
        tot = time.time()-st
        print "Batchsize {0}".format(batchsize)
        print "Total time: {0:.4e} seconds".format(tot)
        print "Time per pixel: {0:.4e} seconds".format(tot/len(pixels*num_trials))
        print "Pixels per second: {0:.4e}".format(len(pixels*num_trials)/tot)
    for end in time_ends:
        end.synchronize()
    sgemm_times = map(lambda start, end: end.time_since(start)/1000, time_starts, time_ends)
    tot_sgemm_time = sum(sgemm_times)
    print "Total sgemm time: {0:.4e} seconds\nTotal gflop: {1:.4e}\nGflops: {2:.4e}".format(tot_sgemm_time, sgemm_gflop, sgemm_gflop/tot_sgemm_time)

    #Uncomment to compare results of gpu and pylearn2 classifications 
    #output = output.reshape(1024-39, 1024-39)
    print output, p_output
    
    print np.allclose(p_output[0], output, rtol=1e-04, atol=1e-07)
    cublas.cublasDestroy(handle)
    
    return 
コード例 #9
0
ファイル: test_sgemm.py プロジェクト: dmmiron/fast_gpu_net
def main():
    """Run the same sgemm workload one matrix at a time and then in batches.

    Expects exactly two command-line arguments, ``narrays`` and ``batchsize``
    (both integers).  The cuBLAS handle is destroyed even if a step raises.
    """
    m = 64; k = 512; n = 400
    #m = 2; k = 3; n = 4;
    # Parse the arguments first so that a bad command line never leaves a
    # cuBLAS handle behind.
    _, narrays, batchsize = sys.argv
    narrays = int(narrays)
    batchsize = int(batchsize)

    handle = cublas.cublasCreate()
    try:
        cols = []; kernels = []; biases = []
        pcols = []; pkernels = []; pbiases = []  #lists to store pointers to gpu arrays
        # Deterministic kernel; the random alternative is
        # np.float32((np.random.rand(m, k) -.5) * 2)
        kernel = np.float32(np.reshape(np.arange(0, m*k, 1), [m, k]))
        for i in range(narrays):
            col = np.float32((np.random.rand(k, n) - .5) * 2)
            #col = np.float32(np.reshape(np.arange(0, k*n, 1), [k, n]))
            bias = np.float32(np.zeros((m, n)))
            col_d = gpu.to_gpu(col)
            kernel_d = gpu.to_gpu(kernel)
            bias_d = gpu.to_gpu(bias)
            cols.append(col_d); kernels.append(kernel_d); biases.append(bias_d)
            pcols.append(col_d.ptr); pkernels.append(kernel_d.ptr); pbiases.append(bias_d.ptr)
        pcols = np.array(pcols); pkernels = np.array(pkernels); pbiases = np.array(pbiases)
        pcols_d = gpu.to_gpu(pcols); pkernels_d = gpu.to_gpu(pkernels); pbiases_d = gpu.to_gpu(pbiases)

        for i in range(narrays):
            compute_sgemm(cols[i], kernels[i], biases[i], 0, handle)
        #zero out arrays for checking results
        #for i in range(narrays):
            #print biases[i]
        #    biases[i] -= biases[i]
        print("\n\n")
        # Number of batches, rounded up; ``//`` keeps the bound an integer on
        # Python 3 as well as Python 2.
        for i in range((narrays+batchsize-1)//batchsize):
            start = i*batchsize
            compute_sgemm_batched(pcols_d[start:start+batchsize], pkernels_d[start:start+batchsize], pbiases_d[start:start+batchsize], m, k, n, 0, handle)
        #for i in range(narrays):
        #    print biases[i]
    finally:
        cublas.cublasDestroy(handle)
コード例 #10
0
ファイル: cublas.py プロジェクト: zky001/nervanagpu
                # Run ng.dot and cublas_dot with the same alpha/beta/repeat
                # settings, each on its own set of operands.
                ng.dot(devA1,
                       devB1,
                       devC1,
                       alpha=alpha,
                       beta=beta,
                       repeat=repeat)

                cublas_dot(devA2,
                           devB2,
                           devC2,
                           alpha=alpha,
                           beta=beta,
                           repeat=repeat)

                # Buffers passed as ``partial`` / ``out`` to the reductions
                # below; partial2 is a 1x1 slice of partial1.
                partial1 = ng.empty((devC1.shape[0], 1), dtype=np.float32)
                partial2 = partial1[0:1, 0:1]

                # Largest absolute difference between the two results, and the
                # mean absolute value of devC2.
                diff = ng.max(abs(devC2 - devC1),
                              partial=partial1,
                              out=partial2).get()[0, 0]
                mean = ng.mean(abs(devC2), partial=partial1,
                               out=partial2).get()[0, 0]

                # Report the largest difference as a percentage of that mean.
                #if diff > .1:
                print "Error: %.3f%%" % (100 * diff / mean)

                print "--------------------------------------------------------------------------------"

# Release the cuBLAS handle once all comparisons have run.
cublas.cublasDestroy(handle)
コード例 #11
0
def calc_Bs(N, A, l, l_s, l_si, r, r_s, r_si, C, K, Vsh):
    """Compute ``B[n]`` for ``n = 1..N`` on the GPU and return host copies.

    All inputs are per-site sequences; ``None`` entries are kept as ``None``
    on the GPU side.  For every site ``n`` whose ``Vsh[n]`` is not ``None``,
    ``calc_x_G`` is evaluated and each ``B[n][s]`` is built from three chained
    ``cla.dot`` products.  Sites are spread over several CUDA streams.

    Returns a list of length ``N + 1``: entry 0 is ``None`` and entry ``n`` is
    either ``None`` or an array shaped like ``A[n]``.

    The cuBLAS handle is destroyed even if a step raises.
    """
    # Each GPU-side list gets a trailing None so that index n + 1 is valid
    # for the last site.
    GA = [None if An is None else [garr.to_gpu(Ans) for Ans in An]
          for An in A]
    GA.append(None)

    Gl = []
    Gl_s = []
    Gl_si = []
    for n in range(len(l)):
        if l[n] is None:
            Gl.append(None)
            Gl_s.append(None)
            Gl_si.append(None)
        else:
            Gl.append(garr.to_gpu(sp.asarray(
                l[n])))  #TODO: Support special types...
            Gl_s.append(garr.to_gpu(sp.asarray(l_s[n])))
            Gl_si.append(garr.to_gpu(sp.asarray(l_si[n])))
    Gl.append(None)
    Gl_s.append(None)
    Gl_si.append(None)

    Gr = []
    Gr_s = []
    Gr_si = []
    for n in range(len(r)):
        if r[n] is None:
            Gr.append(None)
            Gr_s.append(None)
            Gr_si.append(None)
        else:
            Gr.append(garr.to_gpu(sp.asarray(
                r[n])))  #TODO: Support special types...
            Gr_s.append(garr.to_gpu(sp.asarray(r_s[n])))
            Gr_si.append(garr.to_gpu(sp.asarray(r_si[n])))
    Gr.append(None)
    Gr_s.append(None)
    Gr_si.append(None)

    GK = [None if Kn is None else garr.to_gpu(sp.asarray(Kn)) for Kn in K]
    GK.append(None)

    GVsh = [None if Vn is None
            else [garr.to_gpu(Vn[s]) for s in range(Vn.shape[0])]
            for Vn in Vsh]

    GC = [None if Cn is None
          else [[garr.to_gpu(Cn[s, t]) for t in range(Cn.shape[1])]
                for s in range(Cn.shape[0])]
          for Cn in C]
    GC.append(None)

    # Same GPU arrays as GC with the two leading indices swapped.
    # NOTE(review): the index ranges only line up when both leading
    # dimensions are equal -- confirm.
    GCts = [None if GCn is None
            else [[GCn[s][t] for s in range(len(GCn[0]))]
                  for t in range(len(GCn))]
            for GCn in GC]

    num_strms = 10
    sites_per_strm = max((N) // num_strms, 1)
    #print "sites_per_stream = ", sites_per_strm

    hdl = cb.cublasCreate()
    try:
        curr_stream = cb.cublasGetStream(hdl)

        # Round up: site n uses stream (n - 1) // sites_per_strm, so a final
        # partial group of sites needs a stream of its own.  Rounding down
        # raised IndexError whenever N was not a multiple of sites_per_strm
        # (e.g. N = 21).
        strms = [cd.Stream()
                 for _ in range((N + sites_per_strm - 1) // sites_per_strm)]

        GB = [None]
        for n in range(1, N + 1):
            if (n - 1) % sites_per_strm == 0:
                #print n
                #print "strm = ", (n - 1) // sites_per_strm
                cb.cublasSetStream(hdl, strms[(n - 1) // sites_per_strm].handle)
            if Vsh[n] is not None:
                if n > 1:
                    Glm2 = Gl[n - 2]
                else:
                    Glm2 = None

                Gx = calc_x_G(GK[n + 1],
                              GC[n],
                              GCts[n - 1],
                              Gr[n + 1],
                              Glm2,
                              GA[n - 1],
                              GA[n],
                              GA[n + 1],
                              Gl_s[n - 1],
                              Gl_si[n - 1],
                              Gr_s[n],
                              Gr_si[n],
                              GVsh[n],
                              handle=hdl)
                GBn = []
                for s in range(A[n].shape[0]):
                    GBns = cla.dot(Gl_si[n - 1], Gx, handle=hdl)
                    GBns = cla.dot(GBns, GVsh[n][s], transb='C', handle=hdl)
                    GBns = cla.dot(GBns, Gr_si[n], handle=hdl)
                    GBn.append(GBns)
                GB.append(GBn)
            else:
                GB.append(None)

        # Put the handle back on the stream it started with.
        cb.cublasSetStream(hdl, curr_stream)
    finally:
        cb.cublasDestroy(hdl)

    # Copy the results back to the host, one array per site.
    B = [None]
    for n in range(1, N + 1):
        if GB[n] is None:
            B.append(None)
        else:
            Bn = sp.empty_like(A[n])
            for s in range(A[n].shape[0]):
                Bn[s] = GB[n][s].get()
            B.append(Bn)

    return B
コード例 #12
0
 def close_cuda(self):
     """Destroy the cuBLAS handle kept in ``self.hdl``, if there is one.

     ``self.hdl`` is ``None`` afterwards, so a repeated call does nothing.
     """
     if self.hdl is None:
         return
     cb.cublasDestroy(self.hdl)
     self.hdl = None
コード例 #13
0
ファイル: cublas2.py プロジェクト: KayneWest/nervanagpu
            # Best of the four measured rates, and which of the four values
            # (16 / 32 / 64 / 128) achieved it.
            glops = max(glops16, glops32, glops64, glops128)

            if glops16 == glops:
                fastest = 16
            elif glops32 == glops:
                fastest = 32
            elif glops64 == glops:
                fastest = 64
            else:
                fastest = 128

            # Value returned by cublas_dot, used as the reference rate below.
            glopsref = cublas_dot(devA2, devB2, devC2, repeat=repeat)

            # Buffers passed as ``partial`` / ``out`` to the reductions below;
            # partial2 is a 1x1 slice of partial1.
            partial1 = ng.empty((devC1.shape[0],1), dtype=np.float32)
            partial2 = partial1[0:1,0:1]

            # Largest absolute difference between the two results, and the
            # mean absolute value of devC2.
            diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0,0]
            mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0,0]

            flops_diff = glops - glopsref

            # Rows where the best rate does not beat the reference get a
            # marker of asterisks.
            note = "**************" if flops_diff <= 0 else ""
            
            print "Faster: %.0f gflops Choice: %d Error: %.3f%%%s" % (flops_diff, fastest, 100 * diff / mean, note)

        print "--------------------------------------------------------------------------------"


# Release the cuBLAS handle once all comparisons have run.
cublas.cublasDestroy(handle)
コード例 #14
0
ファイル: main.py プロジェクト: dmmiron/fast_gpu_net
def gpu_computation(image, kernels, biases, max_sizes, soft_weights, soft_bias, batches, window_sizes):
    """Run every batch through the layers on the GPU and return the scores.

    All per-layer GPU buffers (im2col columns, kernels, biases, sgemm scratch
    space and max-pool outputs) are allocated once and reused for every
    batch.  For each batch the layers are applied in order (im2col, batched
    sgemm, max-pool), followed by an sgemm with the softmax weights and
    ``soft_max.compute_soft_max``, which writes into ``classes`` at offset
    ``batchn*batchsize``.

    Returns ``classes``, a GPU float32 array with ``len(batches) *
    len(batches[0])`` entries.  The cuBLAS handle is created only for the
    batch loop and is destroyed even if a step raises.
    """
    nbatches = len(batches)
    batchsize = len(batches[0])
    npixels = nbatches*batchsize
    layers = len(kernels)
    pad = 0; stride = 1
    full_image_d = gpu.to_gpu(image)

    image_dims, col_dims, kernel_dims, bias_dims, sgemm_dims, out_dims, ksizes, kchannels_s = compute_dims(image, kernels, biases, max_sizes, batchsize, window_sizes, pad, stride)

    b_result = []
    b_offsets_d = []

    kernels_d = []
    cols = []; col_ps = []
    biases_d = []
    sgemm_biases = []; sgemm_biases_ps = []
    outputs = []

    for layer_n, (bias, kernel, sgemm_dim, im_dim, out_dim, max_ksize, ksize, kchannels) in enumerate(zip(biases, kernels, sgemm_dims, image_dims, out_dims, max_sizes, ksizes, kchannels_s)):
        col = gpu.empty((batchsize, sgemm_dim[1], sgemm_dim[2]), np.float32)
        cols.append(col)
        # device pointers to each batch entry, for the batched sgemm
        col_ps.append([col[idx, :, :].ptr for idx in range(batchsize)])

        #reuse the same kernels for every pixel
        kernel_d = gpu.to_gpu(kernel)
        kernel_d = kernel_d.reshape(kchannels, ksize*ksize*im_dim[2])
        kernels_d.append(kernel_d)

        #contain the actual data of the biases
        bias = bias.reshape(1, bias.shape[2], bias.shape[0]*bias.shape[1])
        batch_bias = np.tile(bias, (batchsize, 1, 1))
        batch_bias_d = gpu.to_gpu(batch_bias)
        biases_d.append(batch_bias_d)

        #scratch space to copy biases to and then write output of sgemm to
        sgemm_bias = gpu.empty(batch_bias.shape, np.float32)
        sgemm_biases.append(sgemm_bias)

        sgemm_biases_ps.append([sgemm_bias[idx, :, :].ptr for idx in range(batchsize)])

        #space for output of maxpool
        output = gpu.empty((batchsize, out_dim[2], out_dim[0], out_dim[1]), np.float32)
        outputs.append(output)

    #space for final output
    classes = gpu.empty(npixels, np.float32)
    soft_weights_d = gpu.to_gpu(soft_weights)
    soft_bias = soft_bias.reshape(1, soft_bias.shape[0])
    soft_bias_d = gpu.to_gpu(np.ascontiguousarray(np.reshape(np.tile(soft_bias, (batchsize, 1)), (2, batchsize))))
    soft_bias_scratch = gpu.empty((soft_bias_d.shape[0], soft_bias_d.shape[1]), np.float32)

    col_ps_d = gpu.to_gpu(np.array(col_ps))

    # every batch entry of a layer points at that layer's single kernel
    kernel_ps = [[kernel_d.ptr]*batchsize for kernel_d in kernels_d]
    kernel_ps_d = gpu.to_gpu(np.array(kernel_ps))

    sgemm_biases_ps_d = gpu.to_gpu(np.array(sgemm_biases_ps))

    for batch in batches:
        offsets = comp_offsets(batch, full_image_d)
        offsets_d = gpu.to_gpu(np.int32(np.array(offsets)))
        b_offsets_d.append(offsets_d)

        #space to hold final result of each layer
        # NOTE(review): these buffers are rebound in the batch loop below
        # before being read -- they look unused; confirm before removing.
        result = gpu.empty((out_dims[layers-1][2], out_dims[layers-1][0], out_dims[layers-1][1]), np.float32)
        b_result.append(result)

    handle = cublas.cublasCreate()
    try:
        for batchn, (batch, offsets_d, result) in enumerate(zip(batches, b_offsets_d, b_result)):

            image_d = full_image_d
            for layer_n, (im_dim, col_dim, kdim, bias_dim, sgemm_dim, out_dim, ksize, kchannels, max_size) in enumerate(zip(image_dims, col_dims, kernel_dims, bias_dims, sgemm_dims, out_dims, ksizes, kchannels_s, max_sizes)):

                # reset the scratch buffer to the biases; the sgemm output is
                # then written into it
                sgemm_bias = sgemm_biases[layer_n]
                cu.memcpy_dtod(sgemm_bias.ptr, biases_d[layer_n].ptr, sgemm_bias.nbytes)

                im2col_gpu.compute_im2col_batched(image_d, im_dim[0], im_dim[1], im_dim[2], np.int32(ksize), np.int32(pad), np.int32(stride), offsets_d, layer_n, batchsize, cols[layer_n])
                compute_sgemm_batched(col_ps_d[layer_n], kernel_ps_d[layer_n], sgemm_biases_ps_d[layer_n], handle, sgemm_dim[0], sgemm_dim[1], sgemm_dim[2])
                sgemm_bias = sgemm_bias.reshape(np.int32(batchsize), np.int32(kchannels), col_dim[0], col_dim[1])
                maxpool_gpu.compute_max_batched(sgemm_bias, outputs[layer_n], np.int32(max_size))
                # the pooled output is the input of the next layer
                image_d = outputs[layer_n]
            result = outputs[layers-1]
            result = result.reshape(result.shape[0], result.shape[1]*result.shape[2]*result.shape[3])
            cu.memcpy_dtod(soft_bias_scratch.ptr, soft_bias_d.ptr, soft_bias_d.nbytes)
            # NOTE(review): these two host copies are never read; they are
            # kept because get() may also act as a synchronization point --
            # confirm before removing.
            np_soft_weights = soft_weights_d.get()
            np_result = result.get()
            compute_sgemm(soft_weights_d, result, soft_bias_scratch, handle)

            offset = batchn*batchsize
            soft_max_in = soft_bias_scratch
            soft_max.compute_soft_max(soft_max_in, classes, offset)
    finally:
        cublas.cublasDestroy(handle)
    return classes
コード例 #15
0
ファイル: matrix.py プロジェクト: Captricity/sciguppy
def destroy_cublas():
    """Destroy the cuBLAS handle bound to the module-level name ``handle``."""
    cublas.cublasDestroy(handle)
コード例 #16
0
ファイル: linalg.py プロジェクト: bionet/vtem
 def destroy(self):
     """Destroy the cuBLAS handle kept in ``self.handle``, if there is one.

     ``self.handle`` is reset to ``None`` afterwards so that a repeated call
     does not destroy the same handle twice.
     """
     if self.handle is not None:
         cublas.cublasDestroy(self.handle)
         self.handle = None
コード例 #17
0
 def tearDown(self):
     """Destroy the cuBLAS handle stored in ``self.cublas_handle``."""
     cublas.cublasDestroy(self.cublas_handle)
コード例 #18
0
ファイル: cuda_alternatives.py プロジェクト: amilsted/evoMPS
 def close_cuda(self):
     """Destroy the cuBLAS handle kept in ``self.hdl``, if there is one.

     ``self.hdl`` is ``None`` afterwards, so a repeated call does nothing.
     """
     if self.hdl is None:
         return
     cb.cublasDestroy(self.hdl)
     self.hdl = None
コード例 #19
0
ファイル: cuda_alternatives.py プロジェクト: amilsted/evoMPS
def calc_Bs(N, A, l, l_s, l_si, r, r_s, r_si, C, K, Vsh):
    """Compute ``B[n]`` for ``n = 1..N`` on the GPU and return host copies.

    All inputs are per-site sequences; ``None`` entries are kept as ``None``
    on the GPU side.  For every site ``n`` whose ``Vsh[n]`` is not ``None``,
    ``calc_x_G`` is evaluated and each ``B[n][s]`` is built from three chained
    ``cla.dot`` products.  Sites are spread over several CUDA streams.

    Returns a list of length ``N + 1``: entry 0 is ``None`` and entry ``n`` is
    either ``None`` or an array shaped like ``A[n]``.

    The cuBLAS handle is destroyed even if a step raises.
    """
    # Each GPU-side list gets a trailing None so that index n + 1 is valid
    # for the last site.
    GA = [None if An is None else [garr.to_gpu(Ans) for Ans in An]
          for An in A]
    GA.append(None)

    Gl = []
    Gl_s = []
    Gl_si = []
    for n in range(len(l)):
        if l[n] is None:
            Gl.append(None)
            Gl_s.append(None)
            Gl_si.append(None)
        else:
            Gl.append(garr.to_gpu(sp.asarray(l[n]))) #TODO: Support special types...
            Gl_s.append(garr.to_gpu(sp.asarray(l_s[n])))
            Gl_si.append(garr.to_gpu(sp.asarray(l_si[n])))
    Gl.append(None)
    Gl_s.append(None)
    Gl_si.append(None)

    Gr = []
    Gr_s = []
    Gr_si = []
    for n in range(len(r)):
        if r[n] is None:
            Gr.append(None)
            Gr_s.append(None)
            Gr_si.append(None)
        else:
            Gr.append(garr.to_gpu(sp.asarray(r[n]))) #TODO: Support special types...
            Gr_s.append(garr.to_gpu(sp.asarray(r_s[n])))
            Gr_si.append(garr.to_gpu(sp.asarray(r_si[n])))
    Gr.append(None)
    Gr_s.append(None)
    Gr_si.append(None)

    GK = [None if Kn is None else garr.to_gpu(sp.asarray(Kn)) for Kn in K]
    GK.append(None)

    GVsh = [None if Vn is None
            else [garr.to_gpu(Vn[s]) for s in range(Vn.shape[0])]
            for Vn in Vsh]

    GC = [None if Cn is None
          else [[garr.to_gpu(Cn[s, t]) for t in range(Cn.shape[1])]
                for s in range(Cn.shape[0])]
          for Cn in C]
    GC.append(None)

    # Same GPU arrays as GC with the two leading indices swapped.
    # NOTE(review): the index ranges only line up when both leading
    # dimensions are equal -- confirm.
    GCts = [None if GCn is None
            else [[GCn[s][t] for s in range(len(GCn[0]))]
                  for t in range(len(GCn))]
            for GCn in GC]

    num_strms = 10
    sites_per_strm = max((N) // num_strms, 1)
    #print "sites_per_stream = ", sites_per_strm

    hdl = cb.cublasCreate()
    try:
        curr_stream = cb.cublasGetStream(hdl)

        # Round up: site n uses stream (n - 1) // sites_per_strm, so a final
        # partial group of sites needs a stream of its own.  Rounding down
        # raised IndexError whenever N was not a multiple of sites_per_strm
        # (e.g. N = 21).
        strms = [cd.Stream()
                 for _ in range((N + sites_per_strm - 1) // sites_per_strm)]

        GB = [None]
        for n in range(1, N + 1):
            if (n - 1) % sites_per_strm == 0:
                #print n
                #print "strm = ", (n - 1) // sites_per_strm
                cb.cublasSetStream(hdl, strms[(n - 1) // sites_per_strm].handle)
            if Vsh[n] is not None:
                if n > 1:
                    Glm2 = Gl[n - 2]
                else:
                    Glm2 = None

                Gx = calc_x_G(GK[n + 1], GC[n], GCts[n - 1], Gr[n + 1], Glm2, GA[n - 1], GA[n],
                              GA[n + 1], Gl_s[n - 1], Gl_si[n - 1], Gr_s[n], Gr_si[n], GVsh[n], handle=hdl)
                GBn = []
                for s in range(A[n].shape[0]):
                    GBns = cla.dot(Gl_si[n - 1], Gx, handle=hdl)
                    GBns = cla.dot(GBns, GVsh[n][s], transb='C', handle=hdl)
                    GBns = cla.dot(GBns, Gr_si[n], handle=hdl)
                    GBn.append(GBns)
                GB.append(GBn)
            else:
                GB.append(None)

        # Put the handle back on the stream it started with.
        cb.cublasSetStream(hdl, curr_stream)
    finally:
        cb.cublasDestroy(hdl)

    # Copy the results back to the host, one array per site.
    B = [None]
    for n in range(1, N + 1):
        if GB[n] is None:
            B.append(None)
        else:
            Bn = sp.empty_like(A[n])
            for s in range(A[n].shape[0]):
                Bn[s] = GB[n][s].get()
            B.append(Bn)

    return B
コード例 #20
0
ファイル: linalg.py プロジェクト: bionet/vtem
 def destroy(self):
     """Destroy the cuBLAS handle kept in ``self.handle``, if there is one.

     ``self.handle`` is reset to ``None`` afterwards so that a repeated call
     does not destroy the same handle twice.
     """
     if self.handle is not None:
         cublas.cublasDestroy(self.handle)
         self.handle = None
コード例 #21
0
 def tearDown(self):
     """Destroy the cuBLAS handle stored in ``self.cublas_handle``."""
     cublas.cublasDestroy(self.cublas_handle)