def shutdown(): """Finalizes CUDA global state. This function is automatically called by :mod:`atexit`. Multiple calls are allowed, so user can manually call this function if necessary. """ global _contexts, _cublas_handles, _pid, _pools pid = os.getpid() if _pid != pid: # not initialized return for cublas_handle in six.itervalues(_cublas_handles): cublas.cublasDestroy(cublas_handle) _cublas_handles = {} cumisc.shutdown() _pools = {} for ctx in six.itervalues(_contexts): ctx.detach() _contexts = {} _pid = None # mark as uninitialized
def classify(image_names, model_file_name, output_names):
    """
    Classify a set of images using the given model.

    Parameters
    ----------
    image_names : iterable of strings
        Names of the input images.
    model_file_name : string
        Name of the file containing the model.
    output_names : iterable of strings
        Names of the output images.

    Notes
    -----
    image_names and output_names should have the same length and their indices
    should match, i.e. image_names[idx] -> output_names[idx].
    """
    handle = cublas.cublasCreate()
    model = serial.load(model_file_name)
    outputs = []
    for image_name, output_name in zip(image_names, output_names):
        image = load_image(image_name)
        output = classify_image(image, model, handle)
        save_image(np.int32(np.round(output * 255)), output_name)
    cublas.cublasDestroy(handle)
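# A hedged usage sketch for classify(); the file names below are hypothetical,
# and load_image/classify_image/save_image are helpers assumed to exist
# elsewhere in the same module:
input_names = ["img_000.png", "img_001.png"]
output_names = ["img_000_labels.png", "img_001_labels.png"]  # same length, matching indices
classify(input_names, "model.pkl", output_names)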
def shutdown(): """Finalizes CUDA global state. This function is automatically called by :mod:`atexit`. Multiple calls are allowed, so user can manually call this function if necessary. """ global _contexts, _cublas_handles, _pid, _pools pid = os.getpid() if _pid != pid: # not initialized return for cublas_handle in _cublas_handles.itervalues(): cublas.cublasDestroy(cublas_handle) _cublas_handles = {} cumisc.shutdown() _pools = {} for ctx in _contexts.itervalues(): ctx.detach() _contexts = {} _pid = None # mark as uninitialized
def transpose(a):
    '''
    https://github.com/lebedov/scikit-cuda/issues/33
    pip install --upgrade --no-deps git+https://github.com/lebedov/scikits.cuda.git
    :return: the transpose of `a` as a GPUArray
    '''
    import time
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import scikits.cuda.cublas as cublas

    handle = cublas.cublasCreate()

    # N = 1000
    # a = np.random.rand(N, N)
    R = a.shape[0]
    C = a.shape[1]
    a_gpu = gpuarray.to_gpu(a)
    a_trans_gpu = gpuarray.zeros((C, R), dtype=np.double)

    alpha = 1.0
    beta = 0.0

    start = time.time()
    # cuBLAS is column-major, so the row-major (R, C) input is seen as a (C, R)
    # matrix with leading dimension C; geam with transa='t' writes its
    # transpose into the (R, C) column-major view of a_trans_gpu (beta=0, so
    # the B operand is never read).
    cublas.cublasDgeam(handle, 't', 'n', R, C,
                       alpha, a_gpu.gpudata, C,
                       beta, a_trans_gpu.gpudata, R,
                       a_trans_gpu.gpudata, R)
    print time.time() - start
    # assert np.allclose(a_trans_gpu.get(), a.T)

    cublas.cublasDestroy(handle)
    return a_trans_gpu
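# A small usage sketch based on the commented-out check inside transpose(),
# assuming a non-square float64 input (the array size is illustrative only):
import numpy as np

a = np.random.rand(300, 200)
a_trans_gpu = transpose(a)
assert a_trans_gpu.shape == (200, 300)
assert np.allclose(a_trans_gpu.get(), a.T)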
def calc_x(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh):
    handle = cb.cublasCreate()

    if rp1 is not None:
        rp1 = garr.to_gpu(sp.asarray(rp1))
    if lm2 is not None:
        lm2 = garr.to_gpu(sp.asarray(lm2))

    lm1_s = garr.to_gpu(sp.asarray(lm1_s))
    lm1_si = garr.to_gpu(sp.asarray(lm1_si))
    r_s = garr.to_gpu(sp.asarray(r_s))
    r_si = garr.to_gpu(sp.asarray(r_si))

    A = list(map(garr.to_gpu, A))
    if Am1 is not None:
        Am1 = list(map(garr.to_gpu, Am1))
    if Ap1 is not None:
        Ap1 = list(map(garr.to_gpu, Ap1))

    Vsh = list(map(garr.to_gpu, Vsh))

    if Cm1 is not None:
        Cm1 = [[garr.to_gpu(Cm1[t, s]) for t in range(Cm1.shape[1])]
               for s in range(Cm1.shape[0])]

    if not (C is None and Kp1 is None):
        C = [[garr.to_gpu(C[s, t]) for t in range(C.shape[1])]
             for s in range(C.shape[0])]
        Kp1 = garr.to_gpu(Kp1)

    x = calc_x_G(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si,
                 Vsh, handle=handle)

    cb.cublasDestroy(handle)

    return x.get()
def classify(image_names, model_file_name, output_names):
    """
    Classify a set of images using the given model.

    Parameters
    ----------
    image_names : iterable of strings
        Names of the input images.
    model_file_name : string
        Name of the file containing the model.
    output_names : iterable of strings
        Names of the output images.

    Notes
    -----
    image_names and output_names should have the same length and their indices
    should match, i.e. image_names[idx] -> output_names[idx].

    This network copies the weights to the GPU once and reuses them to
    classify all of the images, as it should. This can be used as a model for
    making the same change to the fully connected network.
    """
    handle = cublas.cublasCreate()
    model = serial.load(model_file_name)
    layers = model.layers
    convs = layers[:-1]
    softmax = layers[-1]

    convs = map(lambda layer: layer.get_params(), convs)
    kernels = map(lambda layer: np.array(layer[0].eval()), convs)

    # This can be simplified
    kernels = map(lambda kernel: np.ascontiguousarray(np.rollaxis(kernel, 0, 3)), kernels)
    kdims = map(lambda kernel: kernel.shape, kernels)
    kernels = map(lambda layer: layer[0].dimshuffle(3, 0, 1, 2).eval(), convs)
    kernels = map(lambda kernel, kdim: kernel.reshape(kdim), kernels, kdims)

    biases = map(lambda layer: np.array(layer[1].eval()), convs)
    bias_dims = map(lambda bias: bias.shape, biases)

    max_sizes = map(lambda layer: layer.pool_shape + [layer.num_pieces], layers[:-1])

    weights = softmax.get_params()[1]
    bias = softmax.get_params()[0]
    soft_weights = softmax.get_params()[1].reshape((3, 3, 32, 2)).dimshuffle(3, 2, 0, 1).eval()
    soft_weights = np.ascontiguousarray(np.reshape(soft_weights, (2, 288)).transpose())
    soft_bias = softmax.get_params()[0].get_value()[::1]

    window = layers[0].input_space.shape

    outputs = []
    for image_name, output_name in zip(image_names, output_names):
        image = load_image(image_name)
        output = classify_image(image, model, kernels, biases, max_sizes,
                                soft_weights, soft_bias, window, handle)
        save_image(np.int8(np.round(output * 255)), output_name)

    cublas.cublasDestroy(handle)
def main():
    """
    For testing and timing.
    """
    handle = cublas.cublasCreate()
    image = np.float32((np.random.rand(1024, 1024) - .5) * 2)
    model = serial.load(model_file_name)
    layers = model.layers
    patch_dims = (39, 39)

    # There is a bug that occurs if running with too long a batch_rows_l.
    # Most likely a memory allocation issue that is not being reported correctly.
    batch_rows_l = [8]
    batchsizes = map(lambda x: x * (1024 - 39 + 1), batch_rows_l)
    pixels = [(x, y) for x in range(1024 - 39 + 1) for y in range(1024 - 39 + 1)]

    # Uncomment to use pylearn2 to classify to check the result
    p_output = pylearn2_computation(model, image, patch_dims, batchsizes[0], pixels)
    p_output = np.transpose(p_output)

    num_trials = 1
    for batchsize, batch_rows in zip(batchsizes, batch_rows_l):
        st = time.time()
        for trial in range(num_trials):
            output = gpu_computation(image, patch_dims, batchsize, batch_rows, layers, pixels, handle)
        output = output.get()
        tot = time.time() - st

        print "Batchsize {0}".format(batchsize)
        print "Total time: {0:.4e} seconds".format(tot)
        print "Time per pixel: {0:.4e} seconds".format(tot / len(pixels * num_trials))
        print "Pixels per second: {0:.4e}".format(len(pixels * num_trials) / tot)

        for end in time_ends:
            end.synchronize()
        sgemm_times = map(lambda start, end: end.time_since(start) / 1000, time_starts, time_ends)
        tot_sgemm_time = sum(sgemm_times)
        print "Total sgemm time: {0:.4e} seconds\nTotal gflop: {1:.4e}\nGflops: {2:.4e}".format(tot_sgemm_time, sgemm_gflop, sgemm_gflop / tot_sgemm_time)

    # Uncomment to compare results of gpu and pylearn2 classifications
    # output = output.reshape(1024-39, 1024-39)
    print output, p_output
    print np.allclose(p_output[0], output, rtol=1e-04, atol=1e-07)

    cublas.cublasDestroy(handle)
    return
def main():
    m = 64; k = 512; n = 400
    #m = 2; k = 3; n = 4
    handle = cublas.cublasCreate()

    _, narrays, batchsize = sys.argv
    narrays = int(narrays)
    batchsize = int(batchsize)

    cols = []; kernels = []; biases = []
    # lists to store pointers to gpu arrays
    pcols = []; pkernels = []; pbiases = []

    kernel = np.float32((np.random.rand(m, k) - .5) * 2)
    kernel = np.float32(np.reshape(np.arange(0, m * k, 1), [m, k]))

    for i in range(narrays):
        col = np.float32((np.random.rand(k, n) - .5) * 2)
        #col = np.float32(np.reshape(np.arange(0, k*n, 1), [k, n]))
        bias = np.float32(np.zeros((m, n)))

        col_d = gpu.to_gpu(col)
        kernel_d = gpu.to_gpu(kernel)
        bias_d = gpu.to_gpu(bias)

        cols.append(col_d); kernels.append(kernel_d); biases.append(bias_d)
        pcols.append(col_d.ptr); pkernels.append(kernel_d.ptr); pbiases.append(bias_d.ptr)

    pcols = np.array(pcols); pkernels = np.array(pkernels); pbiases = np.array(pbiases)
    pcols_d = gpu.to_gpu(pcols); pkernels_d = gpu.to_gpu(pkernels); pbiases_d = gpu.to_gpu(pbiases)

    for i in range(narrays):
        compute_sgemm(cols[i], kernels[i], biases[i], 0, handle)

    # zero out arrays for checking results
    #for i in range(narrays):
    #    print biases[i]
    #    biases[i] -= biases[i]

    print "\n\n"

    for i in range((narrays + batchsize - 1) / batchsize):
        start = i * batchsize
        compute_sgemm_batched(pcols_d[start:start + batchsize],
                              pkernels_d[start:start + batchsize],
                              pbiases_d[start:start + batchsize],
                              m, k, n, 0, handle)

    #for i in range(narrays):
    #    print biases[i]

    cublas.cublasDestroy(handle)
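# compute_sgemm_batched() is not part of this snippet. A minimal sketch of what
# it might look like on top of scikit-cuda's cublasSgemmBatched, assuming each
# batch entry accumulates bias (m x n) += kernel (m x k) . col (k x n); the
# row-major arrays are handled by swapping the operands so cuBLAS's
# column-major view computes bias^T = col^T . kernel^T. The real helper's
# signature may well differ.
import skcuda.cublas as cublas  # older installs: scikits.cuda.cublas


def compute_sgemm_batched(col_ptrs_d, kernel_ptrs_d, bias_ptrs_d, m, k, n, stream, handle):
    # col_ptrs_d, kernel_ptrs_d, bias_ptrs_d: GPUArrays of device pointers,
    # one pointer per matrix in the batch (as built in main() above).
    if stream:
        # `stream` is assumed to be a raw CUDA stream handle; main() above
        # passes 0, i.e. the default stream.
        cublas.cublasSetStream(handle, stream)
    batch = col_ptrs_d.size
    cublas.cublasSgemmBatched(handle, 'n', 'n', n, m, k,
                              1.0, col_ptrs_d.gpudata, n,
                              kernel_ptrs_d.gpudata, k,
                              1.0, bias_ptrs_d.gpudata, n, batch)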
ng.dot(devA1, devB1, devC1, alpha=alpha, beta=beta, repeat=repeat)
cublas_dot(devA2, devB2, devC2, alpha=alpha, beta=beta, repeat=repeat)

partial1 = ng.empty((devC1.shape[0], 1), dtype=np.float32)
partial2 = partial1[0:1, 0:1]

diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0, 0]
mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0, 0]

#if diff > .1:
print "Error: %.3f%%" % (100 * diff / mean)
print "--------------------------------------------------------------------------------"

cublas.cublasDestroy(handle)
def calc_Bs(N, A, l, l_s, l_si, r, r_s, r_si, C, K, Vsh):
    GA = []
    for An in A:
        if An is None:
            GA.append(None)
        else:
            GAn = []
            for Ans in An:
                GAn.append(garr.to_gpu(Ans))
            GA.append(GAn)
    GA.append(None)

    Gl = []
    Gl_s = []
    Gl_si = []
    for n in range(len(l)):
        if l[n] is None:
            Gl.append(None)
            Gl_s.append(None)
            Gl_si.append(None)
        else:
            Gl.append(garr.to_gpu(sp.asarray(l[n])))  #TODO: Support special types...
            Gl_s.append(garr.to_gpu(sp.asarray(l_s[n])))
            Gl_si.append(garr.to_gpu(sp.asarray(l_si[n])))
    Gl.append(None)
    Gl_s.append(None)
    Gl_si.append(None)

    Gr = []
    Gr_s = []
    Gr_si = []
    for n in range(len(r)):
        if r[n] is None:
            Gr.append(None)
            Gr_s.append(None)
            Gr_si.append(None)
        else:
            Gr.append(garr.to_gpu(sp.asarray(r[n])))  #TODO: Support special types...
            Gr_s.append(garr.to_gpu(sp.asarray(r_s[n])))
            Gr_si.append(garr.to_gpu(sp.asarray(r_si[n])))
    Gr.append(None)
    Gr_s.append(None)
    Gr_si.append(None)

    GK = []
    for n in range(len(K)):
        if K[n] is None:
            GK.append(None)
        else:
            GK.append(garr.to_gpu(sp.asarray(K[n])))
    GK.append(None)

    GVsh = []
    for n in range(len(Vsh)):
        if Vsh[n] is None:
            GVsh.append(None)
        else:
            GVshn = []
            for s in range(Vsh[n].shape[0]):
                GVshn.append(garr.to_gpu(Vsh[n][s]))
            GVsh.append(GVshn)

    GC = []
    for n in range(len(C)):
        if C[n] is None:
            GC.append(None)
        else:
            GCn = []
            for s in range(C[n].shape[0]):
                GCns = []
                for t in range(C[n].shape[1]):
                    GCns.append(garr.to_gpu(C[n][s, t]))
                GCn.append(GCns)
            GC.append(GCn)
    GC.append(None)

    GCts = []
    for n in range(len(GC)):
        if GC[n] is None:
            GCts.append(None)
        else:
            GCtsn = []
            for t in range(len(GC[n])):
                GCtsns = []
                for s in range(len(GC[n][0])):
                    GCtsns.append(GC[n][s][t])
                GCtsn.append(GCtsns)
            GCts.append(GCtsn)

    hdl = cb.cublasCreate()

    num_strms = 10
    curr_stream = cb.cublasGetStream(hdl)
    sites_per_strm = max(N // num_strms, 1)
    #print "sites_per_stream = ", sites_per_strm

    strms = []
    for i in range(N // sites_per_strm):
        strms.append(cd.Stream())

    GB = [None]
    for n in range(1, N + 1):
        if (n - 1) % sites_per_strm == 0:
            #print n
            #print "strm = ", (n - 1) // sites_per_strm
            cb.cublasSetStream(hdl, strms[(n - 1) // sites_per_strm].handle)
        if Vsh[n] is not None:
            if n > 1:
                Glm2 = Gl[n - 2]
            else:
                Glm2 = None

            Gx = calc_x_G(GK[n + 1], GC[n], GCts[n - 1], Gr[n + 1], Glm2,
                          GA[n - 1], GA[n], GA[n + 1], Gl_s[n - 1], Gl_si[n - 1],
                          Gr_s[n], Gr_si[n], GVsh[n], handle=hdl)

            GBn = []
            for s in range(A[n].shape[0]):
                GBns = cla.dot(Gl_si[n - 1], Gx, handle=hdl)
                GBns = cla.dot(GBns, GVsh[n][s], transb='C', handle=hdl)
                GBns = cla.dot(GBns, Gr_si[n], handle=hdl)
                GBn.append(GBns)
            GB.append(GBn)
        else:
            GB.append(None)

    cb.cublasSetStream(hdl, curr_stream)
    cb.cublasDestroy(hdl)

    B = [None]
    for n in range(1, N + 1):
        if GB[n] is None:
            B.append(None)
        else:
            Bn = sp.empty_like(A[n])
            for s in range(A[n].shape[0]):
                Bn[s] = GB[n][s].get()
            B.append(Bn)

    return B
def close_cuda(self):
    if self.hdl is not None:
        cb.cublasDestroy(self.hdl)
        self.hdl = None
glops = max(glops16, glops32, glops64, glops128)
if glops16 == glops:
    fastest = 16
elif glops32 == glops:
    fastest = 32
elif glops64 == glops:
    fastest = 64
else:
    fastest = 128

glopsref = cublas_dot(devA2, devB2, devC2, repeat=repeat)

partial1 = ng.empty((devC1.shape[0], 1), dtype=np.float32)
partial2 = partial1[0:1, 0:1]

diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0, 0]
mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0, 0]

flops_diff = glops - glopsref
note = "**************" if flops_diff <= 0 else ""

print "Faster: %.0f gflops Choice: %d Error: %.3f%%%s" % (flops_diff, fastest, 100 * diff / mean, note)
print "--------------------------------------------------------------------------------"

cublas.cublasDestroy(handle)
def gpu_computation(image, kernels, biases, max_sizes, soft_weights, soft_bias, batches, window_sizes):
    nbatches = len(batches)
    batchsize = len(batches[0])
    npixels = nbatches * batchsize
    layers = len(kernels)

    handle = cublas.cublasCreate()

    results = []
    result_ps = []
    pad = 0
    stride = 1

    full_image_d = gpu.to_gpu(image)

    image_dims, col_dims, kernel_dims, bias_dims, sgemm_dims, out_dims, ksizes, kchannels_s = \
        compute_dims(image, kernels, biases, max_sizes, batchsize, window_sizes, pad, stride)

    b_result = []
    b_offsets_d = []
    kernels_d = []
    cols = []
    col_ps = []
    biases_d = []
    sgemm_biases = []
    sgemm_biases_ps = []
    outputs = []

    for layer_n, (bias, kernel, sgemm_dim, im_dim, out_dim, max_ksize, ksize, kchannels) in \
            enumerate(zip(biases, kernels, sgemm_dims, image_dims, out_dims, max_sizes, ksizes, kchannels_s)):
        col = gpu.empty((batchsize, sgemm_dim[1], sgemm_dim[2]), np.float32)
        cols.append(col)
        col_ps.append([col[idx, :, :].ptr for idx in range(batchsize)])

        # reuse the same kernels for every pixel
        kernel_d = gpu.to_gpu(kernel)
        kernel_d = kernel_d.reshape(kchannels, ksize * ksize * im_dim[2])
        kernels_d.append(kernel_d)

        # contain the actual data of the biases
        bias = bias.reshape(1, bias.shape[2], bias.shape[0] * bias.shape[1])
        batch_bias = np.tile(bias, (batchsize, 1, 1))
        batch_bias_d = gpu.to_gpu(batch_bias)
        biases_d.append(batch_bias_d)

        # scratch space to copy biases to and then write output of sgemm to
        sgemm_bias = gpu.empty(batch_bias.shape, np.float32)
        sgemm_biases.append(sgemm_bias)
        sgemm_biases_ps.append([sgemm_bias[idx, :, :].ptr for idx in range(batchsize)])

        # space for output of maxpool
        output = gpu.empty((batchsize, out_dim[2], out_dim[0], out_dim[1]), np.float32)
        outputs.append(output)

    # space for final output
    classes = gpu.empty(npixels, np.float32)

    soft_weights_d = gpu.to_gpu(soft_weights)
    soft_bias = soft_bias.reshape(1, soft_bias.shape[0])
    soft_bias_d = gpu.to_gpu(np.ascontiguousarray(np.reshape(np.tile(soft_bias, (batchsize, 1)), (2, batchsize))))
    soft_bias_scratch = gpu.empty((soft_bias_d.shape[0], soft_bias_d.shape[1]), np.float32)

    col_ps_d = gpu.to_gpu(np.array(col_ps))
    kernel_ps = map(lambda x: [x.ptr] * batchsize, kernels_d)
    kernel_ps_d = gpu.to_gpu(np.array(kernel_ps))
    sgemm_biases_ps_d = gpu.to_gpu(np.array(sgemm_biases_ps))

    for batch in batches:
        offsets = comp_offsets(batch, full_image_d)
        offsets_d = gpu.to_gpu(np.int32(np.array(offsets)))
        b_offsets_d.append(offsets_d)

        # space to hold final result of each layer
        result = gpu.empty((out_dims[layers - 1][2], out_dims[layers - 1][0], out_dims[layers - 1][1]), np.float32)
        b_result.append(result)

    for batchn, (batch, offsets_d, result) in enumerate(zip(batches, b_offsets_d, b_result)):
        image_d = full_image_d
        for layer_n, (im_dim, col_dim, kdim, bias_dim, sgemm_dim, out_dim, ksize, kchannels, max_size) in \
                enumerate(zip(image_dims, col_dims, kernel_dims, bias_dims, sgemm_dims, out_dims, ksizes, kchannels_s, max_sizes)):
            sgemm_bias = sgemm_biases[layer_n]
            cu.memcpy_dtod(sgemm_bias.ptr, biases_d[layer_n].ptr, sgemm_bias.nbytes)

            im2col_gpu.compute_im2col_batched(image_d, im_dim[0], im_dim[1], im_dim[2],
                                              np.int32(ksize), np.int32(pad), np.int32(stride),
                                              offsets_d, layer_n, batchsize, cols[layer_n])
            compute_sgemm_batched(col_ps_d[layer_n], kernel_ps_d[layer_n], sgemm_biases_ps_d[layer_n],
                                  handle, sgemm_dim[0], sgemm_dim[1], sgemm_dim[2])
            sgemm_bias = sgemm_bias.reshape(np.int32(batchsize), np.int32(kchannels), col_dim[0], col_dim[1])
            maxpool_gpu.compute_max_batched(sgemm_bias, outputs[layer_n], np.int32(max_size))
            image_d = outputs[layer_n]

        result = outputs[layers - 1]
        result = result.reshape(result.shape[0], result.shape[1] * result.shape[2] * result.shape[3])

        cu.memcpy_dtod(soft_bias_scratch.ptr, soft_bias_d.ptr, soft_bias_d.nbytes)
        np_soft_weights = soft_weights_d.get()
        np_result = result.get()
        compute_sgemm(soft_weights_d, result, soft_bias_scratch, handle)

        offset = batchn * batchsize
        soft_max_in = soft_bias_scratch
        soft_max.compute_soft_max(soft_max_in, classes, offset)
        result_ps.append(result)

    cublas.cublasDestroy(handle)
    return classes
def destroy_cublas():
    cublas.cublasDestroy(handle)
def destroy(self):
    if self.handle is not None:
        cublas.cublasDestroy(self.handle)
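# For context, a minimal sketch of the handle lifecycle a destroy() method
# like the one above implies; the owning class here is hypothetical and only
# the create/destroy pairing is taken from the snippet:
import skcuda.cublas as cublas


class CublasContext(object):
    def __init__(self):
        self.handle = cublas.cublasCreate()

    def destroy(self):
        if self.handle is not None:
            cublas.cublasDestroy(self.handle)
            self.handle = None  # guard against destroying the same handle twice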
def tearDown(self):
    cublas.cublasDestroy(self.cublas_handle)
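# The matching setUp() is not shown. A minimal sketch of the counterpart this
# tearDown() assumes, using a hypothetical test-case class name:
import unittest
import skcuda.cublas as cublas


class CublasTestCase(unittest.TestCase):
    def setUp(self):
        # Create a fresh handle for each test; tearDown() above destroys it.
        self.cublas_handle = cublas.cublasCreate()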