def transpose(a):
    '''
    Transpose a matrix on the GPU using cuBLAS geam.

    https://github.com/lebedov/scikit-cuda/issues/33
    pip install --upgrade --no-deps git+https://github.com/lebedov/scikits.cuda.git

    :return: the transposed matrix as a pycuda GPUArray
    '''
    import time

    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import scikits.cuda.cublas as cublas

    handle = cublas.cublasCreate()

    # N = 1000
    # a = np.random.rand(N, N)
    R = a.shape[0]
    C = a.shape[1]

    a_gpu = gpuarray.to_gpu(a)
    a_trans_gpu = gpuarray.zeros((C, R), dtype=np.double)

    alpha = 1.0
    beta = 0.0

    start = time.time()
    # Note: these dimension arguments assume a square input (R == C),
    # as in the commented-out test above.
    cublas.cublasDgeam(handle, 't', 'n', R, R, alpha, a_gpu.gpudata, R,
                       beta, a_gpu.gpudata, R, a_trans_gpu.gpudata, R)
    print time.time() - start

    # assert np.allclose(a_trans_gpu.get(), a.T)

    cublas.cublasDestroy(handle)
    return a_trans_gpu

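A minimal usage sketch for the transpose helper above, assuming a square double-precision input as in its commented-out test:

import numpy as np

N = 1000
a = np.random.rand(N, N)            # square, float64, matching the snippet's assumptions
a_t_gpu = transpose(a)              # returns a pycuda GPUArray of shape (N, N)
assert np.allclose(a_t_gpu.get(), a.T)
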
def calc_x(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh):
    handle = cb.cublasCreate()

    if not rp1 is None:
        rp1 = garr.to_gpu(sp.asarray(rp1))
    if not lm2 is None:
        lm2 = garr.to_gpu(sp.asarray(lm2))

    lm1_s = garr.to_gpu(sp.asarray(lm1_s))
    lm1_si = garr.to_gpu(sp.asarray(lm1_si))

    r_s = garr.to_gpu(sp.asarray(r_s))
    r_si = garr.to_gpu(sp.asarray(r_si))

    A = list(map(garr.to_gpu, A))
    if not Am1 is None:
        Am1 = list(map(garr.to_gpu, Am1))
    if not Ap1 is None:
        Ap1 = list(map(garr.to_gpu, Ap1))

    Vsh = list(map(garr.to_gpu, Vsh))

    if not Cm1 is None:
        Cm1 = [[garr.to_gpu(Cm1[t, s]) for t in range(Cm1.shape[1])]
               for s in range(Cm1.shape[0])]

    if not (C is None and Kp1 is None):
        C = [[garr.to_gpu(C[s, t]) for t in range(C.shape[1])]
             for s in range(C.shape[0])]
        Kp1 = garr.to_gpu(Kp1)

    x = calc_x_G(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si,
                 Vsh, handle=handle)

    cb.cublasDestroy(handle)

    return x.get()

def classify(image_names, model_file_name, output_names):
    """
    Classify a set of images using the given model.

    Parameters
    ----------
    image_names : iterable of strings
        names of the input images
    model_file_name : string
        name of the file containing the model
    output_names : iterable of strings
        names of the output images

    Notes
    -----
    image_names and output_names should have the same length and indices
    should match, i.e. image_names[idx] -> output_names[idx]
    """
    handle = cublas.cublasCreate()
    model = serial.load(model_file_name)

    outputs = []
    for image_name, output_name in zip(image_names, output_names):
        image = load_image(image_name)
        output = classify_image(image, model, handle)
        save_image(np.int32(np.round(output*255)), output_name)

    cublas.cublasDestroy(handle)

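A hypothetical call to the classify function above; the file names are placeholders, and serial.load, load_image, classify_image, and save_image are assumed to be available in the same module:

image_names = ['inputs/slice_%02d.png' % i for i in range(4)]    # hypothetical input paths
output_names = ['outputs/slice_%02d.png' % i for i in range(4)]  # hypothetical output paths
classify(image_names, 'trained_model.pkl', output_names)         # hypothetical model file
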
def __init__(self, A1, A2, left, use_batch=False):
    """Creates a new LinearOperator interface to the superoperator E.

    This is a wrapper to be used with SciPy's sparse linear algebra routines.

    Parameters
    ----------
    A1 : ndarray
        Ket parameter tensor.
    A2 : ndarray
        Bra parameter tensor.
    left : bool
        Whether to multiply with a vector to the left (or to the right).
    """
    self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
    self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
    self.tmp = list(map(garr.empty_like, self.A1G[0]))
    self.tmp2 = list(map(garr.empty_like, self.A1G[0]))

    self.use_batch = use_batch
    self.left = left
    self.D = A1[0].shape[1]
    self.shape = (self.D**2, self.D**2)
    self.dtype = sp.dtype(A1[0][0].dtype)
    self.calls = 0

    self.out = garr.empty((self.D, self.D), dtype=self.dtype)
    self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

    if use_batch:
        self.A1G_p = list(map(get_batch_ptrs, self.A1G))
        self.A2G_p = list(map(get_batch_ptrs, self.A2G))
        self.tmp_p = get_batch_ptrs(self.tmp)
        self.tmp2_p = get_batch_ptrs(self.tmp2)
        self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
        self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
    else:
        self.A1G_p = None
        self.A2G_p = None
        self.tmp_p = None
        self.tmp2_p = None
        self.xG_p = None
        self.out_p = None

    self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
    self.ones = [one.fill(1) for one in self.ones]
    self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]

    self.streams = []
    for s in range(A1[0].shape[0]):
        self.streams.append(cd.Stream())

    self.hdl = cb.cublasCreate()

def _initialize_cublas():
    global sgemm

    try:
        cublas.cublasInit()
        sgemm = cublas.cublasSgemm
    except AttributeError:
        handle = cublas.cublasCreate()

        def sgemm(*args):
            cublas.cublasSgemm(handle, *args)

def _initialize_cublas():
    global sgemm

    util.log_info('Initializing cublas.')

    try:
        cublas.cublasInit()
        sgemm = cublas.cublasSgemm
    except AttributeError:
        handle = cublas.cublasCreate()

        def sgemm(*args):
            cublas.cublasSgemm(handle, *args)

def __init__(self, name, gpu_id):
    """
    name: name of the node, can be any arbitrary string
    gpu_id: the integer id of the GPU that this Node should be running on
    """
    Node.__init__(self, name)
    self.ctx = driver.Device(gpu_id).make_context()
    self.device = self.ctx.get_device()
    print 'Executing on device at PCI ID:', self.device.pci_bus_id()
    self.handle = cublas.cublasCreate()
    self.gpu_id = gpu_id
    self.is_cpu = False
    self.is_gpu = True

def classify(image_names, model_file_name, output_names):
    """
    Classify a set of images using the given model.

    Parameters
    ----------
    image_names : iterable of strings
        names of the input images
    model_file_name : string
        name of the file containing the model
    output_names : iterable of strings
        names of the output images

    Notes
    -----
    image_names and output_names should have the same length and indices
    should match, i.e. image_names[idx] -> output_names[idx]

    This network copies the weights to the gpu once to classify all the
    images, as it should. This can be used as a model to make the same
    change to the fully connected network.
    """
    handle = cublas.cublasCreate()
    model = serial.load(model_file_name)

    layers = model.layers
    convs = layers[:-1]
    softmax = layers[-1]

    convs = map(lambda layer: layer.get_params(), convs)

    kernels = map(lambda layer: np.array(layer[0].eval()), convs)  #This can be simplified
    kernels = map(lambda kernel: np.ascontiguousarray(np.rollaxis(kernel, 0, 3)), kernels)
    kdims = map(lambda kernel: kernel.shape, kernels)
    kernels = map(lambda layer: layer[0].dimshuffle(3, 0, 1, 2).eval(), convs)
    kernels = map(lambda kernel, kdim: kernel.reshape(kdim), kernels, kdims)

    biases = map(lambda layer: np.array(layer[1].eval()), convs)
    bias_dims = map(lambda bias: bias.shape, biases)

    max_sizes = map(lambda layer: layer.pool_shape + [layer.num_pieces], layers[:-1])

    weights = softmax.get_params()[1]
    bias = softmax.get_params()[0]
    soft_weights = softmax.get_params()[1].reshape((3, 3, 32, 2)).dimshuffle(3, 2, 0, 1).eval()
    soft_weights = np.ascontiguousarray(np.reshape(soft_weights, (2, 288)).transpose())
    soft_bias = softmax.get_params()[0].get_value()[::1]

    window = layers[0].input_space.shape

    outputs = []
    for image_name, output_name in zip(image_names, output_names):
        image = load_image(image_name)
        output = classify_image(image, model, kernels, biases, max_sizes,
                                soft_weights, soft_bias, window, handle)
        save_image(np.int8(np.round(output*255)), output_name)

    cublas.cublasDestroy(handle)

def __init__(self, p, A1, A2, l=None, r=None, left=False, pseudo=True, use_batch=False):
    assert not (pseudo and (l is None or r is None)), 'For pseudo-inverse l and r must be set!'

    self.use_batch = use_batch
    self.p = p
    self.left = left
    self.pseudo = pseudo

    self.D = A1[0].shape[1]
    self.shape = (self.D**2, self.D**2)
    self.dtype = A1[0].dtype

    self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
    self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
    self.tmp = list(map(garr.empty_like, self.A1G[0]))
    self.tmp2 = list(map(garr.empty_like, self.A1G[0]))

    self.l = l
    self.r = r
    self.lG = garr.to_gpu(sp.asarray(l))
    self.rG = garr.to_gpu(sp.asarray(r))

    self.out = garr.empty((self.D, self.D), dtype=self.dtype)
    self.out2 = garr.empty((self.D, self.D), dtype=self.dtype)
    self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

    if use_batch:
        self.A1G_p = list(map(get_batch_ptrs, self.A1G))
        self.A2G_p = list(map(get_batch_ptrs, self.A2G))
        self.tmp_p = get_batch_ptrs(self.tmp)
        self.tmp2_p = get_batch_ptrs(self.tmp2)
        self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
        self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
        self.out2_p = get_batch_ptrs([self.out2] * len(A1[0]))
    else:
        self.A1G_p = None
        self.A2G_p = None
        self.tmp_p = None
        self.tmp2_p = None
        self.xG_p = None
        self.out_p = None
        self.out2_p = None

    self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
    self.ones = [one.fill(1) for one in self.ones]
    self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]

    self.streams = []
    for s in range(A1[0].shape[0]):
        self.streams.append(cd.Stream())

    self.hdl = cb.cublasCreate()

def make_thunk(self, node, storage_map, _, _2):
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    num_streams = 32  # 32
    handle = [cublas.cublasCreate()]
    stream_pool = [pycuda.driver.Stream() for _ in xrange(num_streams)]
    current_stream = [0]

    def thunk():
        x = inputs[0]
        y = inputs[1]

        # chop off the real/imag dimension
        input_shape_x = x[0].shape  # (a, b, 2)
        input_shape_y = y[0].shape  # (b, c, 2)

        output_shape = (input_shape_x[0], input_shape_y[1], 2)  # (a, c, 2)

        input_x_pycuda = to_complex_gpuarray(x[0])
        input_y_pycuda = to_complex_gpuarray(y[0])

        # multistream experiment
        # print "DEBUG: Setting stream to %d" % current_stream[0]

        # prev_stream_obj = stream_pool[(current_stream[0] - 1) % num_streams]
        # print "PREV STREAM IS DONE?"
        # print prev_stream_obj.is_done()
        # print

        stream_obj = stream_pool[current_stream[0]]
        cublas.cublasSetStream(handle[0], stream_obj.handle)
        current_stream[0] += 1
        current_stream[0] %= num_streams
        # print "DEBUG: set next stream id to %d" % current_stream[0]

        output_pycuda = linalg.dot(input_x_pycuda, input_y_pycuda, handle=handle[0])

        outputs[0][0] = to_complex_cudandarray(output_pycuda)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk

def get_cublas_handle():
    """Gets CUBLAS handle for the current device.

    Returns:
        CUBLAS handle.

    """
    global _cublas_handles

    device = Context.get_device()
    if device in _cublas_handles:
        return _cublas_handles[device]
    handle = cublas.cublasCreate()
    _cublas_handles[device] = handle
    return handle

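A minimal standalone sketch of the per-device caching pattern above. The dict name mirrors the snippet; keying on a plain device id and the helper name get_cublas_handle_for are illustrative assumptions (the original keys on its own Context.get_device()):

import pycuda.autoinit  # ensures a CUDA context exists before creating handles
from scikits.cuda import cublas

_cublas_handles = {}  # one handle per device id, created lazily


def get_cublas_handle_for(device_id=0):
    """Return a cached CUBLAS handle for the given device id (illustrative helper)."""
    if device_id in _cublas_handles:
        return _cublas_handles[device_id]
    handle = cublas.cublasCreate()
    _cublas_handles[device_id] = handle
    return handle


handle = get_cublas_handle_for(0)
assert handle == get_cublas_handle_for(0)  # repeated calls reuse the cached handle
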
def test_with_test_data():
    handle = cublas.cublasCreate()

    # 3 vars model + constant
    XY = np.loadtxt(open("TestData/Y=2X1+3X2+4X3+5_valid_predictors.csv", "rb"),
                    delimiter=",", skiprows=1, dtype=np.float32)
    print "3 vars model, B coefficients"
    print calculate_single_regression(handle, XY)

    # 2 vars model
    XY = np.loadtxt(open("TestData/Y=2X1+3X2+5.csv", "rb"),
                    delimiter=",", skiprows=1, dtype=np.float32)
    print "2 vars model, B coefficients"
    print calculate_single_regression(handle, XY)

def main():
    """
    For testing and timing.
    """
    handle = cublas.cublasCreate()

    image = np.float32((np.random.rand(1024, 1024) - .5) * 2)
    model = serial.load(model_file_name)
    layers = model.layers
    patch_dims = (39, 39)

    # There is a bug that occurs if running with too long a batch_rows_l
    # Most likely a memory allocation issue that is not being reported correctly
    batch_rows_l = [8]
    batchsizes = map(lambda x: x*(1024-39+1), batch_rows_l)

    pixels = [(x, y) for x in range(1024-39+1) for y in range(1024-39+1)]

    # Uncomment to use pylearn2 to classify to check result
    p_output = pylearn2_computation(model, image, patch_dims, batchsizes[0], pixels)
    p_output = np.transpose(p_output)

    num_trials = 1
    for batchsize, batch_rows in zip(batchsizes, batch_rows_l):
        st = time.time()
        for trial in range(num_trials):
            output = gpu_computation(image, patch_dims, batchsize, batch_rows,
                                     layers, pixels, handle)
            output = output.get()
        tot = time.time()-st

        print "Batchsize {0}".format(batchsize)
        print "Total time: {0:.4e} seconds".format(tot)
        print "Time per pixel: {0:.4e} seconds".format(tot/len(pixels*num_trials))
        print "Pixels per second: {0:.4e}".format(len(pixels*num_trials)/tot)

        for end in time_ends:
            end.synchronize()
        sgemm_times = map(lambda start, end: end.time_since(start)/1000, time_starts, time_ends)
        tot_sgemm_time = sum(sgemm_times)
        print "Total sgemm time: {0:.4e} seconds\nTotal gflop: {1:.4e}\nGflops: {2:.4e}".format(
            tot_sgemm_time, sgemm_gflop, sgemm_gflop/tot_sgemm_time)

    # Uncomment to compare results of gpu and pylearn2 classifications
    # output = output.reshape(1024-39, 1024-39)
    print output, p_output
    print np.allclose(p_output[0], output, rtol=1e-04, atol=1e-07)

    cublas.cublasDestroy(handle)
    return

def __init__(self):
    culinalg.init()
    self.handle = cublas.cublasCreate()

    self._elem_kernel = culinalg_kernel.get_function('_elem')
    self._sigmoid_kernel = culinalg_kernel.get_function('_sigmoid')
    self._log_anti_sigmoid_kernel = culinalg_kernel.get_function('_log_anti_sigmoid')
    self._tanh_kernel = culinalg_kernel.get_function('_tanh')
    self._pow_kernel = culinalg_kernel.get_function('_pow')
    self._sqrt_kernel = culinalg_kernel.get_function('_sqrt')
    self._square_kernel = culinalg_kernel.get_function('_square')
    self._exp_kernel = culinalg_kernel.get_function('_exp')
    self._log_kernel = culinalg_kernel.get_function('_log')
    self._sum_kernel = culinalg_kernel.get_function('_sum')
    self._compare_kernel = culinalg_kernel.get_function('_compare')
    self._reverse_kernel = culinalg_kernel.get_function('_reverse')

    self.X_max_kernel = culinalg_kernel.get_function('X_max')
    self.X_min_kernel = culinalg_kernel.get_function('X_min')
    self.X_sum_kernel = culinalg_kernel.get_function('X_sum')
    self.X_norm_kernel = culinalg_kernel.get_function('X_norm')
    self.s_mul_x_kernel = culinalg_kernel.get_function('s_mul_x')
    self.s_add_x_kernel = culinalg_kernel.get_function('s_add_x')
    self.x_add_y_kernel = culinalg_kernel.get_function('x_add_y')
    self.X_add_Y_kernel = culinalg_kernel.get_function('X_add_Y')
    self.x_mul_y_kernel = culinalg_kernel.get_function('x_mul_y')
    self.X_mul_Y_kernel = culinalg_kernel.get_function('X_mul_Y')
    self.x_div_y_kernel = culinalg_kernel.get_function('x_div_y')
    self.X_div_Y_kernel = culinalg_kernel.get_function('X_div_Y')
    self.x_radd_Y_as_Y_kernel = culinalg_kernel.get_function('x_radd_Y_as_Y')
    self.x_cadd_Y_as_Y_kernel = culinalg_kernel.get_function('x_cadd_Y_as_Y')
    self.x_rmul_Y_as_Y_kernel = culinalg_kernel.get_function('x_rmul_Y_as_Y')
    self.x_cmul_Y_as_Y_kernel = culinalg_kernel.get_function('x_cmul_Y_as_Y')
    self.x_radd_Y_as_x_kernel = culinalg_kernel.get_function('x_radd_Y_as_x')
    self.x_cadd_Y_as_x_kernel = culinalg_kernel.get_function('x_cadd_Y_as_x')
    self.x_outer_y_add_O_kernel = culinalg_kernel.get_function('x_outer_y_add_O')
    self.X_router_Y_add_O_kernel = culinalg_kernel.get_function('X_router_Y_add_O')
    self.X_rdot_Y_kernel = culinalg_kernel.get_function('X_rdot_Y')
    self.index_to_array_kernel = culinalg_kernel.get_function('index_to_array')

    self._2d_block = (32, 32, 1)
    self._1d_block = (1024, 1, 1)
    self._3d_block = (16, 16, 4)

def main():
    m = 64; k = 512; n = 400
    #m = 2; k = 3; n = 4

    handle = cublas.cublasCreate()

    _, narrays, batchsize = sys.argv
    narrays = int(narrays)
    batchsize = int(batchsize)

    cols = []; kernels = []; biases = []
    # lists to store pointers to gpu arrays
    pcols = []; pkernels = []; pbiases = []

    kernel = np.float32((np.random.rand(m, k) - .5) * 2)
    kernel = np.float32(np.reshape(np.arange(0, m*k, 1), [m, k]))

    for i in range(narrays):
        col = np.float32((np.random.rand(k, n) - .5) * 2)
        #col = np.float32(np.reshape(np.arange(0, k*n, 1), [k, n]))
        bias = np.float32(np.zeros((m, n)))

        col_d = gpu.to_gpu(col)
        kernel_d = gpu.to_gpu(kernel)
        bias_d = gpu.to_gpu(bias)

        cols.append(col_d); kernels.append(kernel_d); biases.append(bias_d)
        pcols.append(col_d.ptr); pkernels.append(kernel_d.ptr); pbiases.append(bias_d.ptr)

    pcols = np.array(pcols); pkernels = np.array(pkernels); pbiases = np.array(pbiases)
    pcols_d = gpu.to_gpu(pcols); pkernels_d = gpu.to_gpu(pkernels); pbiases_d = gpu.to_gpu(pbiases)

    for i in range(narrays):
        compute_sgemm(cols[i], kernels[i], biases[i], 0, handle)

    # zero out arrays for checking results
    #for i in range(narrays):
    #    print biases[i]
    #    biases[i] -= biases[i]

    print "\n\n"

    for i in range((narrays+batchsize-1)/batchsize):
        start = i*batchsize
        compute_sgemm_batched(pcols_d[start:start+batchsize],
                              pkernels_d[start:start+batchsize],
                              pbiases_d[start:start+batchsize],
                              m, k, n, 0, handle)

    #for i in range(narrays):
    #    print biases[i]

    cublas.cublasDestroy(handle)

def setUp(self):
    self.cublas_handle = cublas.cublasCreate()

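The setUp above has no visible teardown in this excerpt; a minimal sketch of the matching cleanup one would usually pair with it (the test class name and test body are hypothetical):

import unittest
from scikits.cuda import cublas


class CublasHandleTest(unittest.TestCase):  # hypothetical test case
    def setUp(self):
        self.cublas_handle = cublas.cublasCreate()

    def tearDown(self):
        # Release the handle created in setUp so each test gets a fresh one.
        cublas.cublasDestroy(self.cublas_handle)

    def test_handle_is_usable(self):
        # cublasGetVersion is a cheap call that exercises the handle.
        self.assertTrue(cublas.cublasGetVersion(self.cublas_handle) > 0)
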
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from scikits.cuda import cublas
from ipdb import set_trace

print context.get_device().name()

handle = cublas.cublasCreate()

start, end = (drv.Event(), drv.Event())


def cublas_dot(A, B, C, repeat=1):
    lda = max(A.strides) // 4
    ldb = max(B.strides) // 4
    ldc = max(C.strides) // 4

    opA = 't' if A.is_trans else 'n'
    opB = 't' if B.is_trans else 'n'
    op = opB + opA

    m = A.shape[0]
    n = B.shape[1]

def gpu_computation(image, kernels, biases, max_sizes, soft_weights, soft_bias, batches, window_sizes):
    nbatches = len(batches)
    batchsize = len(batches[0])
    npixels = nbatches*batchsize
    layers = len(kernels)

    handle = cublas.cublasCreate()

    results = []
    result_ps = []

    pad = 0
    stride = 1

    full_image_d = gpu.to_gpu(image)

    image_dims, col_dims, kernel_dims, bias_dims, sgemm_dims, out_dims, ksizes, kchannels_s = \
        compute_dims(image, kernels, biases, max_sizes, batchsize, window_sizes, pad, stride)

    b_result = []; b_offsets_d = []
    kernels_d = []; cols = []; col_ps = []
    biases_d = []; sgemm_biases = []; sgemm_biases_ps = []
    outputs = []

    for layer_n, (bias, kernel, sgemm_dim, im_dim, out_dim, max_ksize, ksize, kchannels) in \
            enumerate(zip(biases, kernels, sgemm_dims, image_dims, out_dims, max_sizes, ksizes, kchannels_s)):
        col = gpu.empty((batchsize, sgemm_dim[1], sgemm_dim[2]), np.float32)
        cols.append(col)
        col_ps.append([col[idx, :, :].ptr for idx in range(batchsize)])

        # reuse the same kernels for every pixel
        kernel_d = gpu.to_gpu(kernel)
        kernel_d = kernel_d.reshape(kchannels, ksize*ksize*im_dim[2])
        kernels_d.append(kernel_d)

        # contain the actual data of the biases
        bias = bias.reshape(1, bias.shape[2], bias.shape[0]*bias.shape[1])
        batch_bias = np.tile(bias, (batchsize, 1, 1))
        batch_bias_d = gpu.to_gpu(batch_bias)
        biases_d.append(batch_bias_d)

        # scratch space to copy biases to and then write output of sgemm to
        sgemm_bias = gpu.empty(batch_bias.shape, np.float32)
        sgemm_biases.append(sgemm_bias)
        sgemm_biases_ps.append([sgemm_bias[idx, :, :].ptr for idx in range(batchsize)])

        # space for output of maxpool
        output = gpu.empty((batchsize, out_dim[2], out_dim[0], out_dim[1]), np.float32)
        outputs.append(output)

    # space for final output
    classes = gpu.empty(npixels, np.float32)

    soft_weights_d = gpu.to_gpu(soft_weights)
    soft_bias = soft_bias.reshape(1, soft_bias.shape[0])
    soft_bias_d = gpu.to_gpu(np.ascontiguousarray(np.reshape(np.tile(soft_bias, (batchsize, 1)), (2, batchsize))))
    soft_bias_scratch = gpu.empty((soft_bias_d.shape[0], soft_bias_d.shape[1]), np.float32)

    col_ps_d = gpu.to_gpu(np.array(col_ps))
    kernel_ps = map(lambda x: [x.ptr]*batchsize, kernels_d)
    kernel_ps_d = gpu.to_gpu(np.array(kernel_ps))
    sgemm_biases_ps_d = gpu.to_gpu(np.array(sgemm_biases_ps))

    for batch in batches:
        offsets = comp_offsets(batch, full_image_d)
        offsets_d = gpu.to_gpu(np.int32(np.array(offsets)))
        b_offsets_d.append(offsets_d)

        # space to hold final result of each layer
        result = gpu.empty((out_dims[layers-1][2], out_dims[layers-1][0], out_dims[layers-1][1]), np.float32)
        b_result.append(result)

    for batchn, (batch, offsets_d, result) in enumerate(zip(batches, b_offsets_d, b_result)):
        image_d = full_image_d
        for layer_n, (im_dim, col_dim, kdim, bias_dim, sgemm_dim, out_dim, ksize, kchannels, max_size) in \
                enumerate(zip(image_dims, col_dims, kernel_dims, bias_dims, sgemm_dims, out_dims, ksizes, kchannels_s, max_sizes)):
            sgemm_bias = sgemm_biases[layer_n]
            cu.memcpy_dtod(sgemm_bias.ptr, biases_d[layer_n].ptr, sgemm_bias.nbytes)

            im2col_gpu.compute_im2col_batched(image_d, im_dim[0], im_dim[1], im_dim[2],
                                              np.int32(ksize), np.int32(pad), np.int32(stride),
                                              offsets_d, layer_n, batchsize, cols[layer_n])
            compute_sgemm_batched(col_ps_d[layer_n], kernel_ps_d[layer_n], sgemm_biases_ps_d[layer_n],
                                  handle, sgemm_dim[0], sgemm_dim[1], sgemm_dim[2])
            sgemm_bias = sgemm_bias.reshape(np.int32(batchsize), np.int32(kchannels), col_dim[0], col_dim[1])
            maxpool_gpu.compute_max_batched(sgemm_bias, outputs[layer_n], np.int32(max_size))
            image_d = outputs[layer_n]

        result = outputs[layers-1]
        result = result.reshape(result.shape[0], result.shape[1]*result.shape[2]*result.shape[3])

        cu.memcpy_dtod(soft_bias_scratch.ptr, soft_bias_d.ptr, soft_bias_d.nbytes)
        np_soft_weights = soft_weights_d.get()
        np_result = result.get()

        compute_sgemm(soft_weights_d, result, soft_bias_scratch, handle)

        offset = batchn*batchsize
        soft_max_in = soft_bias_scratch
        soft_max.compute_soft_max(soft_max_in, classes, offset)
        result_ps.append(result)

    cublas.cublasDestroy(handle)
    return classes

def create(self):
    if self.handle is None:
        self.handle = cublas.cublasCreate()

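A small sketch of the lazy-creation pattern above wrapped in a full class with a matching release; the class name and the destroy method are illustrative additions, not part of the original snippet:

from scikits.cuda import cublas


class CublasContext(object):  # hypothetical wrapper class
    def __init__(self):
        self.handle = None

    def create(self):
        # Create the handle only on first use, as in the snippet above.
        if self.handle is None:
            self.handle = cublas.cublasCreate()
        return self.handle

    def destroy(self):
        # Matching cleanup so the handle is not leaked.
        if self.handle is not None:
            cublas.cublasDestroy(self.handle)
            self.handle = None
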
def setUp(self):
    np.random.seed(23)  # For reproducible tests.
    self.cublas_handle = cublas.cublasCreate()

def calc_Bs(N, A, l, l_s, l_si, r, r_s, r_si, C, K, Vsh):
    GA = []
    for An in A:
        if An is None:
            GA.append(None)
        else:
            GAn = []
            for Ans in An:
                GAn.append(garr.to_gpu(Ans))
            GA.append(GAn)
    GA.append(None)

    Gl = []
    Gl_s = []
    Gl_si = []
    for n in range(len(l)):
        if l[n] is None:
            Gl.append(None)
            Gl_s.append(None)
            Gl_si.append(None)
        else:
            Gl.append(garr.to_gpu(sp.asarray(l[n])))  #TODO: Support special types...
            Gl_s.append(garr.to_gpu(sp.asarray(l_s[n])))
            Gl_si.append(garr.to_gpu(sp.asarray(l_si[n])))
    Gl.append(None)
    Gl_s.append(None)
    Gl_si.append(None)

    Gr = []
    Gr_s = []
    Gr_si = []
    for n in range(len(r)):
        if r[n] is None:
            Gr.append(None)
            Gr_s.append(None)
            Gr_si.append(None)
        else:
            Gr.append(garr.to_gpu(sp.asarray(r[n])))  #TODO: Support special types...
            Gr_s.append(garr.to_gpu(sp.asarray(r_s[n])))
            Gr_si.append(garr.to_gpu(sp.asarray(r_si[n])))
    Gr.append(None)
    Gr_s.append(None)
    Gr_si.append(None)

    GK = []
    for n in range(len(K)):
        if K[n] is None:
            GK.append(None)
        else:
            GK.append(garr.to_gpu(sp.asarray(K[n])))
    GK.append(None)

    GVsh = []
    for n in range(len(Vsh)):
        if Vsh[n] is None:
            GVsh.append(None)
        else:
            GVshn = []
            for s in range(Vsh[n].shape[0]):
                GVshn.append(garr.to_gpu(Vsh[n][s]))
            GVsh.append(GVshn)

    GC = []
    for n in range(len(C)):
        if C[n] is None:
            GC.append(None)
        else:
            GCn = []
            for s in range(C[n].shape[0]):
                GCns = []
                for t in range(C[n].shape[1]):
                    GCns.append(garr.to_gpu(C[n][s, t]))
                GCn.append(GCns)
            GC.append(GCn)
    GC.append(None)

    GCts = []
    for n in range(len(GC)):
        if GC[n] is None:
            GCts.append(None)
        else:
            GCtsn = []
            for t in range(len(GC[n])):
                GCtsns = []
                for s in range(len(GC[n][0])):
                    GCtsns.append(GC[n][s][t])
                GCtsn.append(GCtsns)
            GCts.append(GCtsn)

    hdl = cb.cublasCreate()

    num_strms = 10

    curr_stream = cb.cublasGetStream(hdl)

    sites_per_strm = max((N) // num_strms, 1)
    #print "sites_per_stream = ", sites_per_strm

    strms = []
    for i in range(N // sites_per_strm):
        strms.append(cd.Stream())

    GB = [None]
    for n in range(1, N + 1):
        if (n - 1) % sites_per_strm == 0:
            #print n
            #print "strm = ", (n - 1) // sites_per_strm
            cb.cublasSetStream(hdl, strms[(n - 1) // sites_per_strm].handle)
        if not Vsh[n] is None:
            if n > 1:
                Glm2 = Gl[n - 2]
            else:
                Glm2 = None

            Gx = calc_x_G(GK[n + 1], GC[n], GCts[n - 1], Gr[n + 1], Glm2,
                          GA[n - 1], GA[n], GA[n + 1], Gl_s[n - 1], Gl_si[n - 1],
                          Gr_s[n], Gr_si[n], GVsh[n], handle=hdl)

            GBn = []
            for s in range(A[n].shape[0]):
                GBns = cla.dot(Gl_si[n - 1], Gx, handle=hdl)
                GBns = cla.dot(GBns, GVsh[n][s], transb='C', handle=hdl)
                GBns = cla.dot(GBns, Gr_si[n], handle=hdl)
                GBn.append(GBns)
            GB.append(GBn)
        else:
            GB.append(None)

    cb.cublasSetStream(hdl, curr_stream)
    cb.cublasDestroy(hdl)

    B = [None]
    for n in range(1, N + 1):
        if GB[n] is None:
            B.append(None)
        else:
            Bn = sp.empty_like(A[n])
            for s in range(A[n].shape[0]):
                Bn[s] = GB[n][s].get()
            B.append(Bn)

    return B

def _get_cublas():
    return cublas.cublasCreate()

# Module-level GPU setup. This excerpt begins inside an enclosing try/if block
# (selecting a GPU per MPI rank), which is why the else/except below have no
# visible counterparts here.
        from .parallel import get_id_within_node
        gpuid = get_id_within_node()
        import pycuda.driver
        pycuda.driver.init()
        if gpuid >= pycuda.driver.Device.count():
            print '[' + MPI.Get_processor_name() + '] more processes than the GPU numbers!'
            #MPI.COMM_WORLD.Abort()
            raise
        gpu_device = pycuda.driver.Device(gpuid)
        gpu_context = gpu_device.make_context()
        gpu_initialized = True
    else:
        import pycuda.autoinit
        gpu_initialized = True
except:
    pass

try:
    from scikits.cuda import cublas
    import scikits.cuda.linalg as culinalg
    culinalg.init()
    cublas_handle = cublas.cublasCreate()
except:
    pass


def closeGPU():
    if gpu_context is not None:
        gpu_context.detach()

from scikits.cuda import cublas as cubla
import libcudnn

cublas = cubla.cublasCreate()
cudnn = libcudnn.cudnnCreate()

print("CUDNN Version: %d" % libcudnn.cudnnGetVersion())
print("CUBLAS Version:", cubla.cublasGetVersion(cublas))

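The snippet above creates both handles but never releases them. A hedged cleanup sketch, assuming the installed libcudnn wrapper exposes cudnnDestroy alongside cudnnCreate (scikits.cuda's cublasDestroy is used elsewhere in these examples):

import atexit


def _release_handles():
    cubla.cublasDestroy(cublas)   # mirrors the cublasCreate above
    libcudnn.cudnnDestroy(cudnn)  # assumes the libcudnn wrapper provides cudnnDestroy


atexit.register(_release_handles)
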