def __init__(self, data, alpha=10, norm_type=1, verbose=False, step=5,
             thr=None, max_iter=5000, affine=False, normalize=True,
             PCA=False, npc=10, GPU=False, device=0):
    """Store the solver configuration on the instance.

    Every argument is saved unchanged as the attribute of the same name.
    ``data`` must expose a two-element ``shape``; its sizes are cached as
    ``num_rows`` and ``num_columns``.

    ``thr`` defaults to ``[10**-8, -1]``.  A fresh list is built for each
    instance so that mutating one instance's thresholds can never leak
    into another (the previous list default was shared by all callers).

    When ``GPU`` is truthy, scikit-cuda's linear algebra layer is
    initialised via ``linalg.init()``.
    """
    self.data = data
    self.alpha = alpha
    self.norm_type = norm_type
    self.verbose = verbose
    self.step = step
    self.thr = [10**-8, -1] if thr is None else thr
    self.max_iter = max_iter
    self.affine = affine
    self.normalize = normalize
    self.device = device
    self.PCA = PCA
    self.npc = npc
    self.GPU = GPU
    self.num_rows, self.num_columns = data.shape[0], data.shape[1]
    if self.GPU:
        linalg.init()
def initParallelAlgorithms():
    """Compile the CUDA kernels and warm up the GPU primitives.

    Loads ``bitonicSort`` from ``ParallelAlgorithms/bitonicSort.cu`` and
    ``finishCSM`` / ``getSumSquares`` from
    ``ParallelAlgorithms/CSMHelper.cu`` and publishes them as the module
    globals ``bitonicSort_``, ``finishCSM_`` and ``getSumSquares_``.
    Each primitive is then run once on a dummy 16x16 matrix so that it is
    pre-compiled before its first real use.
    """
    global bitonicSort_
    global finishCSM_
    global getSumSquares_

    # ``with`` guarantees the source files are closed even if compilation fails.
    with open("ParallelAlgorithms/bitonicSort.cu") as fin:
        mod = SourceModule(fin.read())
    bitonicSort_ = mod.get_function("bitonicSort")

    with open("ParallelAlgorithms/CSMHelper.cu") as fin:
        mod = SourceModule(fin.read())
    finishCSM_ = mod.get_function("finishCSM")
    getSumSquares_ = mod.get_function("getSumSquares")

    # Run each of the algorithms on dummy data so that they're pre-compiled
    # 1) Bitonic sort
    X = np.random.randn(16, 16)
    N = np.int32(16)
    NPow2 = N
    # Block dimensions must be integers; plain "/" yields a float on Python 3.
    NThreads = int(N) // 2
    XG = gpuarray.to_gpu(X)
    bitonicSort_(XG, N, NPow2, block=(NThreads, 1, 1),
                 grid=(X.shape[0], 1), shared=4 * NPow2)

    linalg.init()
    # 2) Other primitive operations (results are discarded; warm-up only)
    linalg.dot(XG, XG)
    skcuda.misc.add(XG, XG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    skcuda.misc.add_matvec(XG, XSqr, 0)
def forward(self, bottom, top):
    """Compute the residual, the scalar loss and the transposed residual.

    For every batch item ``i`` the residual stored in ``self.diff`` is
    ``dot(bottom[1][i], bottom[0][i]) + bottom[2][i] - bottom[3][i]``.
    The loss written to ``top[0]`` is ``sum(diff**2) / bottom[3].num / 2``.
    ``self.transDiff`` receives the GPU transpose of
    ``self.diff / bottom[3].num``.

    Shapes noted by the original author (not verified here):
    ``self.diff`` (batch, 65536), ``bottom[0]`` (batch, 11),
    ``bottom[1]`` (batch, 65536, 11), ``bottom[2]`` (batch, 65536).

    The previous version also uploaded both operands and ran a GPU dot
    product for each batch item, but its result was never used; that
    dead work has been removed.
    """
    with pu.caffe_cuda_context():
        linalg.init()
        for i in range(self.diff.shape[0]):
            projected = np.dot(bottom[1].data[i, :, :], bottom[0].data[i, :])
            self.diff[i, :] = (projected + bottom[2].data[i, :]
                               - bottom[3].data[i, :])
        top[0].data[...] = np.sum(self.diff**2) / bottom[3].num / 2.
        # Keep the scaled residual on the GPU, transposed.
        a_gpu = gpuarray.to_gpu(self.diff / bottom[3].num)
        self.transDiff = linalg.transpose(a_gpu)
def correlations(X, Y, useGPU):
    """Return the four cross-product matrices of ``X`` and ``Y``.

    Parameters
    ----------
    X, Y : array-like with the same number of rows
    useGPU : bool
        When true the products are computed with scikit-cuda; otherwise
        with NumPy.

    Returns
    -------
    (cxx, cxy, cyx, cyy) where ``cxx = X.T X``, ``cxy = X.T Y``,
    ``cyx = cxy.T`` and ``cyy = Y.T Y``.
    """
    if not useGPU:
        cxy = np.dot(X.T, Y)
        return np.dot(X.T, X), cxy, cxy.T, np.dot(Y.T, Y)

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import skcuda.linalg as linalg
    linalg.init()

    X_gpu = gpuarray.to_gpu(X)
    # The transpose is computed once and reused for both X.T X and X.T Y.
    XT_gpu = linalg.transpose(X_gpu)
    cxx = linalg.mdot(XT_gpu, X_gpu).get()
    # X itself is no longer needed on the device; release it early.
    X_gpu.gpudata.free()
    del X_gpu

    Y_gpu = gpuarray.to_gpu(Y)
    cxy = linalg.mdot(XT_gpu, Y_gpu).get()
    cyx = cxy.T
    YT_gpu = linalg.transpose(Y_gpu)
    cyy = linalg.mdot(YT_gpu, Y_gpu).get()
    return cxx, cxy, cyx, cyy
def __init__(self, n_proc=1, stop='p10', epsilon=6, dtype=np.float32):
    """Record the run settings and create this MPI rank's CUDA context.

    ``stop``, ``epsilon``, ``dtype`` and ``n_proc`` are stored as given.
    The rank is taken from ``MPI.COMM_WORLD``.  A CUDA context is created
    on device ``rank % device_count`` so that a job with more ranks than
    GPUs shares devices instead of failing on a non-existent device
    index; for ``rank < device_count`` this selects the same device as
    before.  scikit-cuda's linalg and cusparse layers are initialised and
    the kernels in ``gpucode.code`` are compiled into ``self.mod``.
    """
    self.stop = stop
    self.epsilon = epsilon
    self.profile = False
    self._locals = {}
    self.dtype = dtype
    self.comm = MPI.COMM_WORLD
    self.rank = self.comm.rank
    self.n_proc = n_proc
    self.sparse = False

    ngpus = driver.Device.count()
    gpuid = self.rank % ngpus
    self.ctx = driver.Device(gpuid).make_context()
    linalg.init()
    cusparse.init()
    self.mod = SourceModule(gpucode.code)
def run(images, init_avg_image, n_iters=2, trunc_param=10, beta=np.float64(1.0), ang_jump=1, max_shift=5, shift_jump=1,
        n_scales=10, is_remove_outliers=True, outliers_precent_removal=5):
    """Run EM on ``images`` and return the averaged image estimates.

    If the last image dimension exceeds ``config.max_image_size`` the EM
    is run on down-sampled copies and a single extra pass on the
    original-resolution images (outliers removed) produces the
    full-resolution average; otherwise both returned averages are the
    same object.

    Returns ``(im_avg_est, im_avg_est_orig, log_lik, opt_latent,
    outlier_ims_inds)``.
    """
    linalg.init()

    needs_downsample = np.shape(images)[-1] > config.max_image_size
    if needs_downsample:
        images_orig, init_avg_image_orig = images, init_avg_image
        # TODO: Itay to to handle the fact that returns complex
        small_images = data_utils.downsample_decorator(images, config.max_image_size)
        images = np.real(small_images).astype(init_avg_image.dtype)
        small_init = data_utils.downsample_decorator(init_avg_image, config.max_image_size)
        init_avg_image = np.real(small_init).astype(images.dtype)

    em = EM(images, init_avg_image, n_iters, trunc_param, beta, ang_jump, max_shift, shift_jump, n_scales,
            is_remove_outliers, outliers_precent_removal)
    im_avg_est, log_lik, opt_latent, outlier_ims_inds, posteriors = em.do_em()

    if not needs_downsample:
        return im_avg_est, im_avg_est, log_lik, opt_latent, outlier_ims_inds

    # Drop the outliers from both resolutions before the full-size pass.
    images_orig = np.delete(images_orig, outlier_ims_inds, axis=0)
    images = np.delete(images, outlier_ims_inds, axis=0)
    params = em.em_params
    em_post_process = EM(images_orig, init_avg_image_orig, em.n_iters, em.converter.truncation,
                         em.converter.beta, em.ang_jump, params['max_shift'], params['shift_jump'],
                         params['n_scales'], is_remove_outliers=False)
    em_post_process.do_one_pass_orig_images(posteriors, images_orig, images)
    im_avg_est_orig = em_post_process.converter.direct_backward(em_post_process.c_avg)[0]
    EM.plot_images(init_avg_image_orig, im_avg_est_orig, im_avg_est_orig)
    return im_avg_est, im_avg_est_orig, log_lik, opt_latent, outlier_ims_inds
def fast_matmul(x, y, x_type, y_type):
    '''
    Benchmark a set of scikit-cuda operations on ``x`` and ``y``.

    ``x`` and ``y`` are cast to ``x_type`` / ``y_type`` and uploaded to
    the GPU.  Returns a 3-tuple of host arrays:
    ``(x . y, x . x.T, sum of the rows of x)``.

    Besides the returned values the function prints the elapsed time of
    the timed section and a small column-normalisation demo on a random
    3x4 matrix.
    '''
    linalg.init()
    a_gpu = gpuarray.to_gpu(x.astype(x_type))
    a_t_gpu = gpuarray.to_gpu(x.T.copy().astype(x_type))
    b_gpu = gpuarray.to_gpu(y.astype(y_type))
    row_sum = 0

    t1_inside = time.time()
    c_gpu = linalg.dot(a_gpu, b_gpu)
    for a_i in a_gpu:
        row_sum += a_i
    # NOTE(review): the original layout was lost; the following throw-away
    # products are assumed to sit after the loop.  They only add to the
    # measured time and do not influence the returned values.
    gg = linalg.dot(a_gpu, b_gpu)
    gg = linalg.dot(a_i, a_i)
    gg = reduce(linalg.dot, (a_gpu, b_gpu, b_gpu, b_gpu))
    z_gpu = a_gpu.copy()
    tmp = a_t_gpu
    a_prod = linalg.dot(a_gpu, tmp)
    t2_inside = time.time()
    print('inside cost {:.4f}s'.format(t2_inside - t1_inside))

    a = np.random.randint(-5, 5, (3, 4)).astype(np.float32)
    a_gpu = gpuarray.to_gpu(a)
    norm_gpu = linalg.norm(a_gpu)
    # CPU and GPU norms differ by rounding, so compare with a tolerance
    # instead of exact float equality.
    print('is norm right?', np.isclose(np.linalg.norm(a), norm_gpu))

    # Column-wise L1 normalisation demo: divide each entry by its column sum.
    a_gpu = abs(a_gpu)
    column_sum = misc.sum(a_gpu, axis=0)
    column_sum = column_sum.reshape((1, -1))
    all_one_gpu = gpuarray.to_gpu(np.ones((3, 1), np.float32))
    div_mat_gpu = linalg.dot(all_one_gpu, column_sum)
    norm_1 = a_gpu / (div_mat_gpu + 1e-3)
    print(a_gpu)
    print(column_sum)
    print(column_sum.shape)
    print(norm_1)
    return c_gpu.get(), a_prod.get(), row_sum.get()
def __init__(self, optimize=True):
    """Set up an unfitted model and initialise scikit-cuda.

    Starts with no training data, kernel parameters ``l`` and ``sigma_f``
    both equal to 1.0, and launch sizes ``blockNum=1024`` /
    ``threadNum=32``.
    """
    self.is_fit = False
    self.train_X = None
    self.train_y = None
    self.params = dict(l=1.0, sigma_f=1.0)
    self.optimize = optimize
    self.blockNum, self.threadNum = 1024, 32
    sklin.init()
def __init__(self, filepath, unknown_word='</s>', limit=100):
    """Load word2vec vectors and cache the normalised lookup table.

    Reads at most ``limit`` vectors from ``filepath``, normalises them via
    ``init_sims()`` and stores the normalised vectors and the vocabulary
    keys as NumPy arrays.  ``used_gpu`` is set to the index of the last
    entry in ``cuda.gpus`` and scikit-cuda's linalg layer is initialised.
    """
    vectors = KeyedVectors.load_word2vec_format(filepath, limit=limit)
    self.model = vectors
    vectors.init_sims()
    self.lookup_table = np.asarray(vectors.wv.vectors_norm)
    words = list(vectors.wv.vocab.keys())
    self.vocabulary = np.asarray(words)
    self.unknown_word = unknown_word
    self.used_gpu = len(cuda.gpus) - 1
    linalg.init()
def _init_cublas(self):
    """Choose the cuBLAS handle stored in ``self.cublas_handle``.

    A handle given under ``extra_options["cublas_handle"]`` is used as
    is.  Otherwise scikit-cuda's global handle is used, and created first
    if it does not exist yet.
    """
    import pycuda.autoinit
    if "cublas_handle" not in self.extra_options:
        if skmisc._global_cublas_handle is None:
            cublas.init()  # cublas handle + allocator
        self.cublas_handle = skmisc._global_cublas_handle
        return
    self.cublas_handle = self.extra_options["cublas_handle"]
def skcuda_linalg(a, b):
    """Print and return ``dot(a.T, b) / (norm(a) * norm(b))`` computed on the GPU.

    Both inputs are converted to ``float32`` before upload.  The quotient
    is printed (as before) and is now also returned, so callers can use
    it; callers that ignored the former ``None`` result are unaffected.
    """
    linalg.init()
    a = np.asarray(a, np.float32)
    b = np.asarray(b, np.float32)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    # Third positional argument is ``transa``: the first operand is transposed.
    c_gpu = linalg.dot(a_gpu, b_gpu, 'T')
    a_nrm = linalg.norm(a_gpu)
    b_nrm = linalg.norm(b_gpu)
    ans = misc.divide(c_gpu, a_nrm * b_nrm)
    # Single-argument call form prints identically on Python 2 and 3.
    print(ans)
    return ans
def make_sample_data(set_: int):
    """Generate sample data set ``set_`` and write it to a parquet file.

    ``set_`` selects the distribution: 1 = uniform, 2 = three Gaussian
    clusters, 3 = ten Gaussian clusters.  For every vector the exact
    nearest neighbours (10 to 50 in steps of 5) and a random-hyperplane
    LSH hash computed on the GPU are stored alongside the vector.

    Raises
    ------
    ValueError
        If ``set_`` is not 1, 2 or 3 (previously this surfaced as a
        ``NameError`` on the undefined ``data``).
    """
    if set_ not in (1, 2, 3):
        raise ValueError(f"unknown sample data set {set_!r}; expected 1, 2 or 3")

    np.random.seed(set_ * 4347)
    if set_ == 1:
        # Uniform distribution
        data = np.random.uniform(0, 1, size=(samples, num_features))
    elif set_ == 2:
        # 3 Gaussian distribution
        data = multi_gauss_clusters(n_clusters=3)
    else:
        # 10 Gaussian distribution
        data = multi_gauss_clusters(n_clusters=10)

    df = pd.DataFrame()
    np.random.shuffle(data)
    df['vec'] = data.tolist()

    # find nearest neighbours
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=51, algorithm='ball_tree', leaf_size=30).fit(data)
    _, nbrs_indices = nbrs.kneighbors(data)
    for n_nbr in range(10, 51, 5):
        # Column 0 is skipped: it is presumably the query point itself.
        df[f"known_neighbours_{n_nbr}"] = [x[1:(n_nbr + 1)] for x in nbrs_indices]

    # hash using random hyperplane LSH
    import pycuda.gpuarray as gpuarray
    import skcuda.linalg as linalg
    import pycuda.autoinit
    linalg.init()
    # NOTE(review): CUDA_HOME is set after pycuda has already been
    # initialised; confirm whether it is meant to take effect earlier.
    os.environ['CUDA_HOME'] = "/opt/cuda/"

    vec_np = np.array(df['vec'].values.tolist(), dtype=np.float32)
    LSH = LSHBias(feature_dim=num_features, bits=LSH_NUM_BITS)
    W = np.array(LSH.W, dtype=np.float32)
    b_gpu = gpuarray.to_gpu(W)
    # Append a column of ones to the vectors before multiplying by W.
    ones = np.ones(shape=(vec_np.shape[0], 1), dtype=np.float32)
    X = np.concatenate((vec_np, ones), axis=1)

    # do the matrix multiplication
    a_gpu = gpuarray.to_gpu(X)
    mul = linalg.mdot(a_gpu, b_gpu)

    # get binary: 1 if value >= 0, else 0
    res = gpuarray.if_positive(
        mul >= gpuarray.zeros(mul.shape, dtype=np.float32),
        then_=gpuarray.ones_like(mul),
        else_=gpuarray.zeros_like(mul))
    res = np.array(res.get(), dtype=np.uint32)

    # convert grouped bits to integers
    res = np_array_binary_to_grouped_integers(res)
    df[f"hash_{LSH_NUM_BITS}_bits"] = list(res)
    df.to_parquet(f"{config.CUDA_neighbour_search_df_dir}df-{set_}.parquet", index=False)
    print("created test-data")
def fast_add(x, y, x_type, y_type):
    '''
    use pycuda to compute c = a + b

    ``x`` and ``y`` are cast to ``x_type`` / ``y_type``, uploaded to the
    GPU and added element-wise; the elapsed time of the addition is
    printed and the sum is returned as a host array.
    '''
    linalg.init()
    lhs_gpu = gpuarray.to_gpu(x.astype(x_type))
    rhs_gpu = gpuarray.to_gpu(y.astype(y_type))

    started = time.time()
    total_gpu = lhs_gpu + rhs_gpu
    elapsed = time.time() - started
    print('inside cost {:.4f}s'.format(elapsed))
    return total_gpu.get()
def filter(self):
    """Convolve ``self.series[0]`` with ``self.series[1]`` via FFT on the GPU.

    Both series are zero-padded to ``determine_size(len(signal) +
    len(window) - 1)``, transformed with cuFFT, multiplied, scaled by 2
    and transformed back.  Dtypes come from ``self.precision['float']``
    and ``self.precision['complex']``.

    A fresh CUDA context is created for the call and is always popped
    again, even when an error occurs part-way through.

    Returns the complex result as a host NumPy array.
    """
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context

    cuda.init()
    context = make_default_context()
    try:
        signal = self.series[0]
        window = self.series[1]
        linalg.init()
        nfft = determine_size(len(signal) + len(window) - 1)
        float_t = self.precision['float']
        complex_t = self.precision['complex']

        # Move data to GPU
        sig_zero_pad = np.zeros(nfft, dtype=float_t)
        win_zero_pad = np.zeros(nfft, dtype=float_t)
        sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=float_t)
        win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=float_t)
        sig_zero_pad[0:len(signal)] = signal
        win_zero_pad[0:len(window)] = window
        sig_gpu.set(sig_zero_pad)
        win_gpu.set(win_zero_pad)

        # Plan forwards
        sig_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
        win_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
        sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, float_t, complex_t)
        win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, float_t, complex_t)
        cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
        cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

        # Convolve
        out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
        linalg.scale(2.0, out_fft)

        # Plan inverse
        out_gpu = gpuarray.zeros_like(out_fft)
        plan_inverse = cu_fft.Plan(out_fft.shape, complex_t, complex_t)
        cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
        out_np = np.zeros(len(out_gpu), complex_t)
        out_gpu.get(out_np)
        return out_np
    finally:
        # Never leave the context on the stack, whatever happened above.
        context.pop()
def __enter__(self):
    """Load the data, create this rank's CUDA context and upload to the GPU.

    The device is chosen as ``self.rank % device_count``.  Returns
    ``self`` so that ``with obj as handle:`` binds the object itself
    (previously the statement bound ``None``).
    """
    self.load()
    ngpus = driver.Device.count()
    gpuid = self.rank % ngpus
    self.ctx = driver.Device(gpuid).make_context()
    cusparse.init()
    linalg.init()
    self.load_gpu()
    return self
def backward(self, top, propagate_down, bottom):
    """Fill ``bottom[0].diff`` column by column.

    For every component ``k`` in ``range(self.nPCAcoms)`` the column
    ``bottom[0].diff[:, k]`` is set to
    ``trace(bottom[1].data[:, :, k] . self.transDiff)``, computed on the
    GPU.  ``top`` and ``propagate_down`` are not consulted.
    """
    with pu.caffe_cuda_context():
        linalg.init()
        for comp in range(self.nPCAcoms):
            basis_gpu = gpuarray.to_gpu(bottom[1].data[:, :, comp])
            product_gpu = linalg.dot(basis_gpu, self.transDiff)
            bottom[0].diff[:, comp] = linalg.trace(product_gpu)
def init_gpu(dev=0):
    """Bind the GPU modules to module globals and initialise device ``dev``.

    Publishes ``gp`` (pycuda.gpuarray), ``cm`` (pycuda.cumath),
    ``lg`` (skcuda.linalg) and ``msc`` (skcuda.misc), creates a context
    on the requested device, initialises skcuda's linalg layer, builds
    the gather kernel ``slf`` (``y[i] = x[ind[i]]``) and sets ``_gpu`` to
    ``True``.
    """
    global gp, lg, cm, msc, slf, _gpu
    import pycuda.gpuarray as gp
    import pycuda.elementwise as ew
    import pycuda.cumath as cm
    import skcuda.linalg as lg
    import skcuda.misc as msc

    device = msc.init_device(dev)
    msc.init_context(device)
    lg.init()
    gather_args = "float * y, float * x, unsigned * ind"
    gather_body = "y[i] = x[ind[i]]"
    slf = ew.ElementwiseKernel(gather_args, gather_body)
    _gpu = True
def filter_fft_cuda(signal: np.array, window: np.array, prec: dict):
    """
    Low-pass filter ``signal`` with ``window`` by FFT convolution on the GPU.

    Importing ``pycuda.autoinit`` inside the function sets up the CUDA
    environment on each call.

    :param signal: The input series
    :param window: The input window
    :param prec: The precision entry; its ``'float'`` and ``'complex'``
        items give the dtypes used on the GPU
    :return: The filtered signal (complex host array)
    """
    import pycuda.autoinit  # Here because it initialises a new cuda environment every trial.
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg
    linalg.init()

    real_t, cplx_t = prec['float'], prec['complex']
    padded_len = determine_size(len(signal) + len(window) - 1)

    # Zero-pad both inputs on the host, then copy them to the device.
    host_signal = np.zeros(padded_len, dtype=real_t)
    host_window = np.zeros(padded_len, dtype=real_t)
    dev_signal = gpuarray.zeros(host_signal.shape, dtype=real_t)
    dev_window = gpuarray.zeros(host_window.shape, dtype=real_t)
    host_signal[0:len(signal)] = signal
    host_window[0:len(window)] = window
    dev_signal.set(host_signal)
    dev_window.set(host_window)

    # Forward transforms
    dev_signal_fft = gpuarray.zeros(padded_len, dtype=cplx_t)
    dev_window_fft = gpuarray.zeros(padded_len, dtype=cplx_t)
    signal_plan = cu_fft.Plan(dev_signal_fft.shape, real_t, cplx_t)
    window_plan = cu_fft.Plan(dev_window_fft.shape, real_t, cplx_t)
    cu_fft.fft(dev_signal, dev_signal_fft, signal_plan)
    cu_fft.fft(dev_window, dev_window_fft, window_plan)

    # Point-wise product in the frequency domain, scaled by 2
    dev_product = linalg.multiply(dev_signal_fft, dev_window_fft, overwrite=True)
    linalg.scale(2.0, dev_product)

    # Inverse transform and copy back to the host
    dev_result = gpuarray.zeros_like(dev_product)
    inverse_plan = cu_fft.Plan(dev_product.shape, cplx_t, cplx_t)
    cu_fft.ifft(dev_product, dev_result, inverse_plan, True)
    host_result = np.zeros(len(dev_result), cplx_t)
    dev_result.get(host_result)
    return host_result
def __init__(self, shape, nframes, qmask=None, weights=None, scale_factor=None,
             precompute_fft_plans=False, extra_options=None):
    """Initialise the base class, then the CUDA resources.

    All arguments are forwarded unchanged to the parent constructor.
    ``extra_options`` defaults to a new empty dict per instance (the
    previous ``{}`` default was one dict shared by every instance).

    Raises
    ------
    ImportError
        If ``CUFFT`` is ``None``, i.e. pycuda / scikit-cuda are missing.
    """
    if extra_options is None:
        extra_options = {}
    super().__init__(
        shape, nframes, qmask=qmask, weights=weights, scale_factor=scale_factor,
        precompute_fft_plans=precompute_fft_plans, extra_options=extra_options
    )
    if CUFFT is None:
        raise ImportError("pycuda and scikit-cuda need to be installed")
    # Create scikit-cuda's global cuBLAS handle only if nobody has yet.
    if skmisc._global_cublas_handle is None:
        cublas.init()
    self._allocate_cuda_arrays()
    self._compile_kernels()
def __init__(self, cuda=False, exit_on_prompt=False, language='en',
             limiting_magnitude=None, prefer_fluxes=False, offline=False,
             prefer_cache=False, open_in_browser=False, pool=None,
             quiet=False, test=False, wrap_length=100, **kwargs):
    """Initialize `Fitter` class.

    Builds the pool (a ``SerialPool`` when none is given), the printer
    and the fetcher, stores the remaining options as private attributes
    and, when ``cuda`` is set, tries to initialise pycuda / scikit-cuda.
    Extra keyword arguments are accepted and ignored.
    """
    if pool is None:
        pool = SerialPool()
    self._pool = pool
    self._printer = Printer(
        pool=self._pool, wrap_length=wrap_length, quiet=quiet,
        fitter=self, language=language, exit_on_prompt=exit_on_prompt)
    self._fetcher = Fetcher(
        test=test, open_in_browser=open_in_browser, printer=self._printer)

    self._cuda = cuda
    self._limiting_magnitude = limiting_magnitude
    self._prefer_fluxes = prefer_fluxes
    self._offline = offline
    self._prefer_cache = prefer_cache
    self._open_in_browser = open_in_browser
    self._quiet = quiet
    self._test = test
    self._wrap_length = wrap_length

    if not self._cuda:
        return
    # Best effort: a missing CUDA stack is tolerated silently here.
    try:
        import pycuda.autoinit  # noqa: F401
        import skcuda.linalg as linalg
        linalg.init()
    except ImportError:
        pass
def __init__(self, n_proc=1, stop='p10', epsilon=6, dtype=np.float32):
    """Set up a single-process run on CUDA device 0.

    Only ``stop`` and ``epsilon`` are taken from the arguments;
    ``n_proc`` and ``dtype`` are accepted but the instance always uses
    ``n_proc = 1``, ``rank = 0`` and ``dtype = np.float32``.  A context
    is created on device 0, scikit-cuda's linalg and cusparse layers are
    initialised and ``gpucode.code`` is compiled into ``self.mod``.
    """
    self.stop, self.epsilon = stop, epsilon
    self.profile = False
    self.memory = {}
    self._locals = {}
    self.dtype = np.float32
    self.sparse = False
    self.n_proc, self.rank = 1, 0

    device = driver.Device(0)
    self.ctx = device.make_context()
    linalg.init()
    cusparse.init()
    self.mod = SourceModule(gpucode.code)
def logis(y, x):
    """Iteratively re-fit regression coefficients on the GPU.

    Starts from the least-squares solution ``inv(x.T x) x.T y`` and then
    repeats a weighted update, stopping after 10 iterations or as soon as
    the norm of the coefficient change drops below 1e-5.

    Returns a dict with the iteration count (``"iteraciones"``), the
    coefficients held when the loop stopped (``"Betas"``) and the elapsed
    time in seconds (``"time"``).
    """
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    start = time.time()

    # Move the variables to the GPU
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    linalg.init()

    # Transpose of X
    x_gpu_T = linalg.transpose(x_gpu)
    gram_inv = linalg.inv(linalg.dot(x_gpu_T, x_gpu))
    beta_gpu = linalg.dot(linalg.dot(gram_inv, x_gpu_T), y_gpu)

    for j in range(1, 11):
        mu = sapply(x, beta_gpu.get()).astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu)
        V_gpu = linalg.diag(mu_gpu)
        variance_gpu = linalg.multiply(mu_gpu, 1 - mu_gpu)
        inv_variance_gpu = linalg.diag(1 / variance_gpu)
        residual_gpu = y_gpu - mu_gpu
        adjust_gpu = linalg.dot(inv_variance_gpu, residual_gpu)

        # Replace NaNs (if any) on the host and upload the cleaned values.
        adjust_cpu = adjust_gpu.get()
        if np.isnan(adjust_cpu).any():
            adjust_cpu = nanValue(adjust_cpu)
            adjust_gpu = gpuarray.to_gpu(adjust_cpu.astype(np.float32))

        y_1_gpu = linalg.dot(x_gpu, beta_gpu) + adjust_gpu
        weighted_gram = linalg.dot(linalg.dot(x_gpu_T, V_gpu), x_gpu)
        projector = linalg.dot(linalg.dot(linalg.inv(weighted_gram), x_gpu_T), V_gpu)
        beta_1_gpu = linalg.dot(projector, y_1_gpu)

        check_value = np.absolute(linalg.norm(beta_1_gpu - beta_gpu))
        if j == 10 or check_value < 0.00001:
            break
        beta_gpu = beta_1_gpu

    tiempo = time.time() - start
    return {"iteraciones": j, "Betas": beta_gpu.get(), "time": tiempo}
def full_matrix(self):
    """
    Compute the full prediction matrix from the biases, P and Q.

    Entry ``[u, i]`` is ``b + b_u[u] + b_i[i] + P[u] . Q[i]``.  The
    product ``P Q^T`` is evaluated with scikit-cuda when ``self.use_gpu``
    is set and with NumPy otherwise.
    """
    # Broadcast the user biases down the rows and the item biases across
    # the columns.
    biases = self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :]
    if self.use_gpu:
        linalg.init()
        p_gpu = gpuarray.to_gpu(self.P)
        q_gpu = gpuarray.to_gpu(self.Q)
        # transb='T' multiplies by the transpose of the second operand.
        interactions = linalg.dot(p_gpu, q_gpu, transb='T').get()
    else:
        interactions = self.P.dot(self.Q.T)
    return biases + interactions
def __init__(self, inputs, outputs, norm=None, precision=np.float64):
    """Initialise the base network, then the scikit-cuda back end.

    Selects the LAPACK ``posv`` routine matching ``precision``, creates a
    cuBLAS handle, compiles the element-wise ``dev_sigm`` kernel for the
    instance precision and registers the GPU transformation functions in
    ``self.func``.
    """
    super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision)

    # startup GPU
    # The device is not selected here (that crashed and leaked memory);
    # use CUDA_DEVICE=0 python my-script.py instead.
    try:
        linalg.init()
    except OSError as e:
        # no 'cusolver' library which is paid and not needed
        pass

    # precision-dependent stuff
    self.posv = lapack.dposv if precision is np.float64 else lapack.sposv
    self.handle = cublas.cublasCreate()

    # prepare GPU function kernels
    kernel = """ __global__ void dev_sigm(%s *a) { unsigned idx = blockDim.x * blockIdx.x + threadIdx.x; a[idx] = 1.0 / ( exp(a[idx]) + 1 ); } """
    ctype = "double" if self.precision is np.float64 else "float"
    kernel = kernel % ctype
    self.dev_sigm = SourceModule(kernel).get_function("dev_sigm")
    self.dev_sigm.prepare("P")

    # GPU transformation functions
    gpu_functions = (
        ("lin", self._dev_lin),
        ("sigm", self._dev_sigm),
        ("tanh", self._dev_tanh),
        ("rbf_l1", self._dev_rbfl1),
        ("rbf_l2", self._dev_rbfl2),
        ("rbf_linf", self._dev_rbflinf),
    )
    for name, transform in gpu_functions:
        self.func[name] = transform
def get_rating(self, i, j):
    """
    Get the predicted rating of user i and item j.

    The prediction is ``b + b_u[i] + b_i[j] + P[i] . Q[j]``; the dot
    product runs on the GPU when ``self.use_gpu`` is set and in NumPy
    otherwise.  Only the two rows involved are copied to the GPU rather
    than the whole of ``P`` and ``Q``.
    """
    bias = self.b + self.b_u[i] + self.b_i[j]
    if self.use_gpu:
        linalg.init()
        # Upload just the rows needed; ascontiguousarray keeps the upload
        # valid whatever the memory layout of P and Q.
        p_row_gpu = gpuarray.to_gpu(np.ascontiguousarray(self.P[i, :]))
        q_row_gpu = gpuarray.to_gpu(np.ascontiguousarray(self.Q[j, :]))
        return bias + linalg.dot(p_row_gpu, q_row_gpu, transb='T')
    return bias + self.P[i, :].dot(self.Q[j, :].T)
def initParallelAlgorithms():
    """Compile the CUDA kernels and warm up the GPU primitives.

    Loads ``bitonicSort`` from ``ParallelAlgorithms/bitonicSort.cu`` and
    ``finishCSM`` / ``getSumSquares`` from
    ``ParallelAlgorithms/CSMHelper.cu`` and publishes them as the module
    globals ``bitonicSort_``, ``finishCSM_`` and ``getSumSquares_``.
    Each primitive is then run once on a dummy 16x16 matrix so that it is
    pre-compiled before its first real use.
    """
    global bitonicSort_
    global finishCSM_
    global getSumSquares_

    # ``with`` guarantees the source files are closed even if compilation fails.
    with open("ParallelAlgorithms/bitonicSort.cu") as fin:
        mod = SourceModule(fin.read())
    bitonicSort_ = mod.get_function("bitonicSort")

    with open("ParallelAlgorithms/CSMHelper.cu") as fin:
        mod = SourceModule(fin.read())
    finishCSM_ = mod.get_function("finishCSM")
    getSumSquares_ = mod.get_function("getSumSquares")

    # Run each of the algorithms on dummy data so that they're pre-compiled
    # 1) Bitonic sort
    X = np.random.randn(16, 16)
    N = np.int32(16)
    NPow2 = N
    # Block dimensions must be integers; plain "/" yields a float on Python 3.
    NThreads = int(N) // 2
    XG = gpuarray.to_gpu(X)
    bitonicSort_(XG, N, NPow2, block=(NThreads, 1, 1),
                 grid=(X.shape[0], 1), shared=4 * NPow2)

    linalg.init()
    # 2) Other primitive operations (results are discarded; warm-up only)
    linalg.dot(XG, XG)
    skcuda.misc.add(XG, XG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    skcuda.misc.add_matvec(XG, XSqr, 0)
def initParallelAlgorithms():
    """
    Compile all of the parallel algorithms.

    Builds the kernels in ``MatMul.cu`` and ``OtherUtils.cu`` (fetched
    with ``getResourceString``), exposes each kernel as a module global
    with a trailing underscore, and initialises scikit-cuda.
    """
    global MatMulNaive_, MatMulConv2D_
    global ZerosToOnes_, TileWDenom_, TileHDenom_, bitonicSortNonneg_

    matmul_module = SourceModule(getResourceString("MatMul.cu"))
    MatMulNaive_ = matmul_module.get_function("MatMulNaive")
    MatMulConv2D_ = matmul_module.get_function("MatMulConv2D")

    utils_module = SourceModule(getResourceString("OtherUtils.cu"))
    ZerosToOnes_ = utils_module.get_function("ZerosToOnes")
    TileWDenom_ = utils_module.get_function("TileWDenom")
    TileHDenom_ = utils_module.get_function("TileHDenom")
    bitonicSortNonneg_ = utils_module.get_function("bitonicSortNonneg")

    linalg.init()
    skcuda.misc.init()
def __init__(self, arch='cpu', gpu_context=None, dtype='float32'):
    """Record the architecture and, for ``'gpu'``, bring up the CUDA stack.

    With ``arch == 'gpu'`` the pycuda / scikit-cuda modules are imported
    into module globals, ``gpu_context`` (or a new default context when it
    is ``None``) is stored together with its device, and scikit-cuda's
    linalg layer is initialised.  ``dtype`` is accepted but not used here.
    """
    if arch == 'gpu':
        global pycuda, gpuarray, cumath
        global ElementwiseKernel, SourceModule, culinalg, misc
        import pycuda.tools
        import pycuda.gpuarray as gpuarray
        import pycuda.cumath as cumath
        from pycuda.elementwise import ElementwiseKernel
        from pycuda.compiler import SourceModule
        import skcuda.linalg as culinalg
        import skcuda.misc as misc

        # Only create a context when the caller did not supply one.
        self.context = (gpu_context if gpu_context is not None
                        else pycuda.tools.make_default_context())
        self.device = self.context.get_device()
        culinalg.init()
    self.arch = arch
import numpy as np import pycuda.gpuarray as gpuarray import pycuda.autoinit import skcuda.linalg as linalg import time v0 = np.loadtxt('V0.txt').astype(np.float32) w0 = np.loadtxt('W0.txt').astype(np.float32) X = np.loadtxt('nnmf-2429-by-361-face.txt').astype(np.float32) linalg.init() def NNMF_gpu(X,r,tol,V=v0,W=w0,verbose=1): Vr = V[:,0:r].copy() Wr = W[0:r,:].copy() X_gpu = gpuarray.to_gpu(X) V_gpu = gpuarray.to_gpu(Vr) W_gpu = gpuarray.to_gpu(Wr) #Frobinius norm at previous step B_gpu = linalg.dot(V_gpu, W_gpu) L = linalg.norm(X_gpu-B_gpu)**2 iteration = 0 while 1: #update V V_gpu *= linalg.dot(X_gpu,linalg.transpose(W_gpu)) V_gpu /= linalg.dot(B_gpu,linalg.transpose(W_gpu)) B_gpu = linalg.dot(V_gpu, W_gpu) #update W W_gpu *= linalg.dot(linalg.transpose(V_gpu),X_gpu) W_gpu /= linalg.dot(linalg.transpose(V_gpu),B_gpu) B_gpu = linalg.dot(V_gpu, W_gpu) Lnew = linalg.norm(X_gpu-B_gpu)**2 if abs(Lnew-L) <= tol*(L+1):
def fusion_images(multispectral, panchromatic, save_image=False, savepath=None, timeCondition=True):
    """Fuse a multispectral and a panchromatic image on the GPU (Brovey).

    Parameters
    ----------
    multispectral : ndarray with three dimensions and exactly 3 channels
    panchromatic : ndarray with two dimensions
    save_image : bool
        Write the result to ``broveygpu_image.tif`` (inside ``savepath``
        when given, else in the working directory).
    savepath : str or None
    timeCondition : bool
        When true return ``{"image": ..., "time": ...}``, otherwise only
        the fused image.

    Exits via ``sys.exit`` when either input has the wrong layout; a
    multispectral input without a third axis now gets that same message
    instead of an ``IndexError``.
    """
    # Check that both images meet the requirements
    if len(multispectral.shape) == 3 and multispectral.shape[2] == 3:
        print('The Multispectral image has ' + str(multispectral.shape[2]) +
              ' channels and size of ' + str(multispectral.shape[0]) + 'x' +
              str(multispectral.shape[1]))
    else:
        sys.exit('The first image is not multispectral')
    if len(panchromatic.shape) == 2:
        print(' The Panchromatic image has a size of ' +
              str(panchromatic.shape[0]) + 'x' + str(panchromatic.shape[1]))
    else:
        sys.exit('The second image is not panchromatic')

    # Convert to float32 and split the multispectral image into its bands
    multispectral = multispectral.astype(np.float32)
    r = multispectral[:, :, 0].astype(np.float32)
    g = multispectral[:, :, 1].astype(np.float32)
    b = multispectral[:, :, 2].astype(np.float32)
    # Convert the panchromatic image to float32
    panchromatic = panchromatic.astype(np.float32)
    # Sum of the multispectral bands
    msuma = r + g + b

    start = time.time()
    r_gpu = gpuarray.to_gpu(r)
    g_gpu = gpuarray.to_gpu(g)
    b_gpu = gpuarray.to_gpu(b)
    panchromatic_gpu = gpuarray.to_gpu(panchromatic)
    msuma_gpu = gpuarray.to_gpu(msuma)
    linalg.init()

    # Per band: step_1 with the band sum, then step_2 with the panchromatic.
    m11_gpu = step_1(r_gpu, msuma_gpu)
    m22_gpu = step_2(m11_gpu, panchromatic_gpu)
    m33_gpu = step_1(b_gpu, msuma_gpu)
    m44_gpu = step_2(m33_gpu, panchromatic_gpu)
    m55_gpu = step_1(g_gpu, msuma_gpu)
    m66_gpu = step_2(m55_gpu, panchromatic_gpu)

    # Per band: step_3 yields a max/min pair that step_4 uses to fill the
    # output band.
    Amax_host, Amin_host = step_3(m22_gpu)
    rr_gpu = gpuarray.empty_like(r_gpu)
    step_4(m22_gpu, rr_gpu, Amax_host, Amin_host)
    Amax_host, Amin_host = step_3(m66_gpu)
    gg_gpu = gpuarray.empty_like(g_gpu)
    step_4(m66_gpu, gg_gpu, Amax_host, Amin_host)
    Amax_host, Amin_host = step_3(m44_gpu)
    bb_gpu = gpuarray.empty_like(b_gpu)
    step_4(m44_gpu, bb_gpu, Amax_host, Amin_host)
    end = time.time()

    ggg_host = gg_gpu.get().astype(np.uint8)
    rrr_host = rr_gpu.get().astype(np.uint8)
    bbb_host = bb_gpu.get().astype(np.uint8)

    # Combine the resulting bands
    fusioned_image = np.stack((rrr_host, ggg_host, bbb_host), axis=2)
    if save_image:
        # Save the result, inside savepath when one was given
        if savepath is not None:
            skimage.io.imsave(savepath + '/broveygpu_image.tif', fusioned_image, plugin='tifffile')
        else:
            skimage.io.imsave('broveygpu_image.tif', fusioned_image, plugin='tifffile')

    # Execution time of the Brovey transform on the GPU
    time_calculated = end - start
    if timeCondition:
        return {"image": fusioned_image, "time": time_calculated}
    return fusioned_image
import theano import numpy as np from pycuda import gpuarray, cumath from skcuda import linalg, misc, cudart from theano.tensor import as_tensor_variable from theano.gof import Op, Apply import theano.tensor as tt import pycuda.autoinit cudart.cudaSetDevice(0) linalg.init() class logDect(Op): def make_node(self, *inputs): alpha = as_tensor_variable(inputs[0]) xt = as_tensor_variable(inputs[1]) xf = as_tensor_variable(inputs[2]) ll = as_tensor_variable(.1) return Apply(self, [alpha, xt, xf], [ll.type()]) def make_thunk(self, node, storage_map, compute_map, rem=None, impl=None, no_recycling=[]): inputs = [storage_map[v] for v in node.inputs] outputs = [storage_map[v] for v in node.outputs] A = gpuarray.to_gpu(self.A) b = gpuarray.to_gpu(self.b)
#!/usr/bin/env python """ Demonstrates how to transpose matrices on the GPU. """ from __future__ import print_function import pycuda.autoinit import pycuda.driver as drv import pycuda.gpuarray as gpuarray import numpy as np import skcuda.linalg as culinalg import skcuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string demo_types = [np.float32, np.complex64] if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print('Testing transpose for type ' + str(np.dtype(t))) if np.iscomplexobj(t()): b = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], t) else: a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], t)
def elmvis(Xraw, A, slowdown=10, report=5, maxtime=24*60*60, tol=0, batch=None, maxiter=None, maxupdate=None,
           maxstall=None, cossim=None, silent=False):
    """ELMVIS+ function running in GPU memory.

    Repeatedly picks two random samples and swaps them whenever the swap
    changes the similarity by more than ``tol``.  ``Xraw`` is reordered
    in place at the end.

    The search stops when any of these holds: ``maxiter`` iterations
    done, ``maxstall`` consecutive iterations without a swap, more than
    ``maxupdate`` swaps, more than ``maxtime`` seconds elapsed, or the
    swap rate measured over a ``report``-second window falls below
    ``1/slowdown`` of the best rate seen.  ``batch`` is accepted but not
    used.

    Returns ``(I, info)``: the final sample order and a dict with
    ``cossim``, ``iters``, ``updates``, ``ips`` and ``ups``.
    """
    X = Xraw / np.linalg.norm(Xraw, axis=1)[:, None]  # unit-length version of X
    N, d = X.shape
    I = np.arange(N)  # index of samples

    # set default values
    if cossim is None:
        cossim = np.trace(X.T.dot(A).dot(X)) / N
    if maxiter is None:
        maxiter = N*N*N
    if maxupdate is None:
        maxupdate = N*N
    if maxstall is None:
        maxstall = N*N

    # The print calls use one pre-formatted argument so that the output is
    # the same on Python 2 and Python 3.
    if not silent:
        print("original similarity:  %s" % (cossim,))

    # init GPU
    dt = X.dtype.type
    try:
        linalg.init()
    except ImportError as e:
        print(e)
    devA = gpuarray.to_gpu(A.astype(dt))
    devX = gpuarray.to_gpu(X.astype(dt))
    devXi1 = gpuarray.empty((d,), dtype=dt)
    devXh = linalg.dot(devA, devX)  # X_hat, predicted value of X
    devAi = gpuarray.empty((N, 2), dtype=dt)
    devDelta = gpuarray.empty((2, d), dtype=dt)
    result = gpuarray.empty((d,), dtype=dt)

    # swap kernel; the grid is rounded up to whole blocks, so threads with
    # j >= d must not touch memory.
    kernel = """
    __global__ void diff(%s *A, %s *Y, %s *AY, %s *result, long d, long N, long i1, long i2) {
        long j = blockDim.x * blockIdx.x + threadIdx.x;
        if (j >= d) return;
        %s yi1 = Y[i1*d + j];
        %s yi2 = Y[i2*d + j];
        result[j] = (A[i1*N + i1] * (yi2 - yi1) + 2*AY[i1*d + j]) * (yi2 - yi1) +
                    (A[i2*N + i2] * (yi1 - yi2) + 2*(AY[i2*d + j] + A[i2*N + i1]*(yi2 - yi1))) * (yi1 - yi2);
    }
    """
    ctype = "double" if dt is np.float64 else "float"
    kernel = kernel % ((ctype,) * 6)
    mod_diff = SourceModule(kernel)
    dev_diff = mod_diff.get_function("diff")
    dev_diff.prepare("PPPPllll")
    block = result._block
    grid = (int(np.ceil(1.0 * result.shape[0] / block[0])), 1)

    t0 = tlast = time()
    stall = 0
    iters = 0
    updates = 0
    updates_last = 0
    iters_last = 0
    ups_max = 0

    # With fewer than two samples there is no pair to swap (and drawing
    # two distinct indices would never terminate).
    while N > 1 and (iters < maxiter) and (stall < maxstall):
        iters += 1
        stall += 1

        # get two different random numbers
        i1, i2 = np.random.randint(0, N, size=2)
        while i1 == i2:
            i1, i2 = np.random.randint(0, N, size=2)

        dev_diff.prepared_call(grid, block, devA.gpudata, devX.gpudata, devXh.gpudata, result.gpudata,
                               d, N, i1, i2)
        diff = np.sum(result.get())

        if diff > tol:
            stall = 0
            # Update X_hat for the swapped pair, then swap the bookkeeping
            # index and the two rows of X on the device.
            devAi[:, 0] = devA[:, i1]
            devAi[:, 1] = devA[:, i2]
            devDelta[0, :] = devX[i1, :] - devX[i2, :]
            devDelta[1, :] = devX[i2, :] - devX[i1, :]
            linalg.add_dot(devAi, devDelta, devXh, alpha=-1)

            I[i1], I[i2] = I[i2], I[i1]

            devXi1[:] = devX[i1, :]
            devX[i1] = devX[i2]
            devX[i2] = devXi1

            cossim += diff / N
            updates += 1
            if updates > maxupdate:
                break

        t = time()
        if t - tlast > report:
            ups = (updates-updates_last)*1.0/(t-tlast)
            ips = (iters-iters_last)*1.0/(t-tlast)
            if not silent:
                print("%d iters | %d updates | %.0f iters/s | %.0f updates/s | cos similarity = %.4f" %
                      (iters, updates, ips, ups, cossim))

            updates_last = updates
            iters_last = iters
            tlast = t
            ups_max = max(ups, ups_max)
            if ups < ups_max/slowdown:
                break

        # NOTE(review): original nesting was lost; the time limit is
        # checked on every iteration here.
        if t - t0 > maxtime:
            break

    ips = iters*1.0/(time()-t0)
    ups = updates*1.0/(time()-t0)
    Xraw[:] = Xraw[I]

    # NOTE: recomputed from X in its original row order, as before.
    cossim = np.trace(X.T.dot(A).dot(X)) / N
    if not silent:
        print("final similarity:  %s" % (cossim,))

    info = {'cossim': cossim, 'iters': iters, 'updates': updates, 'ips': ips, 'ups': ups}
    return I, info
def setUp(self):
    """Seed NumPy's global RNG with 0 and initialise scikit-cuda linalg."""
    seed = 0
    np.random.seed(seed)
    linalg.init()
import pycuda.gpuarray as gpuarray import pycuda.autoinit from pycuda.compiler import SourceModule from pycuda.reduction import ReductionKernel from pycuda.tools import dtype_to_ctype import numpy as np from fractions import gcd from types import MethodType, FunctionType import cufft import skcuda.linalg as cu_linalg cu_linalg.init() class RhoVNeumannCUDA1D: """ The second-order split-operator propagator for the von Neumann equation for the denisty matrix rho(x,x',t) with the time-dependent Hamiltonian H = K(p, t) + V(x, t). The Wigner function is obtained by padded Wigner transforming the (rectangular) density matrix. """ def __init__(self, **kwargs): """ The following parameters are to be specified X_gridDIM - the coordinate grid size X_amplitude - maximum value of the coordinates t (optional) - initial value of time (default t = 0) consts (optional) - a string of the C code declaring the constants functions (optional) - a string of the C code declaring auxiliary functions