Example #1
    def __init__(self,
                 data,
                 alpha=10,
                 norm_type=1,
                 verbose=False,
                 step=5,
                 thr=[10**-8, -1],
                 max_iter=5000,
                 affine=False,
                 normalize=True,
                 PCA=False,
                 npc=10,
                 GPU=False,
                 device=0):

        self.data = data
        self.alpha = alpha
        self.norm_type = norm_type
        self.verbose = verbose
        self.step = step
        self.thr = thr
        self.max_iter = max_iter
        self.affine = affine
        self.normalize = normalize
        self.device = device
        self.PCA = PCA
        self.npc = npc
        self.GPU = GPU

        self.num_rows = data.shape[0]
        self.num_columns = data.shape[1]

        if self.GPU:
            # self.data = self.data.astype('float32')
            linalg.init()
def initParallelAlgorithms():
    global bitonicSort_
    fin = open("ParallelAlgorithms/bitonicSort.cu")
    mod = SourceModule(fin.read())
    fin.close()
    bitonicSort_ = mod.get_function("bitonicSort")

    global finishCSM_
    global getSumSquares_
    fin = open("ParallelAlgorithms/CSMHelper.cu")
    mod = SourceModule(fin.read())
    fin.close()
    finishCSM_ = mod.get_function("finishCSM")
    getSumSquares_ = mod.get_function("getSumSquares")

    #Run each of the algorithms on dummy data so that they're pre-compiled

    #1) Bitonic Sort
    X = np.random.randn(16, 16)
    N = np.int32(16)
    NPow2 = N
    NThreads = int(N) // 2  # block dimensions must be Python ints
    XG = gpuarray.to_gpu(X)
    bitonicSort_(XG, N, NPow2, block=(NThreads, 1, 1), grid=(X.shape[0], 1), shared=4*NPow2)

    linalg.init()
    #2) Other primitive operations
    NegXDotX = linalg.dot(XG, XG)
    XPlusX = skcuda.misc.add(XG, XG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    XPlusCol = skcuda.misc.add_matvec(XG, XSqr, 0)
Example #3
    def forward(self, bottom, top):
        #        print 'hanli crf forward -- '
        #        print 'self.diff.shape: ' + str(self.diff.shape);  # self.diff.shape: (batchsize, 65536)
        #        print 'crf bottom[0].data.shape: ' + str(bottom[0].data.shape); #crf bottom[0].data.shape: (batchsize, 11)
        #        print 'raw degree bottom[1].data.shape: ' + str(bottom[1].data.shape);  #(batchsize, 65536, 11)
        #        print 'png bottom[2].data.shape: ' + str(bottom[2].data.shape);  # (batchsize, 65536)
        #        print 'np.dot(bottom[1].data[i,:,:], bottom[0].data[i,:]).shape: ' + str(np.dot(bottom[1].data[0,:,:], bottom[0].data[0,:]).shape); #(65536,)
        #        print 'bottom[2].data[i,:].shape: ' + str(bottom[2].data[0,:].shape);  # (65536,)
        with pu.caffe_cuda_context():
            linalg.init()
            for i in range(self.diff.shape[0]):
                #a =  bottom[1].data_as_pycuda_gpuarray()
                #b =  bottom[0].data_as_pycuda_gpuarray()
                a = bottom[1].data[i, :, :].astype(np.float32)
                b = bottom[0].data[i, :].astype(np.float32)
                ##a = np.asarray(np.random.rand(4, 4), dtype=np.float32)
                ##b = np.asarray(np.random.rand(4), dtype=np.float32)

                #a_gpu = gpuarray.GPUArray(a, dtype=np.float32)
                #b_gpu = gpuarray.GPUArray(b, dtype=np.float32)
                a_gpu = gpuarray.to_gpu(a)
                b_gpu = gpuarray.to_gpu(b)
                c_gpu = linalg.dot(a_gpu, b_gpu)
                #self.diff[i,:] = c_gpu + bottom[2].data[i,:] - bottom[3].data[i,:];
                self.diff[i, :] = np.dot(
                    bottom[1].data[i, :, :], bottom[0].data[
                        i, :]) + bottom[2].data[i, :] - bottom[3].data[i, :]
            top[0].data[...] = np.sum(self.diff**2) / bottom[3].num / 2.
            #self.transDiff = np.transpose(self.diff / bottom[3].num); # (65536, 50)
            a_gpu = gpuarray.to_gpu(self.diff / bottom[3].num)
            at_gpu = linalg.transpose(a_gpu)
            self.transDiff = at_gpu
Example #4
def correlations(X, Y, useGPU):
    if useGPU:
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.linalg as linalg
        linalg.init()

        X_gpu = gpuarray.to_gpu(X)
        XT_gpu = linalg.transpose(X_gpu)
        cxx = linalg.mdot(XT_gpu, X_gpu).get()

        XT_gpu = linalg.transpose(X_gpu)
        X_gpu.gpudata.free()
        del X_gpu
        Y_gpu = gpuarray.to_gpu(Y)
        cxy = linalg.mdot(XT_gpu, Y_gpu).get()

        cyx = cxy.T

        YT_gpu = linalg.transpose(Y_gpu)
        cyy = linalg.mdot(YT_gpu, Y_gpu).get()
    else:
        cxx = np.dot(X.T, X)
        cxy = np.dot(X.T, Y)
        cyx = cxy.T
        cyy = np.dot(Y.T, Y)

    return cxx, cxy, cyx, cyy
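
A brief usage sketch for the helper above (hypothetical shapes; `correlations` is assumed to be in scope, and the GPU branch additionally needs pycuda, scikit-cuda and a CUDA device):

import numpy as np

X = np.random.randn(200, 8)
Y = np.random.randn(200, 3)
cxx, cxy, cyx, cyy = correlations(X, Y, useGPU=False)
# On a machine with a GPU, the CUDA branch should agree to floating-point tolerance:
# assert np.allclose(cxy, correlations(X, Y, useGPU=True)[1])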
Example #5
    def __init__(self, n_proc=1, stop='p10', epsilon=6, dtype=np.float32):
        self.stop = stop
        self.epsilon = epsilon

        self.profile = False
        self._locals = {}
        self.dtype = dtype
        self.comm = MPI.COMM_WORLD
        self.rank = self.comm.rank
        self.n_proc = n_proc
        #import ctypes
        #mkl_rt = ctypes.CDLL('libmkl_rt.so')
        #mkl_get_max_threads = mkl_rt.mkl_get_max_threads
        #def mkl_set_num_threads(cores):
        #    mkl_rt.mkl_set_num_threads(ctypes.byref(ctypes.c_int(cores)))

        #mkl_set_num_threads(6)
        #print("N thhreads", mkl_get_max_threads())
        self.sparse = False
        #ctx = make_default_context()
        #ngpus = driver.Device.count()
        #gpuid = self.rank % ngpus

        self.ctx = driver.Device(self.rank).make_context()

        linalg.init()
        cusparse.init()

        self.mod = SourceModule(gpucode.code)
Example #6
def run(images, init_avg_image, n_iters=2, trunc_param=10, beta=np.float64(1.0), ang_jump=1, max_shift=5,
           shift_jump=1, n_scales=10, is_remove_outliers=True, outliers_precent_removal=5):

    linalg.init()

    image_size = np.shape(images)[-1]
    is_downsample = image_size > config.max_image_size
    if is_downsample:
        images_orig = images
        init_avg_image_orig = init_avg_image
        images = np.real(data_utils.downsample_decorator(images, config.max_image_size)).astype(init_avg_image.dtype)  # TODO: Itay to handle the fact that this returns complex
        init_avg_image = np.real(
            data_utils.downsample_decorator(init_avg_image, config.max_image_size)).astype(images.dtype)

    em = EM(images, init_avg_image, n_iters, trunc_param, beta, ang_jump, max_shift, shift_jump,
                n_scales, is_remove_outliers, outliers_precent_removal)

    im_avg_est, log_lik, opt_latent, outlier_ims_inds, posteriors = em.do_em()

    if is_downsample:
        images_orig = np.delete(images_orig, outlier_ims_inds, axis=0)
        images = np.delete(images, outlier_ims_inds, axis=0)
        em_post_process = EM(images_orig, init_avg_image_orig, em.n_iters, em.converter.truncation, em.converter.beta, em.ang_jump,
                             em.em_params['max_shift'], em.em_params['shift_jump'],em.em_params['n_scales'], is_remove_outliers=False)
        em_post_process.do_one_pass_orig_images(posteriors, images_orig, images)
        im_avg_est_orig = em_post_process.converter.direct_backward(em_post_process.c_avg)[0]
        EM.plot_images(init_avg_image_orig, im_avg_est_orig, im_avg_est_orig)
    else:
        im_avg_est_orig = im_avg_est

    return im_avg_est, im_avg_est_orig, log_lik, opt_latent, outlier_ims_inds
Example #7
def fast_matmul(x, y, x_type, y_type):
    '''
    use pycuda to compute c = a * b
    '''
    linalg.init()
    a_gpu = gpuarray.to_gpu(x.astype(x_type))
    a_t_gpu = gpuarray.to_gpu(x.T.copy().astype(x_type))
    b_gpu = gpuarray.to_gpu(y.astype(y_type))
    # row_sum = gpuarray.zeros(shape = x[0].shape, dtype = x_type)
    row_sum = 0
    # a = np.asarray(x, x_type)
    # b = np.asarray(y, y_type)
    # a_gpu = gpuarray.to_gpu(a)
    # b_gpu = gpuarray.to_gpu(b)

    t1_inside = time.time()
    c_gpu = linalg.dot(a_gpu, b_gpu)
    for a_i in a_gpu:
        # row_sum = misc.add(row_sum, a_i)
        row_sum += a_i
        gg = linalg.dot(a_gpu, b_gpu)
        gg = linalg.dot(a_i, a_i)
        gg = reduce(linalg.dot, (a_gpu, b_gpu, b_gpu, b_gpu))
        # tmp1, tmp2 = linalg.dot(a_gpu, b_gpu), linalg.dot(b_gpu, b_gpu)
        z_gpu = a_gpu.copy()
    tmp = a_t_gpu
    # print('x.T\n', x.T)
    # print('tmp\n', tmp)
    # print('x = a_gpu: ', np.allclose(x, a_gpu.get()))
    # print('x.T = tmp: ', np.allclose(x.T, tmp.get()))

    a_prod = linalg.dot(a_gpu, tmp)
    t2_inside = time.time()
    print('inside cost {:.4f}s'.format(t2_inside - t1_inside))

    a = np.random.randint(-5, 5, (3, 4)).astype(np.float32)
    a_gpu = gpuarray.to_gpu(a)
    norm_gpu = linalg.norm(a_gpu)
    print('is norm right?', np.linalg.norm(a) == norm_gpu)
    a_gpu = abs(a_gpu)
    column_sum = misc.sum(a_gpu, axis=0)
    column_sum = column_sum.reshape((1, -1))
    all_one_gpu = gpuarray.to_gpu(np.ones((3, 1), np.float32))
    div_mat_gpu = linalg.dot(all_one_gpu, column_sum)

    norm_1 = a_gpu / (div_mat_gpu + 1e-3)

    print(a_gpu)
    print(column_sum)
    print(column_sum.shape)
    print(norm_1)
    # abs_a = a_gpu.__abs__()
    # print(a)
    # print(abs_a)
    # c = abs_a + a_gpu
    # print(repr(c))
    # print(type(c))
    # c = 1/2 * c
    # print(a_gpu, c)
    return c_gpu.get(), a_prod.get(), row_sum.get()
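
A possible call to the benchmark helper above (illustrative shapes; a CUDA device is required, and the defining module must provide the imports the excerpt relies on, e.g. gpuarray, skcuda.linalg/misc, time and, on Python 3, functools.reduce):

import numpy as np

x = np.random.rand(64, 32)
y = np.random.rand(32, 16)
c, a_prod, row_sum = fast_matmul(x, y, np.float32, np.float32)
assert np.allclose(c, x.astype(np.float32) @ y.astype(np.float32), atol=1e-4)
assert np.allclose(a_prod, x @ x.T, atol=1e-4)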
Example #8
 def __init__(self, optimize=True):
     self.is_fit = False
     self.train_X, self.train_y = None, None
     self.params = {"l": 1.0, "sigma_f": 1.0}
     self.optimize = optimize
     self.blockNum = 1024
     self.threadNum = 32
     sklin.init()
Example #9
 def __init__(self, filepath, unknown_word='</s>', limit=100):
     self.model = KeyedVectors.load_word2vec_format(filepath, limit=limit)
     self.model.init_sims()
     self.lookup_table = np.asarray(
         self.model.wv.vectors_norm
     )  #np.asarray([self.model[word] for word in self.model.wv.vocab.keys()])
     self.vocabulary = np.asarray(list(self.model.wv.vocab.keys()))
     self.unknown_word = unknown_word
     self.used_gpu = len(cuda.gpus) - 1
     linalg.init()
Example #10
 def _init_cublas(self):
     import pycuda.autoinit
     if "cublas_handle" in self.extra_options:
         handle = self.extra_options["cublas_handle"]
     else:
         handle = skmisc._global_cublas_handle
         if handle is None:
             cublas.init() # cublas handle + allocator
             handle = skmisc._global_cublas_handle
     self.cublas_handle = handle
def skcuda_linalg(a, b):
    linalg.init()
    a = np.asarray(a, np.float32)
    b = np.asarray(b, np.float32)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, 'T')
    a_nrm = linalg.norm(a_gpu)
    b_nrm = linalg.norm(b_gpu)
    type(a_nrm)
    ans = c_gpu / (a_nrm * b_nrm)  # divide by the product of the norms
    print(ans)
Example #12
def make_sample_data(set_: int):
    np.random.seed(set_ * 4347)
    if set_ == 1:  # Uniform distribution
        data = np.random.uniform(0, 1, size=(samples, num_features))
    if set_ == 2:  # 3 Gaussian distribution
        data = multi_gauss_clusters(n_clusters=3)
    if set_ == 3:  # 10 Gaussian distribution
        data = multi_gauss_clusters(n_clusters=10)
    df = pd.DataFrame()
    np.random.shuffle(data)
    df['vec'] = data.tolist()

    # find nearest neighbours
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=51,
                            algorithm='ball_tree',
                            leaf_size=30).fit(data)
    _, nbrs_indices = nbrs.kneighbors(data)
    for n_nbr in range(10, 51, 5):
        df[f"known_neighbours_{n_nbr}"] = [
            x[1:(n_nbr + 1)] for x in nbrs_indices
        ]

    # hash using random hyperplane LSH
    import pycuda.gpuarray as gpuarray
    import skcuda.linalg as linalg
    import pycuda.autoinit
    linalg.init()
    os.environ['CUDA_HOME'] = "/opt/cuda/"
    vec_np = np.array(df['vec'].values.tolist(), dtype=np.float32)
    LSH = LSHBias(feature_dim=num_features, bits=LSH_NUM_BITS)
    W = np.array(LSH.W, dtype=np.float32)
    b_gpu = gpuarray.to_gpu(W)
    ones = np.ones(shape=(vec_np.shape[0], 1), dtype=np.float32)
    X = np.concatenate((vec_np, ones), axis=1)

    # do the matrix multiplication
    a_gpu = gpuarray.to_gpu(X)
    mul = linalg.mdot(a_gpu, b_gpu)
    # get binary: 1 if value >= 0, else 0
    res = gpuarray.if_positive(
        mul >= gpuarray.zeros(mul.shape, dtype=np.float32),
        then_=gpuarray.ones_like(mul),
        else_=gpuarray.zeros_like(mul))
    res = np.array(res.get(), dtype=np.uint32)

    # convert grouped bits to integers
    res = np_array_binary_to_grouped_integers(res)
    df[f"hash_{LSH_NUM_BITS}_bits"] = [x for x in res]
    df.to_parquet(f"{config.CUDA_neighbour_search_df_dir}df-{set_}.parquet",
                  index=False)

    print("created test-data")
Example #13
def fast_add(x, y, x_type, y_type):
    '''
    use pycuda to compute c = a + b
    '''
    linalg.init()
    a_gpu = gpuarray.to_gpu(x.astype(x_type))
    b_gpu = gpuarray.to_gpu(y.astype(y_type))

    t1_inside = time.time()
    # c_gpu = misc.add(a_gpu, b_gpu)
    c_gpu = a_gpu + b_gpu
    t2_inside = time.time()
    print('inside cost {:.4f}s'.format(t2_inside - t1_inside))
    return c_gpu.get()
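
Likewise, a small hedged call for `fast_add` (GPU required; the timing output will vary):

import numpy as np

x = np.random.rand(1000, 1000)
y = np.random.rand(1000, 1000)
c = fast_add(x, y, np.float32, np.float32)
assert np.allclose(c, x.astype(np.float32) + y.astype(np.float32))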
Example #14
    def filter(self):
        import pycuda.gpuarray as gpuarray
        import skcuda.fft as cu_fft
        import skcuda.linalg as linalg
        import pycuda.driver as cuda
        from pycuda.tools import make_default_context
        cuda.init()
        context = make_default_context()
        device = context.get_device()
        signal = self.series[0]
        window = self.series[1]
        linalg.init()
        nfft = determine_size(len(signal) + len(window) - 1)
        # Move data to GPU
        sig_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
        win_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
        sig_gpu = gpuarray.zeros(sig_zero_pad.shape,
                                 dtype=self.precision['float'])
        win_gpu = gpuarray.zeros(win_zero_pad.shape,
                                 dtype=self.precision['float'])
        sig_zero_pad[0:len(signal)] = signal
        win_zero_pad[0:len(window)] = window
        sig_gpu.set(sig_zero_pad)
        win_gpu.set(win_zero_pad)

        # Plan forwards
        sig_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
        win_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
        sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape,
                                       self.precision['float'],
                                       self.precision['complex'])
        win_plan_forward = cu_fft.Plan(win_fft_gpu.shape,
                                       self.precision['float'],
                                       self.precision['complex'])
        cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
        cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

        # Convolve
        out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
        linalg.scale(2.0, out_fft)

        # Plan inverse
        out_gpu = gpuarray.zeros_like(out_fft)
        plan_inverse = cu_fft.Plan(out_fft.shape, self.precision['complex'],
                                   self.precision['complex'])
        cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
        out_np = np.zeros(len(out_gpu), self.precision['complex'])
        out_gpu.get(out_np)
        context.pop()
        return out_np
Example #15
    def __enter__(self):
        self.load()

        ngpus = driver.Device.count()
        gpuid = self.rank % ngpus

        self.ctx = driver.Device(gpuid).make_context()

        cusparse.init()
        linalg.init()

        #from crow.transfer.kernels import *
        #self.operation.mod = mod

        self.load_gpu()
Example #16
    def backward(self, top, propagate_down, bottom):

        #        self.nPCAcoms = bottom[0].data.shape[1];
        with pu.caffe_cuda_context():
            #for i in range(self.nPCAcoms):
            #bottom[0].diff[:, i] = np.trace(np.dot( bottom[1].data[:,:,i], self.transDiff ));
            linalg.init()
            for i in range(self.nPCAcoms):
                ##a =  bottom[1].data[:,:,i].data_as_pycuda_gpuarray()
                a = bottom[1].data[:, :, i]
                b_gpu = self.transDiff
                a_gpu = gpuarray.to_gpu(a)
                c_gpu = linalg.dot(a_gpu, b_gpu)
                d_gpu = linalg.trace(c_gpu)
                bottom[0].diff[:, i] = d_gpu
Example #17
def init_gpu(dev=0):
    global gp, lg, cm, msc, slf, _gpu

    from pycuda import gpuarray as gp
    from pycuda import elementwise as ew
    from pycuda import cumath as cm
    from skcuda import linalg as lg
    from skcuda import misc as msc

    msc.init_context(msc.init_device(dev))
    lg.init()

    slf = ew.ElementwiseKernel("float * y, float * x, unsigned * ind",
                               "y[i] = x[ind[i]]")

    _gpu = True
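
A short, hypothetical usage sketch of the gather kernel `slf` set up above (`gp` and `slf` are module-level globals created by `init_gpu`, so this assumes it runs in that same module on a machine with a CUDA device):

import numpy as np

init_gpu(0)
x = gp.to_gpu(np.arange(10, dtype=np.float32))
ind = gp.to_gpu(np.array([3, 1, 4], dtype=np.uint32))
y = gp.empty(3, dtype=np.float32)
slf(y, x, ind)   # gathers x[ind] into y on the GPU
print(y.get())   # -> [3. 1. 4.]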
Example #18
def filter_fft_cuda(signal: np.array, window: np.array, prec: dict):
    """
    Computes the low_pass filter on the GPU using pycuda and scikit-cuda's FFT.
    Also auto-initializes pycuda via pycuda.autoinit.
    :param signal: The input series
    :param window: The input window
    :param prec: The precision entry
    :return: The filtered signal
    """
    import pycuda.autoinit  # Here because it initialises a new cuda environment every trial.
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg
    linalg.init()
    nfft = determine_size(len(signal) + len(window) - 1)
    # Move data to GPU
    sig_zero_pad = np.zeros(nfft, dtype=prec['float'])
    win_zero_pad = np.zeros(nfft, dtype=prec['float'])
    sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=prec['float'])
    win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=prec['float'])
    sig_zero_pad[0:len(signal)] = signal
    win_zero_pad[0:len(window)] = window
    sig_gpu.set(sig_zero_pad)
    win_gpu.set(win_zero_pad)

    # Plan forwards
    sig_fft_gpu = gpuarray.zeros(nfft, dtype=prec['complex'])
    win_fft_gpu = gpuarray.zeros(nfft, dtype=prec['complex'])
    sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, prec['float'],
                                   prec['complex'])
    win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, prec['float'],
                                   prec['complex'])
    cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
    cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

    # Convolve
    out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
    linalg.scale(2.0, out_fft)

    # Plan inverse
    out_gpu = gpuarray.zeros_like(out_fft)
    plan_inverse = cu_fft.Plan(out_fft.shape, prec['complex'], prec['complex'])
    cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
    out_np = np.zeros(len(out_gpu), prec['complex'])
    out_gpu.get(out_np)
    return out_np
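
A sketch of how this filter might be invoked (the layout of the `prec` dictionary is inferred from the body above; `determine_size` must be importable and a CUDA device available):

import numpy as np

prec = {'float': np.float32, 'complex': np.complex64}
signal = np.random.randn(4096).astype(np.float32)
window = np.hanning(129).astype(np.float32)
filtered = filter_fft_cuda(signal, window, prec)
print(filtered.shape, filtered.dtype)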
Example #19
 def __init__(self, shape, nframes,
              qmask=None,
              weights=None,
              scale_factor=None,
              precompute_fft_plans=False,
              extra_options={}):
     super().__init__(
         shape, nframes, qmask=qmask,
         weights=weights, scale_factor=scale_factor,
         precompute_fft_plans=precompute_fft_plans, extra_options=extra_options
     )
     if CUFFT is None:
         raise ImportError("pycuda and scikit-cuda need to be installed")
     if skmisc._global_cublas_handle is None:
         cublas.init()
     self._allocate_cuda_arrays()
     self._compile_kernels()
Example #20
    def __init__(self,
                 cuda=False,
                 exit_on_prompt=False,
                 language='en',
                 limiting_magnitude=None,
                 prefer_fluxes=False,
                 offline=False,
                 prefer_cache=False,
                 open_in_browser=False,
                 pool=None,
                 quiet=False,
                 test=False,
                 wrap_length=100,
                 **kwargs):
        """Initialize `Fitter` class."""
        self._pool = SerialPool() if pool is None else pool
        self._printer = Printer(pool=self._pool,
                                wrap_length=wrap_length,
                                quiet=quiet,
                                fitter=self,
                                language=language,
                                exit_on_prompt=exit_on_prompt)
        self._fetcher = Fetcher(test=test,
                                open_in_browser=open_in_browser,
                                printer=self._printer)

        self._cuda = cuda
        self._limiting_magnitude = limiting_magnitude
        self._prefer_fluxes = prefer_fluxes
        self._offline = offline
        self._prefer_cache = prefer_cache
        self._open_in_browser = open_in_browser
        self._quiet = quiet
        self._test = test
        self._wrap_length = wrap_length

        if self._cuda:
            try:
                import pycuda.autoinit  # noqa: F401
                import skcuda.linalg as linalg
                linalg.init()
            except ImportError:
                pass
Example #21
    def __init__(self, n_proc=1, stop='p10', epsilon=6, dtype=np.float32):
        self.stop = stop
        self.epsilon = epsilon
        self.profile = False
        self.memory = {}
        self._locals = {}
        self.dtype = np.float32
        self.sparse = False
        self.n_proc = 1
        self.rank = 0
        #ctx = make_default_context()
        #ngpus = driver.Device.count()
        #gpuid = self.rank % ngpus

        self.ctx = driver.Device(0).make_context()

        linalg.init()
        cusparse.init()

        self.mod = SourceModule(gpucode.code)
def logis(y,x):
    end = 0
    start = 0
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    start=time.time()
    # Move the variables to the GPU
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    linalg.init()
    # Transpose of X
    x_gpu_T = linalg.transpose(x_gpu)
    beta_gpu = linalg.dot(linalg.dot(linalg.inv(linalg.dot(x_gpu_T,x_gpu)),x_gpu_T),y_gpu)
    j = 1
    while(True):
        mu = sapply(x,beta_gpu.get())
        mu = mu.astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu)
        V_gpu= linalg.diag(mu_gpu)
        f2_gpu = linalg.multiply(mu_gpu,1-mu_gpu)
        f3_gpu = linalg.diag(1/f2_gpu)
        f4_gpu = (y_gpu-mu_gpu)
        f5_gpu = linalg.dot(f3_gpu,f4_gpu)
        if(np.isnan(f5_gpu.get()).any()):
            f5_cpu = f5_gpu.get()
            f5_cpu = nanValue(f5_cpu)
            f5_gpu = gpuarray.to_gpu(f5_cpu.astype(np.float32))
        y_1_gpu = linalg.dot(x_gpu,beta_gpu) + f5_gpu
        beta_1_gpu = linalg.dot(linalg.dot(linalg.dot(linalg.inv(linalg.dot(linalg.dot(x_gpu_T,V_gpu),x_gpu)),x_gpu_T),V_gpu),y_1_gpu)
        check_value = np.absolute(linalg.norm(beta_1_gpu-beta_gpu))
        #if(check_value<0.00001):
            #break
        if(j == 10 or check_value<0.00001):
            break
        beta_gpu = beta_1_gpu
        j = j + 1
    end = time.time()
    tiempo = (end-start)
    return {"iteraciones":j,"Betas":beta_gpu.get(),"time":tiempo}
    def full_matrix(self):
        """
        Compute the full matrix using the resulting biases, P and Q
        """
        
        if(self.use_gpu):
            
            #initialization
            linalg.init()

            #make the appropriate format
            p_gpu = gpuarray.to_gpu(self.P)
            q_gpu = gpuarray.to_gpu(self.Q)

            #we denote as transb='T' that we take the second argument, matrix q_gpu as transpose
            fullMatrix = self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis, :] + linalg.dot(p_gpu, q_gpu, transb='T').get()
        
        else:
            
            fullMatrix = self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
            
        return fullMatrix
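
As a sanity check for the `transb='T'` call above, a minimal standalone sketch (assuming pycuda and scikit-cuda are installed and a CUDA device is present) comparing the GPU product against the CPU branch:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg

linalg.init()
P = np.random.rand(4, 3).astype(np.float32)
Q = np.random.rand(5, 3).astype(np.float32)
gpu_result = linalg.dot(gpuarray.to_gpu(P), gpuarray.to_gpu(Q), transb='T').get()
assert np.allclose(gpu_result, P.dot(Q.T), atol=1e-5)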
Example #24
    def __init__(self, inputs, outputs, norm=None, precision=np.float64):
        super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision)

        # startup GPU
        #self.ctx = misc.init_context(misc.init_device(nDevice))  # NO NO NO, crashes and does not release memory
        # use CUDA_DEVICE=0 python my-script.py
        try:
            linalg.init()
        except OSError as e:
            pass  # no 'cusolver' library which is paid and not needed
            # print "error initializing scikit-cuda: %s" % e
            # print "ignore if toolbox works"

        # precision-dependent stuff
        if precision is np.float64:
            self.posv = lapack.dposv
        else:
            self.posv = lapack.sposv
            self.handle = cublas.cublasCreate()

        # prepare GPU function kernels
        kernel = """
            __global__ void dev_sigm(%s *a) {
                unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
                a[idx] = 1.0 / ( exp(a[idx]) + 1 );
            }
            """
        kernel = kernel % "double" if self.precision is np.float64 else kernel % "float"
        self.dev_sigm = SourceModule(kernel).get_function("dev_sigm")
        self.dev_sigm.prepare("P")

        # GPU transformation functions
        self.func["lin"] = self._dev_lin
        self.func["sigm"] = self._dev_sigm
        self.func["tanh"] = self._dev_tanh
        self.func["rbf_l1"] = self._dev_rbfl1
        self.func["rbf_l2"] = self._dev_rbfl2
        self.func["rbf_linf"] = self._dev_rbflinf
Example #25
    def __init__(self, inputs, outputs, norm=None, precision=np.float64):
        super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision)

        # startup GPU
        #self.ctx = misc.init_context(misc.init_device(nDevice))  # NO NO NO, crashes and does not release memory
        # use CUDA_DEVICE=0 python my-script.py
        try:
            linalg.init()
        except OSError as e:
            pass  # no 'cusolver' library which is paid and not needed
            # print "error initializing scikit-cuda: %s" % e
            # print "ignore if toolbox works"

        # precision-dependent stuff
        if precision is np.float64:
            self.posv = lapack.dposv
        else:
            self.posv = lapack.sposv
            self.handle = cublas.cublasCreate()

        # prepare GPU function kernels
        kernel = """
            __global__ void dev_sigm(%s *a) {
                unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
                a[idx] = 1.0 / ( exp(a[idx]) + 1 );
            }
            """
        kernel = kernel % "double" if self.precision is np.float64 else kernel % "float"
        self.dev_sigm = SourceModule(kernel).get_function("dev_sigm")
        self.dev_sigm.prepare("P")

        # GPU transformation functions
        self.func["lin"] = self._dev_lin
        self.func["sigm"] = self._dev_sigm
        self.func["tanh"] = self._dev_tanh
        self.func["rbf_l1"] = self._dev_rbfl1
        self.func["rbf_l2"] = self._dev_rbfl2
        self.func["rbf_linf"] = self._dev_rbflinf
    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j; depending on the boolean attribute 'use_gpu', the calculation runs on the GPU or on the CPU.
        """
        
        if(self.use_gpu):
            
            #initialization
            linalg.init()

            #make the appropriate format
            p_gpu = gpuarray.to_gpu(self.P)
            q_gpu = gpuarray.to_gpu(self.Q)

            prediction = self.b + self.b_u[i] + self.b_i[j] + linalg.dot(p_gpu[i, :],q_gpu[j, :],transb='T')
        
        
        else:
                    
            prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        
        
        return prediction
def initParallelAlgorithms():
    global bitonicSort_
    fin = open("ParallelAlgorithms/bitonicSort.cu")
    mod = SourceModule(fin.read())
    fin.close()
    bitonicSort_ = mod.get_function("bitonicSort")

    global finishCSM_
    global getSumSquares_
    fin = open("ParallelAlgorithms/CSMHelper.cu")
    mod = SourceModule(fin.read())
    fin.close()
    finishCSM_ = mod.get_function("finishCSM")
    getSumSquares_ = mod.get_function("getSumSquares")

    #Run each of the algorithms on dummy data so that they're pre-compiled

    #1) Bitonic Sort
    X = np.random.randn(16, 16)
    N = np.int32(16)
    NPow2 = N
    NThreads = int(N) // 2  # block dimensions must be Python ints
    XG = gpuarray.to_gpu(X)
    bitonicSort_(XG,
                 N,
                 NPow2,
                 block=(NThreads, 1, 1),
                 grid=(X.shape[0], 1),
                 shared=4 * NPow2)

    linalg.init()
    #2) Other primitive operations
    NegXDotX = linalg.dot(XG, XG)
    XPlusX = skcuda.misc.add(XG, XG)
    XSqr = skcuda.misc.multiply(XG, XG)
    XSqr = skcuda.misc.sum(XSqr, 1)
    XPlusCol = skcuda.misc.add_matvec(XG, XSqr, 0)
Example #28
def initParallelAlgorithms():
    """
    Compile all of the parallel algorithms
    """
    global MatMulNaive_
    global MatMulConv2D_
    s = getResourceString("MatMul.cu")
    mod = SourceModule(s)
    MatMulNaive_ = mod.get_function("MatMulNaive")
    MatMulConv2D_ = mod.get_function("MatMulConv2D")

    global ZerosToOnes_
    global TileWDenom_
    global TileHDenom_
    global bitonicSortNonneg_
    s = getResourceString("OtherUtils.cu")
    mod = SourceModule(s)
    ZerosToOnes_ = mod.get_function("ZerosToOnes")
    TileWDenom_ = mod.get_function("TileWDenom")
    TileHDenom_ = mod.get_function("TileHDenom")
    bitonicSortNonneg_ = mod.get_function("bitonicSortNonneg")

    linalg.init()
    skcuda.misc.init()
Example #29
    def __init__(self, arch='cpu', gpu_context=None, dtype='float32'):
        if arch == 'gpu':
            global pycuda
            import pycuda.tools
            global gpuarray
            import pycuda.gpuarray as gpuarray
            global cumath
            import pycuda.cumath as cumath
            global ElementwiseKernel
            from pycuda.elementwise import ElementwiseKernel
            global SourceModule
            from pycuda.compiler import SourceModule
            global culinalg
            import skcuda.linalg as culinalg
            global misc
            import skcuda.misc as misc
            if gpu_context is None:
                self.context = pycuda.tools.make_default_context()
            else:
                self.context = gpu_context
            self.device = self.context.get_device()
            culinalg.init()

        self.arch = arch
Example #30
import numpy as np
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import skcuda.linalg as linalg
import time

v0 = np.loadtxt('V0.txt').astype(np.float32)
w0 = np.loadtxt('W0.txt').astype(np.float32)
X  = np.loadtxt('nnmf-2429-by-361-face.txt').astype(np.float32)

linalg.init()
def NNMF_gpu(X,r,tol,V=v0,W=w0,verbose=1):
    Vr = V[:,0:r].copy()
    Wr = W[0:r,:].copy()
    X_gpu = gpuarray.to_gpu(X)
    V_gpu = gpuarray.to_gpu(Vr)
    W_gpu = gpuarray.to_gpu(Wr)
    # Frobenius norm at previous step
    B_gpu = linalg.dot(V_gpu, W_gpu)
    L = linalg.norm(X_gpu-B_gpu)**2
    iteration = 0
    while 1: #update V
        V_gpu *= linalg.dot(X_gpu,linalg.transpose(W_gpu))
        V_gpu /= linalg.dot(B_gpu,linalg.transpose(W_gpu))
        B_gpu = linalg.dot(V_gpu, W_gpu)
        #update W
        W_gpu *= linalg.dot(linalg.transpose(V_gpu),X_gpu)
        W_gpu /= linalg.dot(linalg.transpose(V_gpu),B_gpu)
        B_gpu = linalg.dot(V_gpu, W_gpu)
        Lnew = linalg.norm(X_gpu-B_gpu)**2
        if abs(Lnew-L) <= tol*(L+1):
Example #31
def fusion_images(multispectral,
                  panchromatic,
                  save_image=False,
                  savepath=None,
                  timeCondition=True):
    # Check that both images meet the required conditions
    end = 0
    start = 0
    if multispectral.shape[2] == 3:
        print('The Multispectral image has ' + str(multispectral.shape[2]) +
              ' channels and size of ' + str(multispectral.shape[0]) + 'x' +
              str(multispectral.shape[1]))
    else:
        sys.exit('The first image is not multispectral')

    if len(panchromatic.shape) == 2:
        print(' The Panchromatic image has a size of ' +
              str(panchromatic.shape[0]) + 'x' + str(panchromatic.shape[1]))
    else:
        sys.exit('The second image is not panchromatic')

    # Convert to float32 and split the RGB bands of the multispectral image
    multispectral = multispectral.astype(np.float32)
    r = multispectral[:, :, 0].astype(np.float32)
    g = multispectral[:, :, 1].astype(np.float32)
    b = multispectral[:, :, 2].astype(np.float32)
    b = b.astype(np.float32)
    # Convert the panchromatic image to float32
    panchromatic = panchromatic.astype(np.float32)
    # Sum the bands of the multispectral image
    msuma = r + g + b
    start = time.time()
    r_gpu = gpuarray.to_gpu(r)
    g_gpu = gpuarray.to_gpu(g)
    b_gpu = gpuarray.to_gpu(b)
    panchromatic_gpu = gpuarray.to_gpu(panchromatic)
    msuma_gpu = gpuarray.to_gpu(msuma)

    linalg.init()
    m11_gpu = step_1(r_gpu, msuma_gpu)
    m22_gpu = step_2(m11_gpu, panchromatic_gpu)

    m33_gpu = step_1(b_gpu, msuma_gpu)
    m44_gpu = step_2(m33_gpu, panchromatic_gpu)

    m55_gpu = step_1(g_gpu, msuma_gpu)
    m66_gpu = step_2(m55_gpu, panchromatic_gpu)

    Amax_host, Amin_host = step_3(m22_gpu)
    rr_gpu = gpuarray.empty_like(r_gpu)
    step_4(m22_gpu, rr_gpu, Amax_host, Amin_host)

    Amax_host, Amin_host = step_3(m66_gpu)
    gg_gpu = gpuarray.empty_like(g_gpu)
    step_4(m66_gpu, gg_gpu, Amax_host, Amin_host)

    Amax_host, Amin_host = step_3(m44_gpu)
    bb_gpu = gpuarray.empty_like(b_gpu)
    step_4(m44_gpu, bb_gpu, Amax_host, Amin_host)
    end = time.time()

    ggg_host = gg_gpu.get().astype(np.uint8)
    rrr_host = rr_gpu.get().astype(np.uint8)
    bbb_host = bb_gpu.get().astype(np.uint8)

    # Combine the resulting bands
    fusioned_image = np.stack((rrr_host, ggg_host, bbb_host), axis=2)
    if (save_image):
        # Save the resulting image according to the third parameter set on the script's command line
        if (savepath != None):
            t = skimage.io.imsave(savepath + '/broveygpu_image.tif',
                                  fusioned_image,
                                  plugin='tifffile')
        else:
            t = skimage.io.imsave('broveygpu_image.tif',
                                  fusioned_image,
                                  plugin='tifffile')
    # Execution time of the Brovey transform on the GPU
    time_calculated = (end - start)
    if (timeCondition):
        return {"image": fusioned_image, "time": time_calculated}
    else:
        return fusioned_image
Example #32
import theano
import numpy as np
from pycuda import gpuarray, cumath
from skcuda import linalg, misc, cudart
from theano.tensor import as_tensor_variable
from theano.gof import Op, Apply
import theano.tensor as tt
import pycuda.autoinit
cudart.cudaSetDevice(0)
linalg.init()


class logDect(Op):
    def make_node(self, *inputs):
        alpha = as_tensor_variable(inputs[0])
        xt = as_tensor_variable(inputs[1])
        xf = as_tensor_variable(inputs[2])
        ll = as_tensor_variable(.1)
        return Apply(self, [alpha, xt, xf], [ll.type()])

    def make_thunk(self,
                   node,
                   storage_map,
                   compute_map,
                   rem=None,
                   impl=None,
                   no_recycling=[]):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        A = gpuarray.to_gpu(self.A)
        b = gpuarray.to_gpu(self.b)
Example #33
#!/usr/bin/env python

"""
Demonstrates how to transpose matrices on the GPU.
"""
from __future__ import print_function

import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import numpy as np

import skcuda.linalg as culinalg
import skcuda.misc as cumisc
culinalg.init()

# Double precision is only supported by devices with compute
# capability >= 1.3:
import string
demo_types = [np.float32, np.complex64]
if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
    demo_types.extend([np.float64, np.complex128])

for t in demo_types:
    print('Testing transpose for type ' + str(np.dtype(t)))
    if np.iscomplexobj(t()):
        a = np.array([[1j, 2j, 3j, 4j, 5j, 6j],
                      [7j, 8j, 9j, 10j, 11j, 12j]], t)
    else:
        a = np.array([[1, 2, 3, 4, 5, 6],
                      [7, 8, 9, 10, 11, 12]], t)
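    # The listing is truncated here. A plausible continuation, in the style of
    # scikit-cuda's transpose demo (illustrative sketch, not part of the original source):
    a_gpu = gpuarray.to_gpu(a)
    at_gpu = culinalg.transpose(a_gpu)
    print('Success status: ', np.all(a.T == at_gpu.get()))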
Example #34
def elmvis(Xraw,
           A,
           slowdown=10,
           report=5,
           maxtime=24*60*60,
           tol=0,
           batch=None,
           maxiter=None,
           maxupdate=None,
           maxstall=None,
           cossim=None,
           silent=False):
    """ELMVIS+ function running in GPU memory.
    """
    X = Xraw / np.linalg.norm(Xraw, axis=1)[:, None]  # unit-length version of X
    Xh = np.dot(A, X)  # X_hat, predicted value of X
    N, d = X.shape
    I = np.arange(N)  # index of samples

    # set default values
    if cossim is None: cossim = np.trace(X.T.dot(A).dot(X)) / N
    if maxiter is None: maxiter = N*N*N
    if maxupdate is None: maxupdate = N*N
    if maxstall is None: maxstall = N*N

    if not silent:
        print "original similarity: ", cossim

    # init GPU
    dt = X.dtype.type
    try:
        linalg.init()
    except ImportError as e:
        print(e)
    devA = gpuarray.to_gpu(A.astype(dt))
    devX = gpuarray.to_gpu(X.astype(dt))
    devXi1 = gpuarray.empty((d,), dtype=dt)
    devXh = linalg.dot(devA, devX)
    devAi = gpuarray.empty((N, 2), dtype=dt)
    devDelta = gpuarray.empty((2, d), dtype=dt)
    result = gpuarray.empty((d,), dtype=dt)

    # swap kernel
    kernel = """
        __global__ void diff(%s *A, %s *Y, %s *AY, %s *result, long d, long N, long i1, long i2) {
            long j = blockDim.x * blockIdx.x + threadIdx.x;
            %s yi1 = Y[i1*d + j];
            %s yi2 = Y[i2*d + j];
            result[j] = (A[i1*N + i1] * (yi2 - yi1) + 2*AY[i1*d + j]) * (yi2 - yi1) +
                        (A[i2*N + i2] * (yi1 - yi2) + 2*(AY[i2*d + j] + A[i2*N + i1]*(yi2 - yi1))) * (yi1 - yi2);
        }
        """
    if dt is np.float64:
        kernel = kernel % ("double", "double", "double", "double", "double", "double")
    else:
        kernel = kernel % ("float", "float", "float", "float", "float", "float")
    mod_diff = SourceModule(kernel)
    dev_diff = mod_diff.get_function("diff")
    dev_diff.prepare("PPPPllll")
    block = result._block
    grid = (int(np.ceil(1.0 * result.shape[0] / block[0])), 1)

    t0 = tlast = time()
    stall = 0
    iters = 0
    updates = 0
    updates_last = 0
    iters_last = 0
    ups_max = 0

    while (iters < maxiter) and (stall < maxstall):
        iters += 1
        stall += 1

        # get two different random numbers
        i1, i2 = np.random.randint(0, N, size=2)
        while i1 == i2:
            i1, i2 = np.random.randint(0, N, size=2)

        dev_diff.prepared_call(grid, block, devA.gpudata, devX.gpudata, devXh.gpudata, result.gpudata, d, N, i1, i2)
        diff = np.sum(result.get())

        if diff > tol:
            stall = 0
            devAi[:, 0] = devA[:, i1]
            devAi[:, 1] = devA[:, i2]
            devDelta[0, :] = devX[i1, :] - devX[i2, :]
            devDelta[1, :] = devX[i2, :] - devX[i1, :]
            linalg.add_dot(devAi, devDelta, devXh, alpha=-1)

            tI = I[i1]
            I[i1] = I[i2]
            I[i2] = tI

            devXi1[:] = devX[i1, :]
            devX[i1] = devX[i2]
            devX[i2] = devXi1

            cossim += diff / N
            updates += 1
            if updates > maxupdate:
                break

        t = time()
        if t - tlast > report:
            ups = (updates-updates_last)*1.0/(t-tlast)
            ips = (iters-iters_last)*1.0/(t-tlast)
            if not silent:
                print "%d iters | %d updates | %.0f iters/s | %.0f updates/s | cos similarity = %.4f" % (iters, updates, ips, ups, cossim)

            updates_last = updates
            iters_last = iters
            tlast = t
            ups_max = max(ups, ups_max)
            if ups < ups_max/slowdown:
                break

        if t - t0 > maxtime:
            break

    ips = iters*1.0/(time()-t0)
    ups = updates*1.0/(time()-t0)
    Xraw[:] = Xraw[I]

    cossim = np.trace(X.T.dot(A).dot(X)) / N
    if not silent:
        print "final similarity: ", cossim

    info = {'cossim': cossim, 'iters': iters, 'updates': updates, 'ips': ips, 'ups': ups}
    return I, info
Example #35
 def setUp(self):
     np.random.seed(0)
     linalg.init()
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda.reduction import ReductionKernel
from pycuda.tools import dtype_to_ctype
import numpy as np
from fractions import gcd
from types import MethodType, FunctionType
import cufft

import skcuda.linalg as cu_linalg
cu_linalg.init()


class RhoVNeumannCUDA1D:
    """
    The second-order split-operator propagator for the von Neumann equation for the density matrix rho(x,x',t)
    with the time-dependent Hamiltonian H = K(p, t) + V(x, t).

    The Wigner function is obtained by padded Wigner transforming the (rectangular) density matrix.
    """
    def __init__(self, **kwargs):
        """
        The following parameters are to be specified
            X_gridDIM - the coordinate grid size
            X_amplitude - maximum value of the coordinates

            t (optional) - initial value of time (default t = 0)
            consts (optional) - a string of the C code declaring the constants
            functions (optional) -  a string of the C code declaring auxiliary functions