def __init__(self):
        # Initialize scikits.cuda's linalg module and create a cuBLAS handle.
        culinalg.init()
        self.handle = cublas.cublasCreate()
        # Look up the precompiled CUDA kernels by name.
        self._elem_kernel = culinalg_kernel.get_function('_elem')
        self._sigmoid_kernel = culinalg_kernel.get_function('_sigmoid')
        self._log_anti_sigmoid_kernel = culinalg_kernel.get_function('_log_anti_sigmoid')
        self._tanh_kernel = culinalg_kernel.get_function('_tanh')
        self._pow_kernel = culinalg_kernel.get_function('_pow')
        self._sqrt_kernel = culinalg_kernel.get_function('_sqrt')
        self._square_kernel = culinalg_kernel.get_function('_square')
        self._exp_kernel = culinalg_kernel.get_function('_exp')
        self._log_kernel = culinalg_kernel.get_function('_log')
        self._sum_kernel = culinalg_kernel.get_function('_sum')
        self._compare_kernel = culinalg_kernel.get_function('_compare')
        self._reverse_kernel = culinalg_kernel.get_function('_reverse')
        self.X_max_kernel = culinalg_kernel.get_function('X_max')
        self.X_min_kernel = culinalg_kernel.get_function('X_min')
        self.X_sum_kernel = culinalg_kernel.get_function('X_sum')
        self.X_norm_kernel = culinalg_kernel.get_function('X_norm')
        self.s_mul_x_kernel = culinalg_kernel.get_function('s_mul_x')
        self.s_add_x_kernel = culinalg_kernel.get_function('s_add_x')
        self.x_add_y_kernel = culinalg_kernel.get_function('x_add_y')
        self.X_add_Y_kernel = culinalg_kernel.get_function('X_add_Y')
        self.x_mul_y_kernel = culinalg_kernel.get_function('x_mul_y')
        self.X_mul_Y_kernel = culinalg_kernel.get_function('X_mul_Y')
        self.x_div_y_kernel = culinalg_kernel.get_function('x_div_y')
        self.X_div_Y_kernel = culinalg_kernel.get_function('X_div_Y')

        self.x_radd_Y_as_Y_kernel = culinalg_kernel.get_function('x_radd_Y_as_Y')
        self.x_cadd_Y_as_Y_kernel = culinalg_kernel.get_function('x_cadd_Y_as_Y')
        self.x_rmul_Y_as_Y_kernel = culinalg_kernel.get_function('x_rmul_Y_as_Y')
        self.x_cmul_Y_as_Y_kernel = culinalg_kernel.get_function('x_cmul_Y_as_Y')
        self.x_radd_Y_as_x_kernel = culinalg_kernel.get_function('x_radd_Y_as_x')
        self.x_cadd_Y_as_x_kernel = culinalg_kernel.get_function('x_cadd_Y_as_x')
        self.x_outer_y_add_O_kernel = culinalg_kernel.get_function('x_outer_y_add_O')
        self.X_router_Y_add_O_kernel = culinalg_kernel.get_function('X_router_Y_add_O')
        self.X_rdot_Y_kernel = culinalg_kernel.get_function('X_rdot_Y')
        
        self.index_to_array_kernel = culinalg_kernel.get_function('index_to_array')

        # Default launch configurations for 2-D, 1-D, and 3-D kernels.
        self._2d_block = (32, 32, 1)
        self._1d_block = (1024, 1, 1)
        self._3d_block = (16, 16, 4)
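
With the block shapes above, a wrapper around one of the registered kernels might look like the following sketch. The `_sigmoid` argument list in the comment is an assumption; the kernel source is not shown in this snippet.

import numpy as np
from pycuda import gpuarray

def sigmoid(self, x_gpu):
    # Hypothetical elementwise launch using the stored 1-D block shape.
    # Assumed kernel signature: _sigmoid(const float *x, float *y, int n)
    n = x_gpu.size
    y_gpu = gpuarray.empty_like(x_gpu)
    grid = ((n + self._1d_block[0] - 1) // self._1d_block[0], 1)
    self._sigmoid_kernel(x_gpu, y_gpu, np.int32(n),
                         block=self._1d_block, grid=grid)
    return y_gpu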
Example #2
from mpi4py import MPI

gpu_initialized = False
gpu_context = None
try:
    # The original fragment begins inside this branch; the enclosing guard is
    # not shown in the source, so the condition below is an assumption.
    if MPI.COMM_WORLD.Get_size() > 1:
        from .parallel import get_id_within_node
        gpuid = get_id_within_node()
        import pycuda.driver
        pycuda.driver.init()
        if gpuid >= pycuda.driver.Device.count():
            print('[' + MPI.Get_processor_name() +
                  '] more processes than available GPUs!')
            #MPI.COMM_WORLD.Abort()
            raise RuntimeError('more processes than available GPUs')
        gpu_device = pycuda.driver.Device(gpuid)
        gpu_context = gpu_device.make_context()
        gpu_initialized = True
    else:
        import pycuda.autoinit
        gpu_initialized = True
except Exception:
    pass

# cuBLAS support is optional; silently skip it if unavailable.
try:
    from scikits.cuda import cublas
    import scikits.cuda.linalg as culinalg
    culinalg.init()
    cublas_handle = cublas.cublasCreate()
except Exception:
    pass


def closeGPU():
    if gpu_context is not None:
        gpu_context.detach()
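
A hypothetical usage of this setup/teardown pair (the module name gpu_setup is an assumption, not from the original):

import gpu_setup  # assumed name for the module above

if gpu_setup.gpu_initialized:
    try:
        pass  # ... GPU work here ...
    finally:
        gpu_setup.closeGPU()  # detach the context created by make_context()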
Example #3
def setUp(self):
    np.random.seed(0)
    linalg.init()
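
These setUp snippets call linalg.init() once per test. A minimal self-contained sketch of the pattern; the dot-product test is illustrative, not from the original:

import unittest
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
import scikits.cuda.linalg as linalg

class TestDot(unittest.TestCase):
    def setUp(self):
        np.random.seed(0)
        linalg.init()

    def test_dot(self):
        a = np.random.rand(4, 4).astype(np.float32)
        b = np.random.rand(4, 4).astype(np.float32)
        c = linalg.dot(gpuarray.to_gpu(a), gpuarray.to_gpu(b)).get()
        np.testing.assert_allclose(c, a.dot(b), rtol=1e-4)

if __name__ == '__main__':
    unittest.main()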
Example #4
File: batchtps.py Project: rll/lfd
#!/usr/bin/env python

from __future__ import division
import h5py
import sys

import numpy as np
import scipy.spatial.distance as ssd
import pycuda.driver as drv
from pycuda import gpuarray
from scikits.cuda import linalg

from lfd.tpsopt import tps

linalg.init()

from lfd.tpsopt.tps import tps_kernel_matrix, tps_eval
from lfd.tpsopt.culinalg_exts import dot_batch_nocheck, get_gpu_ptrs
from lfd.tpsopt.precompute import downsample_cloud, batch_get_sol_params
from cuda_funcs import (
    init_prob_nm,
    norm_prob_nm,
    get_targ_pts,
    check_cuda_err,
    fill_mat,
    reset_cuda,
    sq_diffs,
    closest_point_cost,
    scale_points,
    gram_mat_dist,
)
Example #5
import pycuda.autoinit  # used below via pycuda.autoinit.device
import pycuda.driver as cuda
import pycuda.compiler as compiler
import pycuda.tools as tools
from pycuda.compiler import SourceModule

import numpy as np

import scikits.cuda.linalg as culinalg
import scikits.cuda.misc as cumisc
import string

from ctypes import cdll, CDLL
cdll.LoadLibrary("/usr/local/lib/libCudaKernelLibrary.so")
kmeansLib = CDLL("/usr/local/lib/libCudaKernelLibrary.so")

culinalg.init()

# Double precision requires compute capability >= 1.3; this snippet is more
# conservative and only enables it on devices with compute capability >= 5.0:
demo_types = [np.float32]
if cumisc.get_compute_capability(pycuda.autoinit.device) >= 5.0:
    demo_types.extend([np.float64])

for t in demo_types:
    np.random.seed(seed=42)
    m = 899946
    n = 129
    k = 1024

    print('Testing matrix multiplication for type ' + str(np.dtype(t)))
    a = np.asarray(np.random.rand(m,n), t)
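
The example is cut off here; presumably it proceeds to the GPU matrix product. A minimal sketch of how the multiplication could continue with scikits.cuda (b and the verification are illustrative, not from the original):

from pycuda import gpuarray

b = np.asarray(np.random.rand(n, k), t)
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
c_gpu = culinalg.dot(a_gpu, b_gpu)   # (m, n) x (n, k) -> (m, k), on the GPU
assert np.allclose(c_gpu.get(), np.dot(a, b), atol=1e-3)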
Example #6
def setUp(self):
    linalg.init()
Example #7
from abc import ABCMeta, abstractmethod

import numpy as np
import time as t

import pycuda.driver as cuda
from pycuda import gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel
import pycuda.curandom as curandom
import pycuda.cumath as cumath

import scikits.cuda.linalg as linalg
linalg.init()


class Layer:
    __metaclass__ = ABCMeta

    @abstractmethod
    def updateOutputs(self, inputs): pass

    @abstractmethod
    def updateGradient(self, previous_grad, include_prior): pass

    @abstractmethod
    def setWeights(self, new_weights): pass

    @abstractmethod
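
The listing is truncated at the last @abstractmethod. For illustration, a minimal concrete subclass sketch; the linear layer and all of its names are assumptions, not from the original (the truncated fourth abstract method would also need an implementation before instantiation):

class LinearLayer(Layer):
    # Hypothetical dense layer built on the abstract interface above.
    def __init__(self, weights):
        self.weights = gpuarray.to_gpu(np.asarray(weights, np.float32))

    def updateOutputs(self, inputs):
        # outputs = inputs . W, computed on the GPU via scikits.cuda
        self.outputs = linalg.dot(inputs, self.weights)
        return self.outputs

    def updateGradient(self, previous_grad, include_prior):
        # gradient w.r.t. the inputs: previous_grad . W^T
        return linalg.dot(previous_grad, self.weights, transb='T')

    def setWeights(self, new_weights):
        self.weights = gpuarray.to_gpu(np.asarray(new_weights, np.float32))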
Example #8
import time
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy as np
import scikits.cuda.linalg as cla
import numpy.linalg as la
import scikits.cuda.cula as cula

cla.init()

def testForSize(x):
    print('Image Size %dx%d' % (x, x))

    x = np.random.rand(x**2, 40).astype(np.float32)

    def svdoverwrite(a_gpu, u_gpu, s_gpu, v_gpu, m, n, lda, ldu, ldvt):
        data_type = a_gpu.dtype.type
        real_type = np.float32
        cula_func = cula._libcula.culaDeviceSgesvd
        jobu = 'S'
        jobvt = 'S'

        status = cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata),
                           lda, int(s_gpu.gpudata), int(u_gpu.gpudata),
                           ldu, int(v_gpu.gpudata), ldvt)

        cula.culaCheckStatus(status)

        # Free internal CULA memory:
        cula.culaFreeBuffers()
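
A minimal sketch of how svdoverwrite might be invoked. The column-major (Fortran-order) layout and leading dimensions below follow standard gesvd semantics for jobu = jobvt = 'S'; they are assumptions, not taken from the original:

# Hypothetical call: economy SVD of a 1600x40 column-major matrix.
m, n = 1600, 40                      # dimensions as seen by CULA (column-major)
k = min(m, n)
a = np.asfortranarray(np.random.rand(m, n).astype(np.float32))
a_gpu = gpuarray.to_gpu(a)           # column-major buffer, as CULA expects
u_gpu = gpuarray.empty((m, k), np.float32, order='F')
s_gpu = gpuarray.empty(k, np.float32)
v_gpu = gpuarray.empty((k, n), np.float32, order='F')
svdoverwrite(a_gpu, u_gpu, s_gpu, v_gpu, m, n, m, m, k)
s = s_gpu.get()                      # singular values, in descending order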
Example #9
def setUp(self):
    linalg.init()
Example #10
def gaussian_process(data, feedback, feedback_indices, float_type=np.float32, int_type=np.int32,
                     kernel_file=None, debug=False, K_noise=None,
                     K_xx_noise=None):
    #t = time.time()
    if kernel_file is None:
        kernel_file = os.path.dirname(os.path.realpath(__file__)) + '/kernels.c'
    if debug:
        print("Initialized starts")
        print("Loading test data")
#        with open('feedback.txt') as infile:
#            feedback = np.loadtxt(infile)
#        with open('feat.txt') as infile:
#            data = np.loadtxt(infile)
#        with open('feedback_idx.txt') as infile:
#            feedback_indices = np.loadtxt(infile)
        np.set_printoptions(linewidth=500)
    import pycuda.autoinit

    block_size = (8, 8, 16)
    n_features = np.int32(np.size(data, 1))  # TODO: Assuming the n_features is divisible by block_size[2]
    with open(kernel_file, 'r') as f:
        cuda_module = f.read()
    try:
        cuda_module = SourceModule(cuda_module)
    except Exception as e:
        print(e)
        raise  # a failed kernel compile cannot be recovered from below

    # Initialize variables
    # Pad everything to match the block size
    # Add a zero row to the beginning of the feature matrix for zero padding in CUDA operations. TODO: is this necessary?
    n_total = int_type(np.size(data, 0))
    data = np.asfarray(data, dtype=float_type)
    n_feedback = np.size(feedback_indices, 0)
    n_feedback_padded = round_up_to_blocksize(n_feedback, block_size, int_type)  # Pad to match block size
    feedback_indices = np.asarray(feedback_indices, dtype=int_type)
    predict_indices = np.setdiff1d(np.arange(n_total), feedback_indices)
    n_predict = int_type(len(data) - len(feedback_indices))
    n_predict_padded = round_up_to_blocksize(n_predict, block_size, int_type)
    feedback_indices = pad_vector(feedback_indices, n_feedback, n_feedback_padded, dtype=int_type)
    predict_indices = pad_vector(predict_indices, n_predict, n_predict_padded, dtype=int_type)
    K = np.zeros((n_feedback_padded, n_feedback_padded), dtype=float_type)
    K_x = np.zeros((n_predict_padded, n_feedback_padded), dtype=float_type)
    K_xK = np.zeros((n_predict_padded, n_feedback_padded), dtype=float_type)
    if K_noise is None:
        K_noise = np.random.normal(1, 0.1, n_feedback)  # Generate diagonal noise
    # Save K diagonal noise

    K_noise = pad_vector(K_noise, n_feedback, n_feedback_padded, dtype=float_type)
    K_inv = np.asfarray(K, dtype=float_type)
    diag_K_xx = None
    if K_xx_noise is None:
        diag_K_xx = np.random.normal(1, 0.1, n_predict)
    else:
        diag_K_xx = K_xx_noise
    # Save K_xx random noise

    diag_K_xx = pad_vector(diag_K_xx, n_predict, n_predict_padded, dtype=float_type)
    diag_K_xKK_x_T = np.zeros((1, n_predict_padded), dtype=float_type)
    variance = np.zeros((1, n_predict_padded), dtype=float_type)
    feedback = np.array(feedback)
    feedback = pad_vector(feedback, n_feedback, n_feedback_padded, dtype=float_type)
    mean = np.zeros((1, n_predict_padded), dtype=float_type)

    # Allocate GPU memory and copy data, check datatype before each allocation
    # TODO: add dimension checking
    check_type(data, float_type)
    data_gpu = drv.mem_alloc(data.nbytes)
    drv.memcpy_htod(data_gpu, data)
    check_type(feedback_indices, int_type)
    feedback_indices_gpu = drv.mem_alloc(feedback_indices.nbytes)
    drv.memcpy_htod(feedback_indices_gpu, feedback_indices)
    check_type(K, float_type)
    K_gpu = drv.mem_alloc(K.nbytes)
    drv.memcpy_htod(K_gpu, K)
    check_type(K_inv, float_type)
    K_inv_gpu = drv.mem_alloc(K_inv.nbytes)
    check_type(K_noise, float_type)
    K_noise_gpu = drv.mem_alloc(K_noise.nbytes)
    drv.memcpy_htod(K_noise_gpu, K_noise)
    check_type(K_x, float_type)
    K_x_gpu = drv.mem_alloc(K_x.nbytes)
    drv.memcpy_htod(K_x_gpu, K_x)
    check_type(predict_indices, int_type)
    predict_indices_gpu = drv.mem_alloc(predict_indices.nbytes)
    drv.memcpy_htod(predict_indices_gpu, predict_indices)
    check_type(K_xK, float_type)
    K_xK_gpuarr = drv.mem_alloc(K_xK.nbytes)  # note: rebound to the linalg.dot result below
    check_type(diag_K_xx, float_type)
    diag_K_xx_gpu = drv.mem_alloc(diag_K_xx.nbytes)
    drv.memcpy_htod(diag_K_xx_gpu, diag_K_xx)
    check_type(diag_K_xKK_x_T, float_type)
    diag_K_xKK_x_T_gpu = drv.mem_alloc(diag_K_xKK_x_T.nbytes)
    drv.memcpy_htod(diag_K_xKK_x_T_gpu, diag_K_xKK_x_T)
    check_type(variance, float_type)
    variance_gpu = drv.mem_alloc(variance.nbytes)
    check_type(feedback, float_type)
    feedback_gpu = drv.mem_alloc(feedback.nbytes)
    drv.memcpy_htod(feedback_gpu, feedback)
    check_type(mean, float_type)
    mean_gpu = drv.mem_alloc(mean.nbytes)

    # Initialization done
    # Actual GP calculations begin here
    calc_K(cuda_module, block_size, n_features, n_feedback_padded, data_gpu, feedback_indices_gpu, K_noise_gpu, K_gpu)
    drv.memcpy_dtoh(K, K_gpu)
    if debug:
        K_test_features = np.asfarray([data[i] for i in feedback_indices], dtype=float_type)
        K_test = dist.cdist(K_test_features, K_test_features, 'cityblock') / n_features + np.diag(K_noise)
        check_result('K', K[:n_feedback, :n_feedback], K_test[:n_feedback, :n_feedback])

    K_inv = invert_K(n_feedback, n_feedback_padded, float_type, K_gpu, K_inv_gpu)

    calc_K_x(cuda_module, block_size, n_feedback_padded, n_predict_padded, n_features, feedback_indices_gpu,
             predict_indices_gpu, data_gpu, K_x_gpu)
    drv.memcpy_dtoh(K_x, K_x_gpu)
    if debug:
        K_x_test = np.zeros((n_predict_padded, n_feedback_padded), dtype=float_type)
        for i, idx1 in enumerate(predict_indices):
            for j, idx2 in enumerate(feedback_indices):
                vdist = distance(data[idx1], data[idx2]) / len(data[0])
                K_x_test[i][j] = vdist
        check_result('K_x', K_x[:n_predict, :n_feedback], K_x_test[:n_predict, :n_feedback])

    linalg.init()
    K_inv_gpuarr = gpuarray.to_gpu(K_inv.astype(float_type))
    K_x_gpuarr = gpuarray.to_gpu(K_x.astype(float_type))
    K_xK_gpuarr = linalg.dot(K_x_gpuarr, K_inv_gpuarr)
    K_xK = K_xK_gpuarr.get()

    if debug:
        #drv.memcpy_dtoh(K_xK, K_xK_gpuarr)
        K_xK_test = (np.matrix(K_x) * np.matrix(K_inv))
        check_result('K_xK', K_xK[:n_predict, :n_feedback], K_xK_test[:n_predict, :n_feedback])
        print(K_xK.shape)

    calc_K_xKK_x_T(cuda_module, block_size, n_feedback_padded, n_predict_padded, K_xK_gpuarr, K_x_gpu, diag_K_xKK_x_T_gpu)
    drv.memcpy_dtoh(diag_K_xKK_x_T, diag_K_xKK_x_T_gpu)
    if debug:
        K_xKK_x_T_test = np.diag(np.matrix(K_xK) * np.matrix(K_x).T)
        check_result("K_xKK_x_T", diag_K_xKK_x_T, K_xKK_x_T_test)

    calc_variance(cuda_module, block_size, n_predict_padded, diag_K_xx_gpu, diag_K_xKK_x_T_gpu, variance_gpu)
    drv.memcpy_dtoh(variance, variance_gpu)
    if debug:
        variance_test = np.abs(np.subtract(diag_K_xx[:n_predict], diag_K_xKK_x_T[:, :n_predict]))
        check_result('Variance', variance[:, :n_predict], variance_test[:, :n_predict])
    feedback = np.atleast_2d(feedback).T
    feedback_gpuarr = gpuarray.to_gpu(feedback.astype(float_type))
    mean_gpuarr = linalg.dot(K_xK_gpuarr, feedback_gpuarr)
    mean = mean_gpuarr.get()
    #mean = np.dot(K_xK, feedback)
    if debug:
        #drv.memcpy_dtoh(mean, mean_gpu)
        mean_test = np.dot(K_xK, feedback)
        check_result('Mean', mean[:n_predict], mean_test[:n_predict])

    if debug:
        # Calculate full result
        feedback = feedback[:n_feedback]
        feedback_indices = feedback_indices[:n_feedback]
        predict_indices = predict_indices[:n_predict]
        test_K_noise = K_noise[:n_feedback]
        test_K_xx = diag_K_xx[:n_predict]
        test_K_features = np.asfarray([data[i] for i in feedback_indices], dtype=float_type)
        test_K = dist.cdist(test_K_features, test_K_features, 'cityblock') / n_features + np.diag(test_K_noise)
        test_K_inv = np.linalg.inv(test_K[:n_feedback, :n_feedback])
        test_K_x = np.zeros((n_predict, n_feedback), dtype=float_type)
        for i, idx1 in enumerate(predict_indices):
            for j, idx2 in enumerate(feedback_indices):
                vdist = distance(data[idx1], data[idx2]) / len(data[0])
                test_K_x[i][j] = vdist
        test_K_xK = np.dot(test_K_x, test_K_inv)
        test_K_xKK_x_T = np.diag(np.dot(test_K_xK, test_K_x.T))
        test_variance = np.sqrt(np.abs(np.subtract(test_K_xx, test_K_xKK_x_T)))  # note: applies a sqrt, unlike the GPU variance checked above
        test_mean = np.dot(test_K_xK, feedback)

        print(np.allclose(variance.flatten()[:n_predict], test_variance))
        print(np.allclose(mean[:n_predict], test_mean))

        print('Variance isclose True count:', sum(np.isclose(variance.flatten()[:n_predict], test_variance)))
        print('Mean isclose True count:', sum(np.isclose(mean.flatten()[:n_predict], test_mean)))
        print('Mean differences (first 10):', np.subtract(mean.flatten()[:10], test_mean[:10]))
        print(mean.flatten()[:10])
        print(test_mean[:10])

    # Write results to files for testing
    mean = mean.flatten()[:n_predict]
    variance = variance.flatten()[:n_predict]
    #print(time.time() - t)
    return mean, variance
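
A minimal usage sketch. The data sizes below are assumptions, and the function relies on helpers used above (round_up_to_blocksize, pad_vector, calc_K, etc.) being defined in the same module, along with a kernels.c file next to it:

import numpy as np

# Hypothetical driver: 200 items with 32 features, feedback on the first 50.
data = np.random.rand(200, 32)
feedback_indices = np.arange(50)
feedback = np.random.rand(50)

mean, variance = gaussian_process(data, feedback, feedback_indices,
                                  float_type=np.float32, int_type=np.int32)
print(mean.shape, variance.shape)  # one prediction per non-feedback item: (150,) (150,)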