Example 1
def transpose(a):
    '''
    https://github.com/lebedov/scikit-cuda/issues/33
    pip install --upgrade --no-deps git+https://github.com/lebedov/scikits.cuda.git

    :return: the transpose of `a` as a pycuda GPUArray of float64, still on the GPU
    '''
    import time
    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import scikits.cuda.cublas as cublas

    handle = cublas.cublasCreate()
    # N = 1000
    # a = np.random.rand(N, N)
    R = a.shape[0]
    C = a.shape[1]
    a_gpu = gpuarray.to_gpu(a)  # assumes a is float64, since Dgeam operates on doubles
    a_trans_gpu = gpuarray.zeros((C, R), dtype=np.double)
    alpha = 1.0
    beta = 0.0
    start = time.time()
    # cuBLAS is column-major: the R x C row-major array is seen as a C x R matrix
    # with leading dimension C, so op(A) = 't' yields the R x C result (leading dim R).
    cublas.cublasDgeam(handle, 't', 'n', R, C,
                       alpha, a_gpu.gpudata, C,
                       beta, a_trans_gpu.gpudata, R,
                       a_trans_gpu.gpudata, R)
    print time.time()-start
    # assert np.allclose(a_trans_gpu.get(), a.T)
    cublas.cublasDestroy(handle)

    return a_trans_gpu
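
A minimal usage sketch for the function above, assuming a CUDA-capable device is available and transpose() is importable in the current scope (the sizes below are illustrative only):

import numpy as np

a = np.random.rand(500, 300)           # non-square float64 input
a_t = transpose(a)                     # runs cublasDgeam on the GPU, returns a GPUArray
assert np.allclose(a_t.get(), a.T)     # copy back to the host and compare with NumPy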
Example 2
def calc_x(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh):
    handle = cb.cublasCreate()
    
    if rp1 is not None:
        rp1 = garr.to_gpu(sp.asarray(rp1))
    if lm2 is not None:
        lm2 = garr.to_gpu(sp.asarray(lm2))
    
    lm1_s = garr.to_gpu(sp.asarray(lm1_s))
    lm1_si = garr.to_gpu(sp.asarray(lm1_si))
    
    r_s = garr.to_gpu(sp.asarray(r_s))
    r_si = garr.to_gpu(sp.asarray(r_si))
    
    A = list(map(garr.to_gpu, A))
    if Am1 is not None:
        Am1 = list(map(garr.to_gpu, Am1))
    if Ap1 is not None:
        Ap1 = list(map(garr.to_gpu, Ap1))
    
    Vsh = list(map(garr.to_gpu, Vsh))
    
    if Cm1 is not None:
        Cm1 = [[garr.to_gpu(Cm1[t, s]) for t in range(Cm1.shape[1])] for s in range(Cm1.shape[0])]
        
    if not (C is None and Kp1 is None):
        C = [[garr.to_gpu(C[s, t]) for t in range(C.shape[1])] for s in range(C.shape[0])]
        Kp1 = garr.to_gpu(Kp1)
    
    x = calc_x_G(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh, handle=handle)
        
    cb.cublasDestroy(handle)
    
    return x.get()
Example 3
def classify(image_names, model_file_name, output_names):
    """
    Classify a set of images using the given model.
    
    Parameters
    ----------
    image_names : iterable of strings
        names of the input images
    model_file_name : string
        name of the file containing the model
    output_names : iterable of strings
        names of the output images
    
    Notes
    -----
    image_names and output_names must have the same length, with matching indices, i.e. image_names[idx] -> output_names[idx].
    """
    handle = cublas.cublasCreate()
    model = serial.load(model_file_name)
    outputs = []
    for image_name, output_name in zip(image_names, output_names):
        image = load_image(image_name)
        output = classify_image(image, model, handle)
        save_image(np.int32(np.round(output*255)), output_name)
    cublas.cublasDestroy(handle)
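
A short usage sketch for classify(); the file and model names are hypothetical and only illustrate that the two name lists must line up index by index:

image_names = ['img_%03d.png' % i for i in range(4)]     # hypothetical input files
output_names = ['out_%03d.png' % i for i in range(4)]    # same length, matching indices
classify(image_names, 'model.pkl', output_names)          # 'model.pkl' is a placeholder path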
Example 4
    def __init__(self, A1, A2, left, use_batch=False):
        """Creates a new LinearOperator interface to the superoperator E.
        
        This is a wrapper to be used with SciPy's sparse linear algebra routines.
        
        Parameters
        ----------
        A1 : ndarray
            Ket parameter tensor. 
        A2 : ndarray
            Bra parameter tensor.
        left : bool
            Whether to multiply with a vector to the left (or to the right).
        """
        self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
        self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
        self.tmp = list(map(garr.empty_like, self.A1G[0]))
        self.tmp2 = list(map(garr.empty_like, self.A1G[0]))

        self.use_batch = use_batch
        self.left = left

        self.D = A1[0].shape[1]
        self.shape = (self.D**2, self.D**2)
        self.dtype = sp.dtype(A1[0][0].dtype)

        self.calls = 0

        self.out = garr.empty((self.D, self.D), dtype=self.dtype)
        self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

        if use_batch:
            self.A1G_p = list(map(get_batch_ptrs, self.A1G))
            self.A2G_p = list(map(get_batch_ptrs, self.A2G))
            self.tmp_p = get_batch_ptrs(self.tmp)
            self.tmp2_p = get_batch_ptrs(self.tmp2)
            self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
            self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
        else:
            self.A1G_p = None
            self.A2G_p = None
            self.tmp_p = None
            self.tmp2_p = None
            self.xG_p = None
            self.out_p = None

            self.ones = [
                garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))
            ]
            self.ones = [one.fill(1) for one in self.ones]
            self.zeros = [
                garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))
            ]

            self.streams = []
            for s in range(A1[0].shape[0]):
                self.streams.append(cd.Stream())

        self.hdl = cb.cublasCreate()
Example 5
def _initialize_cublas():
  global sgemm

  try:
    # older scikits.cuda exposes the legacy global-state API: cublasInit(),
    # with Sgemm callable directly (no handle argument)
    cublas.cublasInit()
    sgemm = cublas.cublasSgemm
  except AttributeError:
    # newer versions drop cublasInit(); create a handle and bind it into a wrapper
    handle = cublas.cublasCreate()
    def sgemm(*args):
      cublas.cublasSgemm(handle, *args)
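
Once _initialize_cublas() has run, code in the same module can call sgemm without caring which API was found. A minimal sketch of a row-major matrix product expressed through the column-major BLAS (argument order as in scikits.cuda: transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

m, k, n = 4, 3, 5
a = gpuarray.to_gpu(np.random.rand(m, k).astype(np.float32))
b = gpuarray.to_gpu(np.random.rand(k, n).astype(np.float32))
c = gpuarray.empty((m, n), np.float32)

_initialize_cublas()
# row-major arrays look transposed to cuBLAS, so compute c^T = b^T * a^T
# by passing the operands in swapped order
sgemm('n', 'n', n, m, k, 1.0, b.gpudata, n, a.gpudata, k, 0.0, c.gpudata, n)
assert np.allclose(c.get(), a.get().dot(b.get()), atol=1e-5)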
Example 6
    def __init__(self, A1, A2, left, use_batch=False):
        """Creates a new LinearOperator interface to the superoperator E.
        
        This is a wrapper to be used with SciPy's sparse linear algebra routines.
        
        Parameters
        ----------
        A1 : ndarray
            Ket parameter tensor. 
        A2 : ndarray
            Bra parameter tensor.
        left : bool
            Whether to multiply with a vector to the left (or to the right).
        """
        self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
        self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
        self.tmp = list(map(garr.empty_like, self.A1G[0]))
        self.tmp2 = list(map(garr.empty_like, self.A1G[0]))
        
        self.use_batch = use_batch
        self.left = left
        
        self.D = A1[0].shape[1]        
        self.shape = (self.D**2, self.D**2)        
        self.dtype = sp.dtype(A1[0][0].dtype)
        
        self.calls = 0        
        
        self.out = garr.empty((self.D, self.D), dtype=self.dtype)        
        self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

        if use_batch:
            self.A1G_p = list(map(get_batch_ptrs, self.A1G))
            self.A2G_p = list(map(get_batch_ptrs, self.A2G))
            self.tmp_p = get_batch_ptrs(self.tmp)
            self.tmp2_p = get_batch_ptrs(self.tmp2)
            self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
            self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
        else:
            self.A1G_p = None
            self.A2G_p = None
            self.tmp_p = None
            self.tmp2_p = None
            self.xG_p = None
            self.out_p = None

            self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            self.ones = [one.fill(1) for one in self.ones]
            self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            
            self.streams = []
            for s in range(A1[0].shape[0]):
                self.streams.append(cd.Stream())
        
        self.hdl = cb.cublasCreate()
Example 7
def _initialize_cublas():
    global sgemm

    util.log_info('Initializing cublas.')
    try:
        cublas.cublasInit()
        sgemm = cublas.cublasSgemm
    except AttributeError:
        handle = cublas.cublasCreate()

        def sgemm(*args):
            cublas.cublasSgemm(handle, *args)
Example 8
 def __init__(self, name, gpu_id):
     """
     name: name of the node, can be any arbitrary string
     gpu_id: the integer id of the GPU that this Node should be running on
     """
     Node.__init__(self, name)
     self.ctx = driver.Device(gpu_id).make_context()
     self.device = self.ctx.get_device()
     print 'Executing on device at PCI ID:', self.device.pci_bus_id()
     self.handle = cublas.cublasCreate()
     self.gpu_id = gpu_id
     self.is_cpu = False
     self.is_gpu = True
Example 9
def classify(image_names, model_file_name, output_names):
    """
    Classify a set of images using the given model.
    
    Parameters
    ----------
    image_names : iterable of strings
        names of the input images
    model_file_name : string
        name of the file containing the model
    output_names : iterable of strings
        names of the output images
    
    Notes
    -----
    image_names and output_names must have the same length, with matching indices, i.e. image_names[idx] -> output_names[idx].
    This network copies the weights to the GPU once and reuses them for all images, as it should; it can serve
    as a model for making the same change to the fully connected network.
    """
    handle = cublas.cublasCreate()
    model = serial.load(model_file_name)

    layers = model.layers
    convs = layers[:-1]; softmax = layers[-1];
    convs = map(lambda layer: layer.get_params(), convs)
    kernels = map(lambda layer: np.array(layer[0].eval()), convs)

    #This can be simplified
    kernels = map(lambda kernel: np.ascontiguousarray(np.rollaxis(kernel, 0, 3)), kernels)
    kdims = map(lambda kernel: kernel.shape, kernels)
    kernels = map(lambda layer: layer[0].dimshuffle(3, 0, 1, 2).eval(), convs)
    kernels = map(lambda kernel, kdim: kernel.reshape(kdim), kernels, kdims)
    
    
    biases = map(lambda layer: np.array(layer[1].eval()), convs)
    bias_dims = map(lambda bias: bias.shape, biases)
    max_sizes = map(lambda layer: layer.pool_shape + [layer.num_pieces], layers[:-1])
    
    weights = softmax.get_params()[1]; bias = softmax.get_params()[0];
    
    soft_weights = softmax.get_params()[1].reshape((3, 3, 32, 2)).dimshuffle(3, 2, 0, 1).eval()
    soft_weights = np.ascontiguousarray(np.reshape(soft_weights, (2, 288)).transpose())
    soft_bias = softmax.get_params()[0].get_value()[::1]

    window = layers[0].input_space.shape
    outputs = []
    for image_name, output_name in zip(image_names, output_names):
        image = load_image(image_name)
        output = classify_image(image, model, kernels, biases, max_sizes, soft_weights, soft_bias, window, handle)
        save_image(np.int8(np.round(output*255)), output_name)
    cublas.cublasDestroy(handle)
Example 10
    def __init__(self, p, A1, A2, l=None, r=None, left=False, pseudo=True, use_batch=False):
        assert not (pseudo and (l is None or r is None)), 'For pseudo-inverse l and r must be set!'
        
        self.use_batch = use_batch
        self.p = p
        self.left = left
        self.pseudo = pseudo
        self.D = A1[0].shape[1]
        self.shape = (self.D**2, self.D**2)
        self.dtype = A1[0].dtype
        
        self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
        self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
        self.tmp = list(map(garr.empty_like, self.A1G[0]))
        self.tmp2 = list(map(garr.empty_like, self.A1G[0]))

        self.l = l
        self.r = r
        self.lG = garr.to_gpu(sp.asarray(l))
        self.rG = garr.to_gpu(sp.asarray(r))
        
        self.out = garr.empty((self.D, self.D), dtype=self.dtype)
        self.out2 = garr.empty((self.D, self.D), dtype=self.dtype)
        self.xG = garr.empty((self.D, self.D), dtype=self.dtype)
        
        if use_batch:
            self.A1G_p = list(map(get_batch_ptrs, self.A1G))
            self.A2G_p = list(map(get_batch_ptrs, self.A2G))
            self.tmp_p = get_batch_ptrs(self.tmp)
            self.tmp2_p = get_batch_ptrs(self.tmp2)
            self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
            self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
            self.out2_p = get_batch_ptrs([self.out2] * len(A1[0]))
        else:
            self.A1G_p = None
            self.A2G_p = None
            self.tmp_p = None
            self.tmp2_p = None
            self.xG_p = None
            self.out_p = None
            self.out2_p = None

            self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            self.ones = [one.fill(1) for one in self.ones]
            self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            
            self.streams = []
            for s in range(A1[0].shape[0]):
                self.streams.append(cd.Stream())
        
        self.hdl = cb.cublasCreate()
Example 11
def calc_x(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh):
    handle = cb.cublasCreate()

    if rp1 is not None:
        rp1 = garr.to_gpu(sp.asarray(rp1))
    if lm2 is not None:
        lm2 = garr.to_gpu(sp.asarray(lm2))

    lm1_s = garr.to_gpu(sp.asarray(lm1_s))
    lm1_si = garr.to_gpu(sp.asarray(lm1_si))

    r_s = garr.to_gpu(sp.asarray(r_s))
    r_si = garr.to_gpu(sp.asarray(r_si))

    A = list(map(garr.to_gpu, A))
    if Am1 is not None:
        Am1 = list(map(garr.to_gpu, Am1))
    if Ap1 is not None:
        Ap1 = list(map(garr.to_gpu, Ap1))

    Vsh = list(map(garr.to_gpu, Vsh))

    if Cm1 is not None:
        Cm1 = [[garr.to_gpu(Cm1[t, s]) for t in range(Cm1.shape[1])]
               for s in range(Cm1.shape[0])]

    if not (C is None and Kp1 is None):
        C = [[garr.to_gpu(C[s, t]) for t in range(C.shape[1])]
             for s in range(C.shape[0])]
        Kp1 = garr.to_gpu(Kp1)

    x = calc_x_G(Kp1,
                 C,
                 Cm1,
                 rp1,
                 lm2,
                 Am1,
                 A,
                 Ap1,
                 lm1_s,
                 lm1_si,
                 r_s,
                 r_si,
                 Vsh,
                 handle=handle)

    cb.cublasDestroy(handle)

    return x.get()
Example 12
    def make_thunk(self, node, storage_map, _, _2):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        num_streams = 32  # 32

        handle = [cublas.cublasCreate()]
        stream_pool = [pycuda.driver.Stream() for _ in xrange(num_streams)]
        current_stream = [0]

        def thunk():
            x = inputs[0]
            y = inputs[1]

            # chop off the real/imag dimension
            input_shape_x = x[0].shape  # (a, b, 2)
            input_shape_y = y[0].shape  # (b, c, 2)

            output_shape = (input_shape_x[0], input_shape_y[1], 2)  # (a, c, 2)

            input_x_pycuda = to_complex_gpuarray(x[0])
            input_y_pycuda = to_complex_gpuarray(y[0])

            # multistream experiment
            # print "DEBUG: Setting stream to %d" % current_stream[0]

            # prev_stream_obj = stream_pool[(current_stream[0] - 1) % num_streams]
            # print "PREV STREAM IS DONE?"
            # print prev_stream_obj.is_done()
            # print

            stream_obj = stream_pool[current_stream[0]]
            cublas.cublasSetStream(handle[0], stream_obj.handle)
            current_stream[0] += 1
            current_stream[0] %= num_streams
            # print "DEBUG: set next stream id to %d" % current_stream[0]

            output_pycuda = linalg.dot(input_x_pycuda,
                                       input_y_pycuda,
                                       handle=handle[0])

            outputs[0][0] = to_complex_cudandarray(output_pycuda)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
Example 13
def get_cublas_handle():
    """Gets CUBLAS handle for the current device.

    Returns:
        CUBLAS handle.

    """
    global _cublas_handles

    device = Context.get_device()
    if device in _cublas_handles:
        return _cublas_handles[device]

    handle = cublas.cublasCreate()
    _cublas_handles[device] = handle
    return handle
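
A minimal sketch of the caching behaviour, assuming _cublas_handles is initialised elsewhere as an empty dict and a device is already current:

h1 = get_cublas_handle()
h2 = get_cublas_handle()
assert h1 == h2    # same device: the cached handle is reused, cublasCreate runs only once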
Example 14
def get_cublas_handle():
    """Gets CUBLAS handle for the current device.

    Returns:
        CUBLAS handle.

    """
    global _cublas_handles

    device = Context.get_device()
    if device in _cublas_handles:
        return _cublas_handles[device]

    handle = cublas.cublasCreate()
    _cublas_handles[device] = handle
    return handle
Example 15
    def make_thunk(self, node, storage_map, _, _2):
        inputs = [ storage_map[v] for v in node.inputs]
        outputs = [ storage_map[v] for v in node.outputs]

        num_streams = 32 # 32

        handle = [cublas.cublasCreate()]
        stream_pool = [pycuda.driver.Stream() for _ in xrange(num_streams)]
        current_stream = [0]

        def thunk():
            x = inputs[0]
            y = inputs[1]

            # chop off the real/imag dimension
            input_shape_x = x[0].shape # (a, b, 2)
            input_shape_y = y[0].shape # (b, c, 2)

            output_shape = (input_shape_x[0], input_shape_y[1], 2) # (a, c, 2)

            input_x_pycuda = to_complex_gpuarray(x[0])
            input_y_pycuda = to_complex_gpuarray(y[0])

            # multistream experiment
            # print "DEBUG: Setting stream to %d" % current_stream[0]

            # prev_stream_obj = stream_pool[(current_stream[0] - 1) % num_streams]
            # print "PREV STREAM IS DONE?"
            # print prev_stream_obj.is_done()
            # print

            stream_obj = stream_pool[current_stream[0]]
            cublas.cublasSetStream(handle[0], stream_obj.handle)
            current_stream[0] += 1
            current_stream[0] %= num_streams
            # print "DEBUG: set next stream id to %d" % current_stream[0]

            output_pycuda = linalg.dot(input_x_pycuda, input_y_pycuda, handle=handle[0])

            outputs[0][0] = to_complex_cudandarray(output_pycuda)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
Example 16
def test_with_test_data():
    handle = cublas.cublasCreate()
    # 3 vars model + constant
    XY = np.loadtxt(open("TestData/Y=2X1+3X2+4X3+5_valid_predictors.csv",
                         "rb"),
                    delimiter=",",
                    skiprows=1,
                    dtype=np.float32)
    print "3 vars model, B coefficients"
    print calculate_single_regression(handle, XY)

    # 2 vars model
    XY = np.loadtxt(open("TestData/Y=2X1+3X2+5.csv", "rb"),
                    delimiter=",",
                    skiprows=1,
                    dtype=np.float32)
    print "2 vars model, B coefficients"
    print calculate_single_regression(handle, XY)
Example 17
def main():
    """
    For testing and timing. 
    """
    handle = cublas.cublasCreate() 
    image = np.float32((np.random.rand(1024, 1024) - .5) * 2)
    model = serial.load(model_file_name)
    layers = model.layers
    
    patch_dims = (39, 39)
    # A bug occurs if batch_rows_l is too long, most likely a memory allocation
    # issue that is not reported correctly.
    batch_rows_l = [8]
    batchsizes = map(lambda x: x*(1024-39+1), batch_rows_l)
    pixels = [(x, y) for x in range(1024-39+1) for y in range(1024-39+1)]
    
    # Classify with pylearn2 as well, to check the GPU result (comment out to skip)
    p_output = pylearn2_computation(model, image, patch_dims, batchsizes[0], pixels)
    p_output = np.transpose(p_output)
    num_trials = 1
    for batchsize, batch_rows in zip(batchsizes, batch_rows_l):
        st = time.time()
        for trial in range(num_trials):
            output = gpu_computation(image, patch_dims, batchsize, batch_rows, layers, pixels, handle)
            output = output.get()
        tot = time.time()-st
        print "Batchsize {0}".format(batchsize)
        print "Total time: {0:.4e} seconds".format(tot)
        print "Time per pixel: {0:.4e} seconds".format(tot/len(pixels*num_trials))
        print "Pixels per second: {0:.4e}".format(len(pixels*num_trials)/tot)
    for end in time_ends:
        end.synchronize()
    sgemm_times = map(lambda start, end: end.time_since(start)/1000, time_starts, time_ends)
    tot_sgemm_time = sum(sgemm_times)
    print "Total sgemm time: {0:.4e} seconds\nTotal gflop: {1:.4e}\nGflops: {2:.4e}".format(tot_sgemm_time, sgemm_gflop, sgemm_gflop/tot_sgemm_time)

    #Uncomment to compare results of gpu and pylearn2 classifications 
    #output = output.reshape(1024-39, 1024-39)
    print output, p_output
    
    print np.allclose(p_output[0], output, rtol=1e-04, atol=1e-07)
    cublas.cublasDestroy(handle)
    
    return 
Example 18
    def __init__(self):        
        culinalg.init()
        self.handle = cublas.cublasCreate()
        self._elem_kernel = culinalg_kernel.get_function('_elem')
        self._sigmoid_kernel = culinalg_kernel.get_function('_sigmoid')
        self._log_anti_sigmoid_kernel = culinalg_kernel.get_function('_log_anti_sigmoid')
        self._tanh_kernel = culinalg_kernel.get_function('_tanh')
        self._pow_kernel = culinalg_kernel.get_function('_pow')
        self._sqrt_kernel = culinalg_kernel.get_function('_sqrt')
        self._square_kernel = culinalg_kernel.get_function('_square')
        self._exp_kernel = culinalg_kernel.get_function('_exp')
        self._log_kernel = culinalg_kernel.get_function('_log')
        self._sum_kernel = culinalg_kernel.get_function('_sum')
        self._compare_kernel = culinalg_kernel.get_function('_compare')
        self._reverse_kernel = culinalg_kernel.get_function('_reverse')
        self.X_max_kernel = culinalg_kernel.get_function('X_max')
        self.X_min_kernel = culinalg_kernel.get_function('X_min')
        self.X_sum_kernel = culinalg_kernel.get_function('X_sum')
        self.X_norm_kernel = culinalg_kernel.get_function('X_norm')
        self.s_mul_x_kernel = culinalg_kernel.get_function('s_mul_x')
        self.s_add_x_kernel = culinalg_kernel.get_function('s_add_x')        
        self.x_add_y_kernel = culinalg_kernel.get_function('x_add_y')
        self.X_add_Y_kernel = culinalg_kernel.get_function('X_add_Y')
        self.x_mul_y_kernel = culinalg_kernel.get_function('x_mul_y')
        self.X_mul_Y_kernel = culinalg_kernel.get_function('X_mul_Y')
        self.x_div_y_kernel = culinalg_kernel.get_function('x_div_y')
        self.X_div_Y_kernel = culinalg_kernel.get_function('X_div_Y')

        self.x_radd_Y_as_Y_kernel = culinalg_kernel.get_function('x_radd_Y_as_Y')
        self.x_cadd_Y_as_Y_kernel = culinalg_kernel.get_function('x_cadd_Y_as_Y')
        self.x_rmul_Y_as_Y_kernel = culinalg_kernel.get_function('x_rmul_Y_as_Y')
        self.x_cmul_Y_as_Y_kernel = culinalg_kernel.get_function('x_cmul_Y_as_Y')        
        self.x_radd_Y_as_x_kernel = culinalg_kernel.get_function('x_radd_Y_as_x')
        self.x_cadd_Y_as_x_kernel = culinalg_kernel.get_function('x_cadd_Y_as_x')
        self.x_outer_y_add_O_kernel = culinalg_kernel.get_function('x_outer_y_add_O')
        self.X_router_Y_add_O_kernel = culinalg_kernel.get_function('X_router_Y_add_O')
        self.X_rdot_Y_kernel = culinalg_kernel.get_function('X_rdot_Y')
        
        self.index_to_array_kernel = culinalg_kernel.get_function('index_to_array')

        self._2d_block = (32, 32, 1)
        self._1d_block = (1024, 1, 1)
        self._3d_block = (16, 16, 4)
Example 19
def main():
    m = 64; k = 512; n = 400;
    #m = 2; k = 3; n = 4;
    handle = cublas.cublasCreate()
    _, narrays, batchsize = sys.argv
    narrays = int(narrays); batchsize = int(batchsize);
    
    cols = []; kernels = []; biases = [];
    pcols = []; pkernels = []; pbiases= []; #lists to stores pointers to gpu arrays
    kernel = np.float32((np.random.rand(m, k) -.5) * 2)
    kernel = np.float32(np.reshape(np.arange(0, m*k, 1), [m, k]))
    for i in range(narrays):
        col = np.float32((np.random.rand(k, n) - .5) * 2)
        #col = np.float32(np.reshape(np.arange(0, k*n, 1), [k, n]))
        bias = np.float32(np.zeros((m, n)))
        col_d = gpu.to_gpu(col)
        kernel_d = gpu.to_gpu(kernel)
        bias_d = gpu.to_gpu(bias)
        cols.append(col_d); kernels.append(kernel_d); biases.append(bias_d);
        pcols.append(col_d.ptr); pkernels.append(kernel_d.ptr); pbiases.append(bias_d.ptr);
    pcols = np.array(pcols); pkernels = np.array(pkernels); pbiases = np.array(pbiases); 
    pcols_d = gpu.to_gpu(pcols); pkernels_d = gpu.to_gpu(pkernels); pbiases_d = gpu.to_gpu(pbiases);
    
    for i in range(narrays):
        compute_sgemm(cols[i], kernels[i], biases[i], 0, handle);
    #zero out arrays for checking results
    #for i in range(narrays):
        #print biases[i]
    #    biases[i] -= biases[i]
    print "\n\n"
    for i in range((narrays+batchsize-1)/batchsize):
        start = i*batchsize
        compute_sgemm_batched(pcols_d[start:start+batchsize], pkernels_d[start:start+batchsize], pbiases_d[start:start+batchsize], m, k, n, 0, handle)
    #for i in range(narrays):
    #    print biases[i]
    cublas.cublasDestroy(handle)
Example 20
 def setUp(self):
     self.cublas_handle = cublas.cublasCreate()
Example 21
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from scikits.cuda import cublas
from ipdb import set_trace

print context.get_device().name()

handle = cublas.cublasCreate()

start, end = (drv.Event(), drv.Event())

def cublas_dot(A, B, C, repeat=1):

    lda = max(A.strides) // 4
    ldb = max(B.strides) // 4
    ldc = max(C.strides) // 4

    opA = 't' if A.is_trans else 'n'
    opB = 't' if B.is_trans else 'n'
    op  = opB + opA

    m = A.shape[0]
    n = B.shape[1]
Example 22
def gpu_computation(image, kernels, biases, max_sizes, soft_weights, soft_bias, batches, window_sizes):
    nbatches = len(batches)
    batchsize = len(batches[0])
    npixels = nbatches*batchsize
    layers = len(kernels)
    handle = cublas.cublasCreate()
    results = []
    result_ps = []
    pad = 0; stride = 1; 
    full_image_d = gpu.to_gpu(image)

    image_dims, col_dims, kernel_dims, bias_dims, sgemm_dims, out_dims, ksizes, kchannels_s = compute_dims(image, kernels, biases, max_sizes, batchsize, window_sizes, pad, stride)
    
    b_result = [];
    b_offsets_d = [];
    
    kernels_d = [];
    cols = []; col_ps = [];
    biases_d = [];
    sgemm_biases = []; sgemm_biases_ps = [];
    outputs = [];

    for layer_n, (bias, kernel, sgemm_dim, im_dim, out_dim, max_ksize, ksize, kchannels) in enumerate(zip(biases, kernels, sgemm_dims, image_dims, out_dims, max_sizes, ksizes, kchannels_s)):
        col = gpu.empty((batchsize, sgemm_dim[1], sgemm_dim[2]), np.float32) 
        cols.append(col)
        col_ps.append([col[idx, :, :].ptr for idx in range(batchsize)])
        
        #reuse the same kernels for every pixel
        kernel_d = gpu.to_gpu(kernel)
        kernel_d = kernel_d.reshape(kchannels, ksize*ksize*im_dim[2])
        kernels_d.append(kernel_d)

 
        #contain the actual data of the biases
        bias = bias.reshape(1, bias.shape[2], bias.shape[0]*bias.shape[1])
        batch_bias = np.tile(bias, (batchsize, 1, 1))
        batch_bias_d = gpu.to_gpu(batch_bias)
        biases_d.append(batch_bias_d)
        
        #scratch space to copy biases to and then write output of sgemm to
        sgemm_bias = gpu.empty(batch_bias.shape, np.float32)
        sgemm_biases.append(sgemm_bias)
        
        sgemm_biases_ps.append([sgemm_bias[idx, :, :].ptr for idx in range(batchsize)])

        #space for output of maxpool
        output = gpu.empty((batchsize, out_dim[2], out_dim[0], out_dim[1]), np.float32)
        outputs.append(output)

    #space for final output
    classes = gpu.empty(npixels, np.float32)
    soft_weights_d = gpu.to_gpu(soft_weights)
    soft_bias = soft_bias.reshape(1, soft_bias.shape[0])
    soft_bias_d = gpu.to_gpu(np.ascontiguousarray(np.reshape(np.tile(soft_bias, (batchsize, 1)), (2, batchsize))))
    soft_bias_scratch = gpu.empty((soft_bias_d.shape[0], soft_bias_d.shape[1]), np.float32)

    col_ps_d = gpu.to_gpu(np.array(col_ps))

    kernel_ps = map(lambda x: [x.ptr]*batchsize, kernels_d)
    kernel_ps_d = gpu.to_gpu(np.array(kernel_ps))

    sgemm_biases_ps_d = gpu.to_gpu(np.array(sgemm_biases_ps))

    for batch in batches:
        offsets = comp_offsets(batch, full_image_d)
        offsets_d = gpu.to_gpu(np.int32(np.array(offsets)))
        b_offsets_d.append(offsets_d);

        #space to hold final result of each layer
        result = gpu.empty((out_dims[layers-1][2], out_dims[layers-1][0], out_dims[layers-1][1]), np.float32)
        b_result.append(result)

    for batchn, (batch, offsets_d, result) in enumerate(zip(batches, b_offsets_d, b_result)):

        image_d = full_image_d
        for layer_n, (im_dim, col_dim, kdim, bias_dim, sgemm_dim, out_dim, ksize, kchannels, max_size) in enumerate(zip(image_dims, col_dims, kernel_dims, bias_dims, sgemm_dims, out_dims, ksizes, kchannels_s, max_sizes)):

            sgemm_bias = sgemm_biases[layer_n]
            cu.memcpy_dtod(sgemm_bias.ptr, biases_d[layer_n].ptr, sgemm_bias.nbytes)

            im2col_gpu.compute_im2col_batched(image_d, im_dim[0], im_dim[1], im_dim[2], np.int32(ksize), np.int32(pad), np.int32(stride), offsets_d, layer_n, batchsize, cols[layer_n])
            compute_sgemm_batched(col_ps_d[layer_n], kernel_ps_d[layer_n], sgemm_biases_ps_d[layer_n], handle, sgemm_dim[0], sgemm_dim[1], sgemm_dim[2])
            sgemm_bias = sgemm_bias.reshape(np.int32(batchsize), np.int32(kchannels), col_dim[0], col_dim[1])
            maxpool_gpu.compute_max_batched(sgemm_bias, outputs[layer_n], np.int32(max_size))
            image_d = outputs[layer_n]
        result = outputs[layers-1]
        result = result.reshape(result.shape[0], result.shape[1]*result.shape[2]*result.shape[3]) 
        cu.memcpy_dtod(soft_bias_scratch.ptr, soft_bias_d.ptr, soft_bias_d.nbytes)
        np_soft_weights = soft_weights_d.get()
        np_result = result.get()
        compute_sgemm(soft_weights_d, result, soft_bias_scratch, handle)
        
        offset = batchn*batchsize
        soft_max_in = soft_bias_scratch
        soft_max.compute_soft_max(soft_max_in, classes, offset)
        result_ps.append(result)
        
    cublas.cublasDestroy(handle)
    return classes
Example 23
 def create(self):
     if self.handle is None:
         self.handle = cublas.cublasCreate()
Example 24
 def create(self):
     if self.handle is None:
         self.handle = cublas.cublasCreate()
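
The example only shows lazy creation; a hypothetical counterpart for teardown, written in the same style (not part of the original class), might look like:

 def destroy(self):
     # hypothetical: release the handle created by create(), if any
     if self.handle is not None:
         cublas.cublasDestroy(self.handle)
         self.handle = None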
Example 25
    def __init__(self,
                 p,
                 A1,
                 A2,
                 l=None,
                 r=None,
                 left=False,
                 pseudo=True,
                 use_batch=False):
        assert not (pseudo and
                    (l is None
                     or r is None)), 'For pseudo-inverse l and r must be set!'

        self.use_batch = use_batch
        self.p = p
        self.left = left
        self.pseudo = pseudo
        self.D = A1[0].shape[1]
        self.shape = (self.D**2, self.D**2)
        self.dtype = A1[0].dtype

        self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
        self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
        self.tmp = list(map(garr.empty_like, self.A1G[0]))
        self.tmp2 = list(map(garr.empty_like, self.A1G[0]))

        self.l = l
        self.r = r
        self.lG = garr.to_gpu(sp.asarray(l))
        self.rG = garr.to_gpu(sp.asarray(r))

        self.out = garr.empty((self.D, self.D), dtype=self.dtype)
        self.out2 = garr.empty((self.D, self.D), dtype=self.dtype)
        self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

        if use_batch:
            self.A1G_p = list(map(get_batch_ptrs, self.A1G))
            self.A2G_p = list(map(get_batch_ptrs, self.A2G))
            self.tmp_p = get_batch_ptrs(self.tmp)
            self.tmp2_p = get_batch_ptrs(self.tmp2)
            self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
            self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
            self.out2_p = get_batch_ptrs([self.out2] * len(A1[0]))
        else:
            self.A1G_p = None
            self.A2G_p = None
            self.tmp_p = None
            self.tmp2_p = None
            self.xG_p = None
            self.out_p = None
            self.out2_p = None

            self.ones = [
                garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))
            ]
            self.ones = [one.fill(1) for one in self.ones]
            self.zeros = [
                garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))
            ]

            self.streams = []
            for s in range(A1[0].shape[0]):
                self.streams.append(cd.Stream())

        self.hdl = cb.cublasCreate()
Example 26
 def setUp(self):
     self.cublas_handle = cublas.cublasCreate()
Example 27
 def setUp(self):
     np.random.seed(23)    # For reproducible tests.
     self.cublas_handle = cublas.cublasCreate()
Example 28
def calc_Bs(N, A, l, l_s, l_si, r, r_s, r_si, C, K, Vsh):
    GA = []
    for An in A:
        if An is None:
            GA.append(None)
        else:
            GAn = []
            for Ans in An:
                GAn.append(garr.to_gpu(Ans))
            GA.append(GAn)
    GA.append(None)

    Gl = []
    Gl_s = []
    Gl_si = []
    for n in range(len(l)):
        if l[n] is None:
            Gl.append(None)
            Gl_s.append(None)
            Gl_si.append(None)
        else:
            Gl.append(garr.to_gpu(sp.asarray(
                l[n])))  #TODO: Support special types...
            Gl_s.append(garr.to_gpu(sp.asarray(l_s[n])))
            Gl_si.append(garr.to_gpu(sp.asarray(l_si[n])))
    Gl.append(None)
    Gl_s.append(None)
    Gl_si.append(None)

    Gr = []
    Gr_s = []
    Gr_si = []
    for n in range(len(r)):
        if r[n] is None:
            Gr.append(None)
            Gr_s.append(None)
            Gr_si.append(None)
        else:
            Gr.append(garr.to_gpu(sp.asarray(
                r[n])))  #TODO: Support special types...
            Gr_s.append(garr.to_gpu(sp.asarray(r_s[n])))
            Gr_si.append(garr.to_gpu(sp.asarray(r_si[n])))
    Gr.append(None)
    Gr_s.append(None)
    Gr_si.append(None)

    GK = []
    for n in range(len(K)):
        if K[n] is None:
            GK.append(None)
        else:
            GK.append(garr.to_gpu(sp.asarray(K[n])))
    GK.append(None)

    GVsh = []
    for n in range(len(Vsh)):
        if Vsh[n] is None:
            GVsh.append(None)
        else:
            GVshn = []
            for s in range(Vsh[n].shape[0]):
                GVshn.append(garr.to_gpu(Vsh[n][s]))
            GVsh.append(GVshn)

    GC = []
    for n in range(len(C)):
        if C[n] is None:
            GC.append(None)
        else:
            GCn = []
            for s in range(C[n].shape[0]):
                GCns = []
                for t in range(C[n].shape[1]):
                    GCns.append(garr.to_gpu(C[n][s, t]))
                GCn.append(GCns)
            GC.append(GCn)
    GC.append(None)

    GCts = []
    for n in range(len(GC)):
        if GC[n] is None:
            GCts.append(None)
        else:
            GCtsn = []
            for t in range(len(GC[n])):
                GCtsns = []
                for s in range(len(GC[n][0])):
                    GCtsns.append(GC[n][s][t])
                GCtsn.append(GCtsns)
            GCts.append(GCtsn)

    hdl = cb.cublasCreate()

    num_strms = 10

    curr_stream = cb.cublasGetStream(hdl)

    sites_per_strm = max((N) // num_strms, 1)
    #print "sites_per_stream = ", sites_per_strm

    strms = []
    for i in range(N // sites_per_strm):
        strms.append(cd.Stream())

    GB = [None]
    for n in range(1, N + 1):
        if (n - 1) % sites_per_strm == 0:
            #print n
            #print "strm = ", (n - 1) // sites_per_strm
            cb.cublasSetStream(hdl, strms[(n - 1) // sites_per_strm].handle)
        if Vsh[n] is not None:
            if n > 1:
                Glm2 = Gl[n - 2]
            else:
                Glm2 = None

            Gx = calc_x_G(GK[n + 1],
                          GC[n],
                          GCts[n - 1],
                          Gr[n + 1],
                          Glm2,
                          GA[n - 1],
                          GA[n],
                          GA[n + 1],
                          Gl_s[n - 1],
                          Gl_si[n - 1],
                          Gr_s[n],
                          Gr_si[n],
                          GVsh[n],
                          handle=hdl)
            GBn = []
            for s in range(A[n].shape[0]):
                GBns = cla.dot(Gl_si[n - 1], Gx, handle=hdl)
                GBns = cla.dot(GBns, GVsh[n][s], transb='C', handle=hdl)
                GBns = cla.dot(GBns, Gr_si[n], handle=hdl)
                GBn.append(GBns)
            GB.append(GBn)
        else:
            GB.append(None)

    cb.cublasSetStream(hdl, curr_stream)
    cb.cublasDestroy(hdl)

    B = [None]
    for n in range(1, N + 1):
        if GB[n] is None:
            B.append(None)
        else:
            Bn = sp.empty_like(A[n])
            for s in range(A[n].shape[0]):
                Bn[s] = GB[n][s].get()
            B.append(Bn)

    return B
Example 29
def calc_Bs(N, A, l, l_s, l_si, r, r_s, r_si, C, K, Vsh):
    GA = []
    for An in A:
        if An is None:
            GA.append(None)
        else:
            GAn = []
            for Ans in An:
                GAn.append(garr.to_gpu(Ans))
            GA.append(GAn)
    GA.append(None)
    
    Gl = []
    Gl_s = []
    Gl_si = []
    for n in range(len(l)):
        if l[n] is None:
            Gl.append(None)
            Gl_s.append(None)
            Gl_si.append(None)
        else:
            Gl.append(garr.to_gpu(sp.asarray(l[n]))) #TODO: Support special types...
            Gl_s.append(garr.to_gpu(sp.asarray(l_s[n])))
            Gl_si.append(garr.to_gpu(sp.asarray(l_si[n])))
    Gl.append(None)
    Gl_s.append(None)
    Gl_si.append(None)
        
    Gr = []
    Gr_s = []
    Gr_si = []
    for n in range(len(r)):
        if r[n] is None:
            Gr.append(None)
            Gr_s.append(None)
            Gr_si.append(None)
        else:
            Gr.append(garr.to_gpu(sp.asarray(r[n]))) #TODO: Support special types...
            Gr_s.append(garr.to_gpu(sp.asarray(r_s[n])))
            Gr_si.append(garr.to_gpu(sp.asarray(r_si[n])))
    Gr.append(None)
    Gr_s.append(None)
    Gr_si.append(None)

    GK = []
    for n in range(len(K)):
        if K[n] is None:
            GK.append(None)
        else:
            GK.append(garr.to_gpu(sp.asarray(K[n])))
    GK.append(None)
            
    GVsh = []
    for n in range(len(Vsh)):
        if Vsh[n] is None:
            GVsh.append(None)
        else:
            GVshn = []
            for s in range(Vsh[n].shape[0]):
                GVshn.append(garr.to_gpu(Vsh[n][s]))
            GVsh.append(GVshn)
    
    GC = []
    for n in range(len(C)):
        if C[n] is None:
            GC.append(None)
        else:
            GCn = []
            for s in range(C[n].shape[0]):
                GCns = []
                for t in range(C[n].shape[1]):
                    GCns.append(garr.to_gpu(C[n][s, t]))
                GCn.append(GCns)
            GC.append(GCn)
    GC.append(None)
    
    GCts = []
    for n in range(len(GC)):
        if GC[n] is None:
            GCts.append(None)
        else:
            GCtsn = []
            for t in range(len(GC[n])):
                GCtsns = []
                for s in range(len(GC[n][0])):
                    GCtsns.append(GC[n][s][t])
                GCtsn.append(GCtsns)
            GCts.append(GCtsn)
            
    hdl = cb.cublasCreate()
    
    num_strms = 10
    
    curr_stream = cb.cublasGetStream(hdl)
    
    sites_per_strm = max((N) // num_strms, 1)
    #print "sites_per_stream = ", sites_per_strm
    
    strms = []
    for i in range(N // sites_per_strm):
        strms.append(cd.Stream())
    
    GB = [None]
    for n in range(1, N + 1):
        if (n - 1) % sites_per_strm == 0:
            #print n
            #print "strm = ", (n - 1) // sites_per_strm
            cb.cublasSetStream(hdl, strms[(n - 1) // sites_per_strm].handle)
        if Vsh[n] is not None:
            if n > 1:
                Glm2 = Gl[n - 2]
            else:
                Glm2 = None
                
            Gx = calc_x_G(GK[n + 1], GC[n], GCts[n - 1], Gr[n + 1], Glm2, GA[n - 1], GA[n],
                          GA[n + 1], Gl_s[n - 1], Gl_si[n - 1], Gr_s[n], Gr_si[n], GVsh[n], handle=hdl)
            GBn = []
            for s in range(A[n].shape[0]):
                GBns = cla.dot(Gl_si[n - 1], Gx, handle=hdl) 
                GBns = cla.dot(GBns, GVsh[n][s], transb='C', handle=hdl)
                GBns = cla.dot(GBns, Gr_si[n], handle=hdl)
                GBn.append(GBns)
            GB.append(GBn)
        else:
            GB.append(None)
            
    cb.cublasSetStream(hdl, curr_stream)    
    cb.cublasDestroy(hdl)
    
    B = [None]
    for n in range(1, N + 1):
        if GB[n] is None:
            B.append(None)
        else:
            Bn = sp.empty_like(A[n])
            for s in range(A[n].shape[0]):
                Bn[s] = GB[n][s].get()
            B.append(Bn)
        
    return B
Example 30
def _get_cublas():
    return cublas.cublasCreate()
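
Because _get_cublas() returns a fresh handle on every call, the caller owns its lifetime; a minimal sketch of the expected pattern:

handle = _get_cublas()
try:
    pass  # ... cuBLAS calls that take `handle` go here
finally:
    cublas.cublasDestroy(handle)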
Example 31
        from .parallel import get_id_within_node
        gpuid = get_id_within_node()
        import pycuda.driver
        pycuda.driver.init()
        if gpuid >= pycuda.driver.Device.count():
            print '[' + MPI.Get_processor_name() + '] more processes than available GPUs!'
            #MPI.COMM_WORLD.Abort()
            raise
        gpu_device = pycuda.driver.Device(gpuid)
        gpu_context = gpu_device.make_context()
        gpu_initialized = True
    else:
        import pycuda.autoinit
        gpu_initialized = True
except:
    pass

try:
    from scikits.cuda import cublas
    import scikits.cuda.linalg as culinalg
    culinalg.init()
    cublas_handle = cublas.cublasCreate()
except:
    pass


def closeGPU():
    if gpu_context is not None:
        gpu_context.detach()
Example 32
from scikits.cuda import cublas as cubla
import libcudnn

cublas = cubla.cublasCreate()
cudnn = libcudnn.cudnnCreate()

print("CUDNN Version: %d" % libcudnn.cudnnGetVersion())
print("CUBLAS Version:", cubla.cublasGetVersion(cublas))