Example #1
 def fit(self,X,Budget=None,W=None):
     self.X = cuda.to_device(X.astype(np.float64,order='F'))
     self.Budget = cuda.device_array((self.budgetSize,self.X.shape[1]),dtype=np.float64,order='F')
     self.kx = cuda.device_array((self.budgetSize,self.X.shape[0]),dtype=np.float64,order='F')
     self.Wkx = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F')
     self.H = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F')
     if Budget is None:
         permutation = np.random.permutation(self.X.shape[0])
         self.permutation = cuda.to_device(permutation)
         initBudget(self.X,self.permutation,self.Budget)
     else:
         self.Budget = cuda.to_device(Budget.astype(np.float64,order='F'))
     self.calculateKB()
     self.calculateKX()
     if W is None:
         self.initW()
     else:
         self.W = cuda.to_device(W.astype(np.float64,order='F'))
     self.t = 0
     for i in xrange(self.epochs):
         print "Epoch " + str(i)
         samples,features = self.X.shape
         permutation = getPermutation(samples,self.miniBatchSize)
         self.permutation = cuda.to_device(permutation)
         for j in xrange((samples + self.miniBatchSize) / self.miniBatchSize):
             loadBatch(self.kx,self.permutation,j,self.kxi)
             self.nextW()
             self.t += 1
     self.predictH()
Example #2
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(n, dtype=np.uint32, stream=stream)

    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    griddim = gridsz, 1
    blockdim = blksz, 1, 1
    cu_monte_carlo_pricer[griddim, blockdim, stream](d_paths, dt, c0, c1,
                                                     d_normdist, d_seed)

    d_paths.to_host(stream)

    stream.synchronize()
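
All of the pricer examples precompute c0 = interest - 0.5 * volatility**2 and c1 = volatility * sqrt(dt). These are the constants of the exact log-Euler step for geometric Brownian motion, which the kernels (not shown in this listing) presumably apply once per path and time step:

    S_{t+\Delta t} = S_t \exp\!\left(\left(r - \tfrac{1}{2}\sigma^2\right)\Delta t + \sigma\sqrt{\Delta t}\, z\right), \qquad z \sim \mathcal{N}(0, 1)

so each update amounts to paths[i] = last[i] * exp(c0 * dt + c1 * z[i]).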
Example #3
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(n, dtype=np.uint32, stream=stream)

    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    griddim = gridsz, 1
    blockdim = blksz, 1, 1
    cu_monte_carlo_pricer[griddim, blockdim, stream](d_paths, dt, c0, c1,
                                                     d_normdist, d_seed)

    d_paths.to_host(stream)

    stream.synchronize()
Example #4
 def __init__(self,budgetSize,latentTopics,miniBatchSize,epochs,Gamma,Lambda,Alpha,metric='rbf',sigma=1.0):
     """
     OKMF class: OKMF is a method to perform a matrix factorization in a 
     feature space.
     
     Parameters
     ----------
     
     budgetSize : int
         Budget size.
     latentTopics: int
         Latent topics.
     miniBatchSize : int
         Size of minibatch.
     epochs : int
         Number of epochs.
     Gamma : float
         Gamma parameter
     Lambda : float
         Lambda parameter
     Alpha: float
         Alpha parameter
     metric : string
         Type of kernel. Default rbf
     sigma : float
         RBF kernel sigma parameter. Default 1.0.
     """
     self.budgetSize = budgetSize
     self.latentTopics = latentTopics
     self.miniBatchSize = miniBatchSize
     self.epochs = epochs
     self.Gamma = Gamma
     self.Lambda = Lambda
     self.Alpha = Alpha
     self.metric = metric
     self.sigma = sigma
     self.W = None
     self.h = cuda.device_array((latentTopics,miniBatchSize),dtype=np.float64,order='F')
     self.KB = cuda.device_array((budgetSize,budgetSize),dtype=np.float64,order='F')
     self.kxi = cuda.device_array((budgetSize,miniBatchSize),dtype=np.float64,order='F')
     self.Blas = cublas.Blas()
     self.X = None
     self.Budget = None
     self.permutation = None
     self.kx = None
     self.Wkx = None
     self.H = None
     self.KBW = cuda.device_array((budgetSize,latentTopics),dtype=np.float64,order='F')
     self.KBWh = cuda.device_array((budgetSize,miniBatchSize),dtype=np.float64,order='F')
     self.KBWhh = cuda.device_array((budgetSize,latentTopics),dtype=np.float64,order='F')
     self.grad = cuda.device_array((budgetSize,latentTopics),dtype=np.float64,order='F')
     self.kxih = cuda.device_array((budgetSize,latentTopics),dtype=np.float64,order='F')
     self.WKBW = cuda.device_array((latentTopics,latentTopics),dtype=np.float64,order='F')
     self.Wkxi = cuda.device_array((latentTopics,miniBatchSize),dtype=np.float64,order='F')
     eyeAlpha = np.eye(latentTopics) * Alpha
     self.eyeAlpha = cuda.to_device(eyeAlpha.astype(np.float64,order='F'))
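
For context, a minimal usage sketch of the class above (the parameter values are illustrative, not from the original repo):

import numpy as np

# Hypothetical instantiation; X is any (samples, features) float array.
okmf = OKMF(budgetSize=100, latentTopics=10, miniBatchSize=50, epochs=5,
            Gamma=0.01, Lambda=0.01, Alpha=0.1, metric='rbf', sigma=1.0)
X = np.random.rand(1000, 20)
okmf.fit(X)  # learns the factorization on the GPU; see fit() in Example #1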
Example #5
    def T(self, a, out=None):
        """Returns the transpose of a 2D array.

        Parameters
        ----------
        a : array-like
            Numpy or DeviceNDArray to transpose.
        out : DeviceNDArray (optional)
            Array to overwrite with result.
        """

        a, out_dtype = _check_array(a)
            
        if type(out) == cuda.cudadrv.devicearray.DeviceNDArray:
            pass
        elif out is None:
            pass
        else:
            raise NotImplementedError

        a_dim = a.shape
        if a.ndim == 2:
            if out is None:
                out = cuda.device_array((a_dim[1],a_dim[0]),dtype=out_dtype,order='F')
            elif out.shape[0] == a_dim[1] and out.shape[1] == a_dim[0]:
                pass
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError

        self.blas.geam('T','T',a_dim[1],a_dim[0],1.,a,0.,a,out)

        return out
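
A usage sketch for the wrapper above, assuming ops is an instance of the (not shown) linear-algebra helper class that owns self.blas:

import numpy as np

a = np.asfortranarray(np.arange(6, dtype=np.float64).reshape(2, 3))
d_at = ops.T(a)              # ops is hypothetical; returns a (3, 2) device array
print(d_at.copy_to_host())   # equals a.T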
Example #6
    def tanh(self, a, out=None):
        """Tanh of input.

        Parameters
        ----------
        a : array-like
            Array to apply tanh to.
        out : DeviceNDArray (optional)
            Array to overwrite with the result.
        """

        a, out_dtype = _check_array(a)
        a_dim = a.shape

        if type(out) == cuda.cudadrv.devicearray.DeviceNDArray:
            pass
        elif out is None:
            pass
        else:
            raise NotImplementedError
        if out is None:
            out = cuda.device_array(shape=a_dim, dtype=out_dtype, order='F')
        elif out.shape == a_dim:
            pass
        else:
            raise ValueError('matrices are not aligned')

        if a.ndim == 2:
            griddim2 = (int(ceil(a_dim[0] / float(self.blockdim2[0]))),
                        int(ceil(a_dim[1] / float(self.blockdim2[1]))))
            tanh_m[griddim2, self.blockdim2, self.stream](a, out)
        elif a.ndim == 1:
            griddim = int(ceil(a_dim[0] / float(self.blockdim)))
            tanh_v[griddim, self.blockdim, self.stream](a, out)
        else:
            raise NotImplementedError

        return out
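
The tanh_v / tanh_m kernels are not included in this listing; a minimal sketch of what they might look like with numba's cuda.jit (assumed, not the original implementation):

import math
from numba import cuda

@cuda.jit
def tanh_v(a, out):
    i = cuda.grid(1)                 # one thread per vector element
    if i < a.shape[0]:
        out[i] = math.tanh(a[i])

@cuda.jit
def tanh_m(a, out):
    i, j = cuda.grid(2)              # one thread per matrix entry
    if i < a.shape[0] and j < a.shape[1]:
        out[i, j] = math.tanh(a[i, j])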
Example #7
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Example #8
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    step_cfg = step[gridsz, blksz, stream]
    
    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last, stream=stream)
        d_last = d_paths

    stream.synchronize()
Example #9
 def __init__(self, shape, dtype, prealloc):
     self.device = cuda.get_current_device()
     self.freelist = deque()
     self.events = {}
     for i in range(prealloc):
         gpumem = cuda.device_array(shape=shape, dtype=dtype)
         self.freelist.append(gpumem)
         self.events[gpumem] = cuda.event(timing=False)
Example #10
 def __init__(self, shape, dtype, prealloc):
     self.device = cuda.get_current_device()
     self.freelist = deque()
     self.events = {}
     for i in range(prealloc):
         gpumem = cuda.device_array(shape=shape, dtype=dtype)
         self.freelist.append(gpumem)
         self.events[gpumem] = cuda.event(timing=False)
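
Examples #7, #8 and #12 call mm.get() and mm.free(...) on this pool, but those methods are not shown. A sketch of how they might work, recycling a buffer only after the stream that last used it has passed the recorded event (assumed implementation):

 def get(self, stream=0):
     # Hand out a preallocated buffer; make the requesting stream wait
     # until the buffer's last recorded use (if any) has completed.
     gpumem = self.freelist.popleft()
     self.events[gpumem].wait(stream=stream)
     return gpumem

 def free(self, gpumem, stream=0):
     # Record the buffer's last use on this stream, then return it to the pool.
     self.events[gpumem].record(stream=stream)
     self.freelist.append(gpumem)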
Example #11
def preScan(out_d, in_d, in_size):

    threads_per_block = (BLOCK_SIZE, 1)
    nBlocks = int(ceil(in_size / (2 * 1.0 * BLOCK_SIZE)))
    number_of_blocks = (nBlocks, 1)

    aux_d = cuda.device_array(nBlocks, dtype=np.uint32)
    aux_od = cuda.device_array(nBlocks, dtype=np.uint32)

    exclusiveScanGPU[number_of_blocks, threads_per_block](aux_d, out_d, in_d, in_size)

    if nBlocks > 1:
        preScan(aux_od, aux_d, nBlocks)
    else:
        aux_od = aux_d

    exclusiveCombineGPU[number_of_blocks, threads_per_block](out_d, aux_od, in_size)
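
A hypothetical driver for the recursive scan above (it assumes BLOCK_SIZE and the exclusiveScanGPU / exclusiveCombineGPU kernels are defined as in the original repo):

import numpy as np
from numba import cuda

in_h = np.arange(8, dtype=np.uint32)          # [0 1 2 ... 7]
in_d = cuda.to_device(in_h)
out_d = cuda.device_array(in_h.size, dtype=np.uint32)
preScan(out_d, in_d, in_h.size)
print(out_d.copy_to_host())                   # exclusive prefix sum: [0 0 1 3 6 10 15 21]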
Example #12
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
               for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, d_last, strm, (s, e) in zip(d_pathslist, d_lastlist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #13
    def mean(self, a, out=None, axis=None):
        """Average array elements.

        Parameters
        ----------
        a : array-like
           Array to average.
        out : array-like
            Result will be stored in this array.
        axis : int
            1 or 0 for 2D arrays.
        """
        a, out_dtype = _check_array(a)

        a_dim = a.shape
        
        if a.ndim == 2:
            if axis is None:
                a_strides = a.strides
                d_flat_a = _cu_reshape(a, (np.prod(a_dim),), (a_strides[0],), out_dtype)
                # NB: asum sums absolute values, so this is a true mean only
                # for arrays with non-negative entries.
                out = self.blas.asum(d_flat_a)/float(np.prod(a_dim))
            elif axis == 0:
                if out is None:
                    out = cuda.device_array(a_dim[1], dtype=out_dtype, order='F')
                elif out.shape[0] == a_dim[1]:
                    pass
                else:
                    raise ValueError('matrices are not aligned')
                griddim = int(ceil(a_dim[1] / float(self.blockdim)))
                mean_0[griddim, self.blockdim, self.stream](a, float(a_dim[0]), out)
            elif axis == 1:
                if out is None:
                    out = cuda.device_array(a_dim[0], dtype=out_dtype, order='F')
                elif out.shape[0] == a_dim[0]:
                    pass
                else:
                    raise ValueError('matrices are not aligned')
                griddim = int(ceil(a_dim[0] / float(self.blockdim)))
                mean_1[griddim, self.blockdim, self.stream](a, float(a_dim[1]), out)
        elif a.ndim == 1:
            out = self.blas.asum(a)/float(np.prod(a_dim))
        else:
            raise NotImplementedError
        return out
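
The mean_0 and mean_1 kernels are not included in the listing; a plausible one-thread-per-output-element sketch (assumed, not the original code):

from numba import cuda

@cuda.jit
def mean_0(a, n, out):
    j = cuda.grid(1)                 # column means: average over rows
    if j < a.shape[1]:
        s = 0.0
        for i in range(a.shape[0]):
            s += a[i, j]
        out[j] = s / n

@cuda.jit
def mean_1(a, n, out):
    i = cuda.grid(1)                 # row means: average over columns
    if i < a.shape[0]:
        s = 0.0
        for j in range(a.shape[1]):
            s += a[i, j]
        out[i] = s / n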
Example #14
def mtranspose(a):
    blockDim = (min(32,a.shape[0]),min(32,a.shape[1]))
    gridDim = ((a.shape[0] + blockDim[0] - 1) // blockDim[0],
               (a.shape[1] + blockDim[1] - 1) // blockDim[1])

    b = cuda.device_array((a.shape[1],a.shape[0]),dtype='float32')

    d_mtranspose[gridDim,blockDim](a,b)

    return b
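
The d_mtranspose kernel is not shown; a minimal sketch consistent with the launch above (assumed):

from numba import cuda

@cuda.jit
def d_mtranspose(a, b):
    i, j = cuda.grid(2)              # the grid covers a's dimensions
    if i < a.shape[0] and j < a.shape[1]:
        b[j, i] = a[i, j]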
Example #15
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation 
    of forward-euler integration.
    
    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.
    
    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)` 
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """
    
    calc_precision = None
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")    
    
    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda, float32  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " + 
                        "installed.\nCan not use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)
    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # Use the selected precision for the scalar constants as well
        # (originally hard-coded float32 even for 64-bit runs).
        cubl.gemv(trans='T', m=m, n=n, alpha=calc_precision(1.0), A=cu_int_m,
            x=cu_curr_phi, beta=calc_precision(0.0), y=cu_delta_phi)
        cubl.gemv(trans='T', m=m, n=n, alpha=calc_precision(rho_inv[step]),
            A=cu_dec_m, x=cu_curr_phi, beta=calc_precision(1.0), y=cu_delta_phi)
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
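
The two gemv calls accumulate the derivative and the axpy applies the Euler step, so each iteration computes, in the docstring's notation:

    \Phi(X_{i+1}) = \Phi(X_i) + \Delta X_i \left( M_{\mathrm{int}} + \frac{1}{\rho(X_i)}\, M_{\mathrm{dec}} \right)^{T} \Phi(X_i)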
Example #16
def montecarlo_datamgmt(paths, dt, interest, volatility):
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * np.sqrt(dt)

    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
    d_noises = cuda.device_array(paths.shape[0])

    d_curLast = cuda.to_device(paths[:,0]) # Copy first set of stock prices to the GPU
    d_curNext = cuda.device_array(paths.shape[0]) # Create an empty array to hold the next set of calculated prices

    for j in xrange(1, paths.shape[1]):   # for each time step
        # Generate gaussian noises for simulation
        prng.normal(d_noises, 0., 1.)
        # Call the GPU-accelerated step function to calculate the next set of prices
        d_curNext = step(d_curLast, dt, c0, c1, d_noises)
        # Copy calculated prices to host
        d_curNext.copy_to_host(paths[:,j])
        # Swap the buffers so the prices we just computed become the "last"
        # prices for the next time step.
        d_curNext, d_curLast = d_curLast, d_curNext
Example #17
def most_similar(a):
	assert a.shape[1] == d_vectors.shape[1], "Size Mismatch: (%i,%i), (%i,%i)" %(a.shape[0],a.shape[1],d_vectors.shape[0],d_vectors.shape[1])
	blockDim = (1024)
	gridDim = (((d_vectors.shape[0] + blockDim) - 1) / blockDim)

	val = cuda.device_array((1,d_vectors.shape[0]),dtype='float32')

	d_distances[gridDim,blockDim](a,d_vectors,val)

	_,idx = margmin(val)

	return inv_vocab[idx]
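
The d_distances kernel and the margmin helper are not shown; a sketch of a squared-Euclidean-distance kernel consistent with the launch above (assumed):

from numba import cuda

@cuda.jit
def d_distances(a, vectors, out):
    i = cuda.grid(1)                 # one thread per vocabulary vector
    if i < vectors.shape[0]:
        s = 0.0
        for k in range(vectors.shape[1]):
            diff = a[0, k] - vectors[i, k]
            s += diff * diff
        out[0, i] = s                # margmin then picks the smallest entry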
Example #18

def test_sort():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)  #4, 7, 2, 6, 3, 5, 1, 0
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)
    temp_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    tkg1 = time()

    for bit_shift in range(0, 32):
        tk1 = time()
        #radix_sort(in_d, out_d, temp_d, in_h[NUM_ELEMENTS - 1], bit_shift)
        preScan(out_d, in_d, NUM_ELEMENTS)
        tk2 = time()
        #print bit_shift, tk2 - tk1
        in_d, out_d, temp_d = out_d, temp_d, in_d

    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()

    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    #
    # print line

    in_cpu = [NUM_ELEMENTS - i - 1 for i in range(0, NUM_ELEMENTS)]
    tc1 = time()
    in_cpu.sort()
    tc2 = time()

    print "GPU Time = ", tkg2 - tkg1
    print "CPU Time = ", tc2 - tc1
Example #19

def test_sort():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)  #4, 7, 2, 6, 3, 5, 1, 0
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)
    temp_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    tkg1 = time()

    for bit_shift in range(0, 32):
        tk1 = time()
        #radix_sort(in_d, out_d, temp_d, in_h[NUM_ELEMENTS - 1], bit_shift)
        preScan(out_d, in_d, NUM_ELEMENTS)
        tk2 = time()
        #print bit_shift, tk2 - tk1
        in_d, out_d, temp_d = out_d, temp_d, in_d

    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()

    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    #
    # print line

    in_cpu = [NUM_ELEMENTS - i -1 for i in range(0, NUM_ELEMENTS)]
    tc1 = time()
    in_cpu.sort()
    tc2 = time()

    print "GPU Time = ", tkg2 - tkg1
    print "CPU Time = ", tc2 - tc1
Example #20
    def flush(self, metric_opt, supp_opt):
        if not self.Vcs:
            # Nothing to do
            return metric_opt, supp_opt

        k = self.k
        V = self.V

        topk_list = []

        nodect = V.shape[0]
        numseg = len(self.Vcs)
        assert nodect
        assert numseg
        eachsize = nodect * numseg
        D = np.zeros(eachsize, dtype=np.float32)

        # Fill buffer for segmented sort
        for i, Vc in enumerate(self.Vcs):
            D[i * nodect:(i + 1) * nodect] = Vc[:, 0]

        # Prepare for GPU segmented sort
        dD = cuda.to_device(D)
        dI = cuda.device_array((numseg, nodect), dtype=np.uint32)

        blksz = 32
        init_indices[(divup(dI.shape[0], blksz),
                      divup(dI.shape[1], blksz)),
                     (blksz, blksz)](dI)

        if numseg == 1:
            segments = np.arange(1, dtype=np.int32)
        else:
            segments = (np.arange(numseg - 1, dtype=np.int32) + 1) * nodect

        segmented_sort(dD, dI, cuda.to_device(segments))

        for i in range(numseg):
            topk = dI[i, -k:].copy_to_host()
            topk_list.append(topk)

        # Reduce
        for topk in topk_list:
            # Assume A is huge
            metric = np.linalg.norm(V[topk, :]) ** 2
            if metric > metric_opt:
                metric_opt = metric
                supp_opt = topk

        # Clear all Vc
        self.Vcs.clear()
        return metric_opt, supp_opt
Example #21
    def diag(self, a, out=None):
        """Creates vector from diagonal of matrix or
        matrix with diagonal from vector.

        Parameters
        ----------
        a : array-like
            Vector or array from which to take diagonal.
        out : array-like, optional
            Output array.
        """
        a, out_dtype = _check_array(a)

        a_dim = a.shape

        if a.ndim == 2:
            if out is None:
                out = cuda.device_array(shape=a_dim[0], dtype=out_dtype, order='F')
            elif out.shape[0] == a_dim[0] and out.ndim == 1:
                pass
            else:
                raise ValueError('matrices are not aligned')
            griddim = int(ceil(a_dim[0] / float(self.blockdim)))
            diag2v[griddim, self.blockdim, self.stream](a, out)

        elif a.ndim == 1:
            if out is None:
                out = cuda.device_array(shape=(a_dim[0],a_dim[0]), dtype=out_dtype, order='F')
            elif out.shape == (a_dim[0], a_dim[0]):
                pass
            else:
                raise ValueError('matrices are not aligned')
            griddim2 = (int(ceil(a_dim[0] / float(self.blockdim2[0]))),
                        int(ceil(a_dim[0] / float(self.blockdim2[1]))))
            diag2m[griddim2, self.blockdim2, self.stream](a, out)
        else:
            raise NotImplementedError
        
        return out
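
The diag2v and diag2m kernels are not part of the listing; plausible one-thread-per-element sketches (assumed):

from numba import cuda

@cuda.jit
def diag2v(a, out):
    i = cuda.grid(1)                 # extract the diagonal of a matrix
    if i < out.shape[0]:
        out[i] = a[i, i]

@cuda.jit
def diag2m(a, out):
    i, j = cuda.grid(2)              # build a diagonal matrix from a vector
    if i < out.shape[0] and j < out.shape[1]:
        out[i, j] = a[i] if i == j else 0.0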
Example #22

def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
Example #23

def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
Example #24

def mc_cuda(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # instantiate a CUDA stream for queueing async CUDA cmds
    stream = cuda.stream()
    # instantiate a cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # configure the kernel
    # similar to CUDA-C: step_cuda<<<gridsz, blksz, 0, stream>>>
    step_cfg = step_cuda[gridsz, blksz, stream]
    
    # transfer the initial prices
    d_last = cuda.to_device(paths[:, 0], stream=stream)
    for j in range(1, paths.shape[1]):
        # call cuRAND to populate d_normdist with gaussian noises
        prng.normal(d_normdist, mean=0, sigma=1)
        # setup memory for new prices
        # device_array_like is like empty_like for GPU
        d_paths = cuda.device_array_like(paths[:, j], stream=stream)
        # invoke step kernel asynchronously
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        # transfer memory back to the host
        d_paths.copy_to_host(paths[:, j], stream=stream)
        d_last = d_paths
    # wait for all GPU work to complete
    stream.synchronize()
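
The step_cuda kernel configured above is not shown; a sketch applying the standard log-Euler GBM update once per path (assumed implementation):

import math
from numba import cuda

@cuda.jit
def step_cuda(last, paths, dt, c0, c1, normdist):
    i = cuda.grid(1)                 # one thread per simulated path
    if i < paths.shape[0]:
        paths[i] = last[i] * math.exp(c0 * dt + c1 * normdist[i])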
Example #25
    def const(self, shape, value, out=None):
        if type(shape) != tuple:
            shape = (shape,)
        assert len(shape) > 0
        if len(shape) > 2:
            raise NotImplementedError

        if out is None:
            out = cuda.device_array(shape=shape, dtype=np.float32, order='F')
        if out.shape != shape:
            raise ValueError('matrices are not aligned')

        out_dim = out.shape

        if out.ndim == 2:
            griddim2 = (int(ceil(out_dim[0] / float(self.blockdim2[0]))),
                        int(ceil(out_dim[1] / float(self.blockdim2[1]))))
            const_m[griddim2, self.blockdim2, self.stream](out, value)
        elif out.ndim == 1:
            griddim = int(ceil(out_dim[0] / float(self.blockdim)))
            const_v[griddim, self.blockdim, self.stream](out, value)
        else:
            raise NotImplementedError

        return out
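
The const_v and const_m fill kernels are not shown; minimal sketches (assumed):

from numba import cuda

@cuda.jit
def const_v(out, value):
    i = cuda.grid(1)
    if i < out.shape[0]:
        out[i] = value

@cuda.jit
def const_m(out, value):
    i, j = cuda.grid(2)
    if i < out.shape[0] and j < out.shape[1]:
        out[i, j] = value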
Example #26
    bitmap[x, y, 1] = int(g * 255.)
    bitmap[x, y, 2] = int(b * 255.)
    bitmap[x, y, 3] = 255


if __name__ == "__main__":

    start = timer()

    # Create a container for the pixel RGBA information of our image
    bitmap = np.zeros([DIM, DIM, 4], dtype=np.int16)

    # Copy to device memory
    d_bitmap = cuda.to_device(bitmap)
    # Create empty container for our Sphere data on device
    d_spheres = cuda.device_array(SPHERES, dtype=Sphere_t)

    # Create an empty container of spheres on host, and populate it
    # with some random data.
    temp_spheres = np.empty(SPHERES, dtype=Sphere_t)
    for i in xrange(SPHERES):
        temp_spheres[i]['r'] = rnd(1.0)
        temp_spheres[i]['g'] = rnd(1.0)
        temp_spheres[i]['b'] = rnd(1.0)
        temp_spheres[i]['x'] = rnd(DIM) - DIM / 2
        temp_spheres[i]['y'] = rnd(DIM) - DIM / 2
        temp_spheres[i]['z'] = rnd(DIM) - DIM / 2
        temp_spheres[i]['radius'] = rnd(100.0) + 20

        if VERBOSE:
            sph = temp_spheres[i]
Example #27
 def _update(d):
     
     stream1 = cuda.stream()
     stream2 = cuda.stream()
     stream3 = cuda.stream()
     stream4 = cuda.stream()
     
     step = d['step']
     
     #print "Step: {}".format(step)
     
     """Calculate the pressure gradient. Two steps are needed for this."""
     # Calculate FFT of pressure.
     fft(d['field']['p'], d['temp']['fft_p'], stream=stream1)    
     
     stream1.synchronize()
     #print "FFT pressure: {}".format(d['temp']['fft_p'].copy_to_host())
     
     #pressure_exponent_x = exp(pressure_gradient_exponent(d['k_x'], d['spacing'], stream=stream1), stream=stream1) # This is a constant!!
     #pressure_exponent_y = exp(pressure_gradient_exponent(d['k_y'], d['spacing'], stream=stream2), stream=stream2) # This is a constant!!
     
             
     #print(d['spacing'].shape)
     #print(d['k_x'].shape)
     
     # The gradient exponents are constants for a given grid; compute them once.
     ex = pressure_gradient_exponent(d['k_x'], d['spacing'])  #, stream=stream1)
     ey = pressure_gradient_exponent(d['k_y'], d['spacing'])  #, stream=stream2)
     
     pressure_exponent_x = exp(ex, stream=stream1) # This is a constant!!
     pressure_exponent_y = exp(ey, stream=stream2) # This is a constant!!
     
     
     stream1.synchronize()
     stream2.synchronize()
     
     #print ( to_gradient(d['temp']['fft_p'], d['k_x'], d['kappa'], pressure_exponent_x) ).copy_to_host()
     
     """Calculate the velocity gradient."""
     ifft(to_gradient(d['temp']['fft_p'], d['k_x'], d['kappa'], pressure_exponent_x, stream=stream1), d['temp']['d_p_d_x'], stream=stream1)
     ifft(to_gradient(d['temp']['fft_p'], d['k_y'], d['kappa'], pressure_exponent_y, stream=stream2), d['temp']['d_p_d_y'], stream=stream2) 
     
     #print "Pressure gradient x: {}".format( d['temp']['d_p_d_x'].copy_to_host() )
     #print "Pressure gradient y: {}".format( d['temp']['d_p_d_y'].copy_to_host() )
     
     """Calculate the velocity."""
     d['field']['v_x'] = velocity_with_pml(d['field']['v_x'], d['temp']['d_p_d_x'], d['timestep'], d['density'], d['abs_exp']['x'], d['source']['v']['x'][step], stream=stream1)
     d['field']['v_y'] = velocity_with_pml(d['field']['v_y'], d['temp']['d_p_d_y'], d['timestep'], d['density'], d['abs_exp']['y'], d['source']['v']['y'][step], stream=stream2)
 
 
     stream1.synchronize()
     stream2.synchronize()
     
     """Fourier transform of the velocity."""
     fft(d['field']['v_x'], d['temp']['fft_v_x'], stream=stream1)
     fft(d['field']['v_y'], d['temp']['fft_v_y'], stream=stream2)
     
     stream1.synchronize()
     stream2.synchronize()
     
     
     #print d['temp']['fft_v_y'].copy_to_host()
     #print "Velocity x: {}".format(d['field']['v_x'].copy_to_host())
     #print "Velocity y: {}".format(d['field']['v_y'].copy_to_host())
     
     #print "Source: {}".format(d['source']['p'][step].copy_to_host())
     
     #print "Source: {}".format(d['source']['p'])
     
     
     #print "Velocity exponent y: {}".format(velocity_exponent_y.copy_to_host())
     
     stream1.synchronize()
     stream2.synchronize()
     
     #stream3.synchronize()
     #stream4.synchronize()
     
     velocity_exponent_x = exp(velocity_gradient_exponent(d['k_x'], d['spacing'], stream=stream1), stream=stream1) # This is a constant!!
     velocity_exponent_y = exp(velocity_gradient_exponent(d['k_y'], d['spacing'], stream=stream2), stream=stream2) # This is a constant!!
     
     
     ifft(to_gradient(d['temp']['fft_v_x'], d['k_x'], d['kappa'], velocity_exponent_x, stream=stream1), d['temp']['d_v_d_x'], stream=stream1)
     ifft(to_gradient(d['temp']['fft_v_y'], d['k_y'], d['kappa'], velocity_exponent_y, stream=stream2), d['temp']['d_v_d_y'], stream=stream2)
 
     """And finally the pressure."""
     
     #print len([ d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step] ])
     #pressure_with_pml(  d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step]  )
     #for i in [ d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step] ]:
         #print i , i.shape
         #print i.copy_to_host()
         #try:
             #print i.dtype
         #except AttributeError:
             #print 'None'
     
     stream1.synchronize()
     stream2.synchronize()
     
     #print "Velocity gradient x: {}".format(d['temp']['d_v_d_x'].copy_to_host())
     #print "Velocity gradient y: {}".format(d['temp']['d_v_d_y'].copy_to_host())
     
     #print "Pressure x previous: {}".format(d['temp']['p_x'].copy_to_host())
     #print "Pressure y previous: {}".format(d['temp']['p_y'].copy_to_host())
 
     #print "Abs exp x: {}".format( d['abs_exp']['x'].copy_to_host())
     #print "Abs exp y: {}".format( d['abs_exp']['y'].copy_to_host())
     
     d['temp']['p_x'] = pressure_with_pml(d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step], stream=stream1)
     d['temp']['p_y'] = pressure_with_pml(d['temp']['p_y'], d['temp']['d_v_d_y'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['y'], d['source']['p'][step], stream=stream2)
 
     stream1.synchronize()
     stream2.synchronize()
     
     #try:
         #print "Source p: {}".format(d['source']['p'][step].copy_to_host())
     #except AttributeError:
         #print "Source p: {}".format(d['source']['p'][step])
         
     #print "Pressure x: {}".format(d['temp']['p_x'].copy_to_host())
     #print "Pressure y: {}".format(d['temp']['p_y'].copy_to_host())
 
     d['field']['p'] = add(d['temp']['p_x'], d['temp']['p_y'], stream=stream3)
     
     #stream3.synchronize()
     #print "Pressure total: {}".format(d['field']['p'].copy_to_host())
     
     
     stream1.synchronize()
     stream2.synchronize()
     stream3.synchronize()
     
     return d
Example #28
def test_apriori():

    output_file = open("apriori_out.txt", "w")

    offsets, transactions, num_transactions, num_elements = readFile(
        "syncthetic_data.txt")
    print "Offset = ", offsets[:num_transactions]
    print "transactions = ", transactions[:num_elements]
    print "Num transactions = ", num_transactions
    print "Num elements = ", num_elements
    min_support = MIN_SUPPORT

    # number of decimal digits needed to represent MAX_UNIQUE_ITEMS

    power = 1
    while MAX_UNIQUE_ITEMS / (10**power) != 0:
        power += 1

    print "Power = ", power

    t = transactions.tolist()

    if num_elements > NUM_ELEMENTS:
        print "Error: Elements exceeding NUM_ELEMENTS. Exiting..."
        sys.exit(12)

    input_h = np.array(t, dtype=np.int32)
    print "Input transactions = ", list(input_h)
    print "Size of transactions = ", input_h.size
    ci_h = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.int32)
    li_h = np.empty(MAX_UNIQUE_ITEMS, dtype=np.int32)

    input_d = cuda.to_device(input_h)
    ci_d = cuda.to_device(ci_h)
    li_d = cuda.device_array(MAX_UNIQUE_ITEMS, dtype=np.int32)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))),
                        1)  #((NUM_ELEMENTS / threads_per_block[0]) + 1, 1)

    histogramGPU[number_of_blocks, threads_per_block](input_d, ci_d,
                                                      num_elements)
    #cuda.synchronize()

    ci_d.copy_to_host(ci_h)
    print "Ci_H Histogram result = ", ci_h  # support count for each item

    number_of_blocks = (int(
        ceil(MAX_UNIQUE_ITEMS / (1.0 * threads_per_block[0]))), 1)
    pruneGPU[number_of_blocks, threads_per_block](ci_d, MAX_UNIQUE_ITEMS,
                                                  min_support)
    cuda.synchronize()

    ci_d.copy_to_host(ci_h)
    print "Keys = ", [i for i in range(0, len(ci_h))]
    print "Ci_H Pruning result = ", ci_h  # support count for each item

    # calculate concise list of items satisfying min support
    l1_patterns = {}

    k = 0  # number of items whose sup_count > min_support
    for j in range(0, len(ci_h)):
        if ci_h[j] != 0:
            li_h[k] = j
            l1_patterns[(j, )] = ci_h[j]
            k += 1

    print "\n=======================================================\n"
    print "L1 = ", list(li_h)[:k]  #items whose support_count > min_support
    print "\n=======================================================\n"

    output_file.write(createFormattedPatterns(l1_patterns, 1))

    print "K(num_items_with_good_sup_count = ", k

    #k = 102
    ci_h = np.array([-1 for i in range(0, k**2)], dtype=np.int32)
    ci_d = cuda.to_device(ci_h)

    #li_h = np.array(sorted([randint(10, 99) for i in range(0, k)]), dtype=np.int32)
    #tli_h = np.array([i for i in range(1, k + 1)], dtype=np.int32)

    t1 = time()
    li_d = cuda.to_device(li_h)
    number_of_blocks = (int(ceil(k / (1.0 * MAX_ITEM_PER_SM))), 1)
    print "Self join 2 number of blocks = ", number_of_blocks
    print "K = ", k
    print "Ci_H size = ", ci_h.size
    print "LI_H size = ", li_h.size
    selfJoinGPU[number_of_blocks, threads_per_block](li_d, ci_d, k, power)
    cuda.synchronize()
    li_d.copy_to_host(li_h)
    ci_d.copy_to_host(ci_h)
    t2 = time()

    #sys.exit(0)
    # f = open('join.txt', 'w')
    #
    # for i in range(0, k):
    #     line = ""
    #     for j in range(0, k):
    #         line += str(ci_h[k * i + j]) + " "
    #     f.write(line + "\n")
    #
    # f.close()
    #ci_h = ci_h.reshape(k, k)

    print "Initial Mask = ", ci_h.reshape(k, k)

    print "Self joining time = ", (t2 - t1)

    d_offsets = cuda.to_device(offsets)
    d_transactions = cuda.to_device(transactions)

    #number_of_blocks = (1, 1) #(int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)
    number_of_blocks = (int(
        ceil(num_transactions / (1.0 * MAX_TRANSACTIONS_PER_SM))), 1)
    print "Num blocks for findFrequency = ", number_of_blocks

    print "Num transactions = ", num_transactions
    print "Num patterns = ", k
    print "index = ", list(li_h)[:k]
    findFrequencyGPU[number_of_blocks,
                     threads_per_block](d_transactions, d_offsets,
                                        num_transactions, num_elements, li_d,
                                        ci_d, k)
    cuda.synchronize()
    ci_d.copy_to_host(ci_h)
    print "Final Mask = ", ci_h.reshape(k, k)
    d_transactions.copy_to_host(transactions)

    threads_per_block = (BLOCK_SIZE, BLOCK_SIZE)
    number_of_blocks = ((int(ceil(k / (1.0 * threads_per_block[0])))),
                        (int(ceil(k / (1.0 * threads_per_block[0])))))

    pruneMultipleGPU[number_of_blocks, threads_per_block](
        ci_d, k, min_support)  # prunes according to min_support

    ci_d.copy_to_host(ci_h)
    print "Outer Mask = ", ci_h.reshape(k, k)

    ci_hn = np.zeros(k, dtype=np.int32)
    ci_dn = cuda.to_device(ci_hn)

    combinationsAvailable[number_of_blocks, threads_per_block](
        ci_d, ci_dn, k)  # number of possible patterns in each row

    ci_dn.copy_to_host(ci_hn)

    print "Ci_hn = ", list(ci_hn)

    ci_hnx = np.empty(k, dtype=np.int32)
    ci_dnx = cuda.to_device(ci_hnx)

    preScan(ci_dnx, ci_dn, k)  # Prefix sum on patterns in each row

    ci_dnx.copy_to_host(ci_hnx)
    num_patterns = ci_hnx[-1]
    print "Ci_hnx = ", list(ci_hnx)

    sparseM_h = np.empty(ci_hnx[-1] * 3, dtype=np.uint32)
    sparseM_d = cuda.to_device(sparseM_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(k / (1.0 * threads_per_block[0]))), 1)

    convert2Sparse[number_of_blocks,
                   threads_per_block](ci_d, ci_dnx, sparseM_d, num_patterns, k)

    sparseM_d.copy_to_host(sparseM_h)

    # sparseM_h = sparseM_h.reshape(3, num_patterns)
    print sparseM_h.reshape(3, num_patterns)

    patterns = {}
    for i in range(0, num_patterns):
        item1 = sparseM_h[i]
        item2 = sparseM_h[i + num_patterns]
        support = sparseM_h[i + 2 * num_patterns]
        patterns[tuple(sorted([li_h[item1], li_h[item2]]))] = support

    print "\n=======================================================\n"
    print "L2 = ", patterns
    print "\n=======================================================\n"

    output_file.write(createFormattedPatterns(patterns, 2))

    new_modulo_map = {}
    index_id = 1

    actual_pattern_items = []
    index_items_lookup = []

    #patterns = {(2, 3, 5) : 1, (2, 3, 6) : 1, (2, 3, 7) : 1, (2, 4, 5) : 1, (2, 4, 7) : 1, (3, 5, 7) : 1}
    for pattern in sorted(patterns.keys()):
        if pattern[:-1] not in new_modulo_map:
            new_modulo_map[pattern[:-1]] = index_id
            prev_len = len(actual_pattern_items)
            pattern_len = len(pattern[:-1])
            actual_pattern_items += pattern[:-1]
            index_items_lookup += [index_id, prev_len, pattern_len]
            index_id += 1

        if (pattern[-1], ) not in new_modulo_map:
            new_modulo_map[(pattern[-1], )] = index_id
            prev_len = len(actual_pattern_items)
            pattern_len = len([pattern[-1]])
            actual_pattern_items += [pattern[-1]]
            index_items_lookup += [index_id, prev_len, pattern_len]
            index_id += 1

    #print "Actual pattern items = ", actual_pattern_items
    #print "Index lookup = ", index_items_lookup
    print new_modulo_map

    new_patterns = []
    for pattern in patterns:
        new_patterns.append(
            (new_modulo_map[pattern[:-1]], new_modulo_map[(pattern[-1], )]))
    print new_patterns

    new_new_pattern = []
    for pattern in new_patterns:
        new_new_pattern.append(pattern[0] * 10**power + pattern[1])

    new_new_pattern.sort()
    print new_new_pattern

    k = len(new_new_pattern)

    li_h = np.array(new_new_pattern, dtype=np.int32)

    ci_h = np.array([-1 for i in range(0, k**2)], dtype=np.int32)
    ci_d = cuda.to_device(ci_h)

    #li_h = np.array(sorted([randint(10, 99) for i in range(0, k)]), dtype=np.int32)

    t1 = time()
    li_d = cuda.to_device(li_h)
    number_of_blocks = (int(ceil(k / (1.0 * MAX_ITEM_PER_SM))), 1)
    selfJoinGPU[number_of_blocks, threads_per_block](li_d, ci_d, k, power)

    li_d.copy_to_host(li_h)
    ci_d.copy_to_host(ci_h)

    api_h = np.array(actual_pattern_items, dtype=np.int32)
    iil_h = np.array(index_items_lookup, dtype=np.int32)

    api_d = cuda.to_device(api_h)
    iil_d = cuda.to_device(iil_h)

    print "Api_h = ", list(api_h), " Size = ", api_h.size
    print "IIL_H = ", list(iil_h), " Size = ", iil_h.size
    t2 = time()
    print "LI_H = ", li_h
    print "Initial Mask = ", ci_h.reshape(k, k)

    #number_of_blocks = (1, 1) #(int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)
    number_of_blocks = (int(
        ceil(num_transactions / (1.0 * MAX_TRANSACTIONS_PER_SM))), 1)

    print "Num transactions = ", num_transactions
    print "Num patterns = ", k
    print "index = ", li_h
    print "Size of api_d = ", api_h.size
    print "Size of iil_h = ", iil_h.size
    findHigherPatternFrequencyGPU[number_of_blocks,
                                  threads_per_block](d_transactions, d_offsets,
                                                     num_transactions,
                                                     num_elements, li_d, ci_d,
                                                     k, api_d, iil_d, power,
                                                     api_h.size, iil_h.size)
    cuda.synchronize()
    ci_d.copy_to_host(ci_h)

    print "Final Mask = ", ci_h.reshape(k, k)
    #d_transactions.copy_to_host(transactions)

    #print transactions[:num_elements]

    threads_per_block = (BLOCK_SIZE, BLOCK_SIZE)
    number_of_blocks = ((int(ceil(k / (1.0 * threads_per_block[0])))),
                        (int(ceil(k / (1.0 * threads_per_block[0])))))

    pruneMultipleGPU[number_of_blocks, threads_per_block](ci_d, k, min_support)

    ci_d.copy_to_host(ci_h)
    print "Outer Mask = ", ci_h.reshape(k, k)
    print "K = ", k

    ci_hn = np.zeros(k, dtype=np.int32)
    ci_dn = cuda.to_device(ci_hn)

    combinationsAvailable[number_of_blocks, threads_per_block](ci_d, ci_dn, k)

    ci_dn.copy_to_host(ci_hn)

    print "Ci_hn = ", list(ci_hn)

    ci_hnx = np.empty(k, dtype=np.int32)
    ci_dnx = cuda.to_device(ci_hnx)

    preScan(ci_dnx, ci_dn, k)

    ci_dnx.copy_to_host(ci_hnx)
    num_patterns = ci_hnx[-1]
    print list(ci_hnx)

    sparseM_h = np.empty(ci_hnx[-1] * 3, dtype=np.uint32)
    sparseM_d = cuda.to_device(sparseM_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(k / (1.0 * threads_per_block[0]))), 1)
    print "K = ", k

    convert2Sparse[number_of_blocks,
                   threads_per_block](ci_d, ci_dnx, sparseM_d, num_patterns, k)

    sparseM_d.copy_to_host(sparseM_h)

    # sparseM_h = sparseM_h.reshape(3, num_patterns)
    print sparseM_h.reshape(3, num_patterns)

    patterns = {}
    for i in range(0, num_patterns):
        item1 = sparseM_h[i]
        item2 = sparseM_h[i + num_patterns]
        support = sparseM_h[i + 2 * num_patterns]
        patterns[tuple(sorted([li_h[item1], li_h[item2]]))] = support
    print patterns

    actual_patterns = {}

    for pattern in patterns:
        v_common_pat = pattern[0] / (10**power)
        vitem1 = pattern[0] % (10**power)
        vitem2 = pattern[1] % (10**power)

        item1 = actual_pattern_items[index_items_lookup[(vitem1 - 1) * 3 + 1]]
        item2 = actual_pattern_items[index_items_lookup[(vitem2 - 1) * 3 + 1]]

        common_pat_start = index_items_lookup[(v_common_pat - 1) * 3 + 1]
        common_pat_length = index_items_lookup[(v_common_pat - 1) * 3 + 2]
        common_pat_end = common_pat_start + common_pat_length

        common_pattern = actual_pattern_items[common_pat_start:common_pat_end]

        pattern_key = tuple(common_pattern) + tuple(sorted([item1, item2]))
        actual_patterns[pattern_key] = patterns[pattern]

    print "\n=======================================================\n"
    print "L3 = ", actual_patterns
    print "\n=======================================================\n"

    output_file.write(createFormattedPatterns(actual_patterns, 3))

    output_file.close()
Example #29
    def backward(dY, cache, g_WLSTM):

        Wd = cache['Wd']
        Hout = cache['Hout']
        IFOG = cache['IFOG']
        IFOGf = cache['IFOGf']
        C = cache['C']
        Hin = cache['Hin']
        g_Hin = cuda.to_device(np.asfortranarray(Hin.T))
        WLSTM = cache['WLSTM']
        X = cache['X']
        tanhC_version = cache['tanhC_version']
        drop_prob_encoder = cache['drop_prob_encoder']
        drop_prob_decoder = cache['drop_prob_decoder']
        n, d = Hout.shape

        # we have to add back a row of zeros, since in the forward pass
        # this information was not used. See NOTE1 above.
        dY = np.row_stack([np.zeros(dY.shape[1]), dY])

        # backprop the decoder
        dWd = Hout.transpose().dot(dY)
        dbd = np.sum(dY, axis=0, keepdims=True)
        dHout = dY.dot(Wd.transpose())

        # backprop dropout, if it was applied
        if drop_prob_decoder > 0:
            dHout *= cache['U2']

        # backprop the LSTM
        dIFOG = np.array(np.zeros(IFOG.shape), order='F')
        dIFOGf = np.zeros(IFOGf.shape)
        dWLSTMCp = np.array(np.zeros(WLSTM.shape), order='F')
        dWLSTM = cuda.device_array(dWLSTMCp.shape, order='F')
        dWLSTM.copy_to_device(dWLSTMCp)
        dHin = np.array(np.zeros((1, Hin.shape[1])), order='F')
        g_dHin = cuda.device_array((1, Hin.shape[1]), order='F')
        dC = np.zeros(C.shape)
        dX = np.zeros(X.shape)
        for t in reversed(xrange(n)):

            if tanhC_version:
                tanhCt = np.tanh(C[t])  # recompute this here
                dIFOGf[t, 2 * d:3 * d] = tanhCt * dHout[t]
                # backprop tanh non-linearity first then continue backprop
                dC[t] += (1 - tanhCt**2) * (IFOGf[t, 2 * d:3 * d] * dHout[t])
            else:
                dIFOGf[t, 2 * d:3 * d] = C[t] * dHout[t]
                dC[t] += IFOGf[t, 2 * d:3 * d] * dHout[t]

            if t > 0:
                dIFOGf[t, d:2 * d] = C[t - 1] * dC[t]
                dC[t - 1] += IFOGf[t, d:2 * d] * dC[t]
            dIFOGf[t, :d] = IFOGf[t, 3 * d:] * dC[t]
            dIFOGf[t, 3 * d:] = IFOGf[t, :d] * dC[t]

            # backprop activation functions
            dIFOG[t, 3 * d:] = (1 - IFOGf[t, 3 * d:]**2) * dIFOGf[t, 3 * d:]
            y = IFOGf[t, :3 * d]
            dIFOG[t, :3 * d] = (y * (1.0 - y)) * dIFOGf[t, :3 * d]

            # backprop matrix multiply
            #dWLSTM += np.outer(Hin[t], dIFOG[t])
            #dHin[t] = dIFOG[t].dot(WLSTM.transpose())
            g_dIFOG = cuda.to_device(dIFOG[t:t + 1])
            g_dHin, dWLSTM = backMultSubroutine(g_Hin[:, t:t + 1], g_WLSTM,
                                                g_dIFOG, dWLSTM, g_dHin)
            g_dHin.copy_to_host(dHin)
            # backprop the identity transforms into Hin
            dX[t] = dHin[0, 1:1 + d]
            if t > 0:
                dHout[t - 1] += dHin[0, 1 + d:]

        if drop_prob_encoder > 0:  # backprop encoder dropout
            dX *= cache['U']

        dWLSTM.copy_to_host(dWLSTMCp)
        return {
            'WLSTM': dWLSTMCp,
            'Wd': dWd,
            'bd': dbd,
            'dXi': dX[0, :],
            'dXs': dX[1:, :]
        }
Example #30
def main(*args):
    OPT_N = 4000000
    iterations = 10

    if len(args) >= 2:
        iterations = int(args[0])

    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    # Use cuRAND to generate random numbers directly on the GPU
    # to avoid memory transfers.
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)

    time0 = time.time()

    # malloc
    d_stockPrice = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionStrike = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionYears = cuda.device_array(shape=(OPT_N), dtype=np.float32)

    # Base distribution
    prng.uniform(d_stockPrice)
    prng.uniform(d_optionStrike)
    prng.uniform(d_optionYears)

    stream = cuda.stream()

    cfg_distribute = c_distribute[griddim, blockdim, stream]

    cfg_distribute(d_stockPrice, 5.0, 30.0)
    cfg_distribute(d_optionStrike, 1.0, 100.0)
    cfg_distribute(d_optionYears, 0.25, 10.)

    stream.synchronize()

    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)

    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)

    time1 = time.time()

    # Preconfigure the kernel as it's called multiple times in a loop.
    cfg_black_scholes_cuda = black_scholes_cuda[griddim, blockdim, stream]

    for i in range(iterations):
        cfg_black_scholes_cuda(d_callResult, d_putResult, d_stockPrice,
                               d_optionStrike, d_optionYears, RISKFREE,
                               VOLATILITY)

        d_callResult.to_host(stream)
        d_putResult.to_host(stream)

        stream.synchronize()

    time2 = time.time()
    dt = (time1 - time0) * 10 + (time2 - time1)

    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))
Example #31
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [
        int(math.ceil(float(partlen) / blksz)) for partlen in partlens
    ]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [
        curand.PRNG(curand.PRNG.MRG32K3A, stream=strm) for strm in strmlist
    ]

    # Allocate device side array
    d_normlist = [
        cuda.device_array(partlen, dtype=np.double, stream=strm)
        for partlen, strm in zip(partlens, strmlist)
    ]

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [
        cu_step[gridsz, blksz, strm]
        for gridsz, strm in zip(gridszlist, strmlist)
    ]

    d_lastlist = [
        cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
        for (s, e), strm in zip(partitions, strmlist)
    ]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [
            cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm))
            for (s, e), strm in zip(partitions, strmlist)
        ]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist,
                                            d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, d_last, strm, (s, e) in zip(d_pathslist, d_lastlist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #32

 def backward(dY, cache, g_WLSTM):

    Wd = cache['Wd']
    Hout = cache['Hout']
    IFOG = cache['IFOG']
    IFOGf = cache['IFOGf']
    C = cache['C']
    Hin = cache['Hin']
    g_Hin = cuda.to_device(np.asfortranarray(Hin.T))
    WLSTM = cache['WLSTM']
    X = cache['X']
    tanhC_version = cache['tanhC_version']
    drop_prob_encoder = cache['drop_prob_encoder']
    drop_prob_decoder = cache['drop_prob_decoder']
    n,d = Hout.shape

    # we have to add back a row of zeros, since in the forward pass
    # this information was not used. See NOTE1 above.
    dY = np.row_stack([np.zeros(dY.shape[1]), dY])

    # backprop the decoder
    dWd = Hout.transpose().dot(dY)
    dbd = np.sum(dY, axis=0, keepdims = True)
    dHout = dY.dot(Wd.transpose())

    # backprop dropout, if it was applied
    if drop_prob_decoder > 0:
      dHout *= cache['U2']

    # backprop the LSTM
    dIFOG = np.array(np.zeros(IFOG.shape),order='F')
    dIFOGf = np.zeros(IFOGf.shape)
    dWLSTMCp = np.array(np.zeros(WLSTM.shape),order='F')
    dWLSTM = cuda.device_array(dWLSTMCp.shape,order='F')
    dWLSTM.copy_to_device(dWLSTMCp)
    dHin = np.array(np.zeros((1,Hin.shape[1])),order='F')
    g_dHin = cuda.device_array((1,Hin.shape[1]),order='F')
    dC = np.zeros(C.shape)
    dX = np.zeros(X.shape)
    for t in reversed(xrange(n)):

      if tanhC_version:
        tanhCt = np.tanh(C[t]) # recompute this here
        dIFOGf[t,2*d:3*d] = tanhCt * dHout[t]
        # backprop tanh non-linearity first then continue backprop
        dC[t] += (1-tanhCt**2) * (IFOGf[t,2*d:3*d] * dHout[t])
      else:
        dIFOGf[t,2*d:3*d] = C[t] * dHout[t]
        dC[t] += IFOGf[t,2*d:3*d] * dHout[t]

      if t > 0:
        dIFOGf[t,d:2*d] = C[t-1] * dC[t]
        dC[t-1] += IFOGf[t,d:2*d] * dC[t]
      dIFOGf[t,:d] = IFOGf[t, 3*d:] * dC[t]
      dIFOGf[t, 3*d:] = IFOGf[t,:d] * dC[t]
      
      # backprop activation functions
      dIFOG[t,3*d:] = (1 - IFOGf[t, 3*d:] ** 2) * dIFOGf[t,3*d:]
      y = IFOGf[t,:3*d]
      dIFOG[t,:3*d] = (y*(1.0-y)) * dIFOGf[t,:3*d]

      # backprop matrix multiply
      #dWLSTM += np.outer(Hin[t], dIFOG[t])
      #dHin[t] = dIFOG[t].dot(WLSTM.transpose())
      g_dIFOG = cuda.to_device(dIFOG[t:t+1])
      g_dHin, dWLSTM = backMultSubroutine(g_Hin[:,t:t+1],g_WLSTM,g_dIFOG,dWLSTM,g_dHin)
      g_dHin.copy_to_host(dHin)
      # backprop the identity transforms into Hin
      dX[t] = dHin[0,1:1+d]
      if t > 0:
        dHout[t-1] += dHin[0,1+d:]

    if drop_prob_encoder > 0: # backprop encoder dropout
      dX *= cache['U']
        
    dWLSTM.copy_to_host(dWLSTMCp)
    return { 'WLSTM': dWLSTMCp, 'Wd': dWd, 'bd': dbd, 'dXi': dX[0,:], 'dXs': dX[1:,:] }
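backMultSubroutine is defined elsewhere; judging by the commented-out numpy lines above, it accumulates the outer product of Hin[t] and dIFOG[t] into dWLSTM and computes dHin = dIFOG[t].dot(WLSTM.T) on the GPU. A plausible sketch using numbapro's cuBLAS wrapper (argument names and shapes are assumptions):

from numbapro.cudalib import cublas

blas = cublas.Blas()

def backMultSubroutine(g_hin_col, g_WLSTM, g_dIFOG, g_dWLSTM, g_dHin):
    # g_dWLSTM += outer(Hin[t], dIFOG[t]) as a (hin x 1)(1 x 4d) GEMM
    blas.gemm('N', 'N', g_hin_col.shape[0], g_dIFOG.shape[1], 1,
              1.0, g_hin_col, g_dIFOG, 1.0, g_dWLSTM)
    # g_dHin = dIFOG[t].dot(WLSTM.T)
    blas.gemm('N', 'T', 1, g_WLSTM.shape[0], g_dIFOG.shape[1],
              1.0, g_dIFOG, g_WLSTM, 0.0, g_dHin)
    return g_dHin, g_dWLSTM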
Example No. 33
    def add(self, a, b, out=None, alpha=1., beta=1.):
        """Pointwise addition of two arrays (scalar, 1D, or 2D).

        Broadcasts like numpy arrays.

        Parameters
        ----------
        a : array-like
            Array to add.
        b : array-like
            Array to add.
        out : DeviceNDArray (optional)
            Result will overwrite out if given.
        alpha : float (optional)
            Scales a before addition.
        beta : float (optional)
            Scales b before addition.
        """

        b, out_dtype = _check_array(b)
        a, out_dtype = _check_array(a)

        if out is not None and not isinstance(out, cuda.cudadrv.devicearray.DeviceNDArray):
            raise NotImplementedError

        a_dim = a.shape
        b_dim = b.shape

        # Matrix-matrix addition
        if a.ndim == 2 and b.ndim == 2:
            # Full-size matrices
            if a_dim == b_dim:
                if out is None:
                    out = cuda.device_array((a_dim[0], a_dim[1]), dtype=out_dtype, order='F')
                elif out.shape[0] == a_dim[0] and out.shape[1] == a_dim[1]:
                    pass
                else:
                    raise ValueError('matrices are not aligned')

                self.blas.geam('N', 'N', a_dim[0], a_dim[1], alpha, a, beta, b, out)
            # np.newaxis matrices
            elif a_dim[0] == b_dim[0] and b_dim[1] == 1:
                if out is None:
                    out = cuda.device_array((a_dim[0], a_dim[1]), dtype=out_dtype, order='F')
                elif out.shape[0] == a_dim[0] and out.shape[1] == a_dim[1]:
                    pass
                else:
                    raise ValueError('matrices are not aligned')
                blockdim = (32,32)
                griddim = (int(ceil(float(a_dim[0])/blockdim[0])), int(ceil(float(a_dim[1])/blockdim[1])))
                if alpha != 1. or beta != 1.:
                    m_mn_sadd_pointwise[griddim,blockdim, self.stream](a,b,alpha,beta,out)
                else:
                    m_mn_add_pointwise[griddim,blockdim, self.stream](a,b,out)
            elif a_dim[1] == b_dim[1] and b_dim[0] == 1:
                if out is None:
                    out = cuda.device_array((a_dim[0], a_dim[1]), dtype=out_dtype, order='F')
                elif out.shape == a_dim:
                    pass
                else:
                    raise ValueError('matrices are not aligned')
                blockdim = (32,32)
                griddim = (int(ceil(float(a_dim[0])/blockdim[0])), int(ceil(float(a_dim[1])/blockdim[1])))
                if alpha != 1. or beta != 1.:
                    m_nm_sadd_pointwise[griddim,blockdim, self.stream](a,b,alpha,beta,out)
                else:
                    m_nm_add_pointwise[griddim,blockdim, self.stream](a,b,out)
            elif b_dim[0] == a_dim[0] and a_dim[1] == 1:
                if out is None:
                    out = cuda.device_array((b_dim[0], b_dim[1]), dtype=out_dtype, order='F')
                elif out.shape[0] == b_dim[0] and out.shape[1] == b_dim[1]:
                    pass
                else:
                    raise ValueError('matrices are not aligned')
                blockdim = (32,32)
                griddim = (int(ceil(float(b_dim[0])/blockdim[0])), int(ceil(float(b_dim[1])/blockdim[1])))
                if alpha != 1. or beta != 1.:
                    m_mn_sadd_pointwise[griddim,blockdim, self.stream](b,a,beta,alpha,out)
                else:
                    m_mn_add_pointwise[griddim,blockdim, self.stream](b,a,out)
            elif b_dim[1] == a_dim[1] and a_dim[0] == 1:
                if out is None:
                    out = cuda.device_array((b_dim[0], b_dim[1]), dtype=out_dtype, order='F')
                elif out.shape[0] == b_dim[0] and out.shape[1] == b_dim[1]:
                    pass
                else:
                    raise ValueError('matrices are not aligned')
                blockdim = (32,32)
                griddim = (int(ceil(float(b_dim[0])/blockdim[0])), int(ceil(float(b_dim[1])/blockdim[1])))
                if alpha != 1. or beta != 1.:
                    m_nm_sadd_pointwise[griddim,blockdim, self.stream](b,a,beta,alpha,out)
                else:
                    m_nm_add_pointwise[griddim,blockdim, self.stream](b,a,out)
            else:
                raise ValueError('matrices are not aligned')
        # Vector-vector addition
        elif a.ndim == 1 and b.ndim == 1:
            if a_dim[0] != b_dim[0]:
                raise ValueError('matrices are not aligned')
            if out is None:
                out = cuda.device_array(a_dim[0], dtype=out_dtype, order='F')
            elif out.shape[0] == a_dim[0]:
                pass
            else:
                raise ValueError('matrices are not aligned')
            blockdim = 32
            griddim = int(ceil(float(a_dim[0])/blockdim))
            if alpha != 1. or beta != 1.:
                vsadd_pointwise[griddim,blockdim, self.stream](a,b,alpha,beta,out)
            else:
                vadd_pointwise[griddim,blockdim, self.stream](a,b,out)
        # Matrix-scalar addition
        elif a.ndim == 2 and b.ndim == 0:
            if out is None:
                out = cuda.device_array(a_dim, dtype=out_dtype, order='F')
            elif out.shape == a_dim:
                pass
            else:
                raise ValueError('matrices are not aligned')
            blockdim = (32,32)
            griddim = (int(ceil(float(a_dim[0])/blockdim[0])), int(ceil(float(a_dim[1])/blockdim[1])))
            if alpha != 1. or beta != 1.:
                ms_sadd_pointwise[griddim,blockdim, self.stream](a,b,alpha,beta,out)
            else:
                ms_add_pointwise[griddim,blockdim, self.stream](a,b,out)
        # Scalar-matrix addition
        elif a.ndim == 0 and b.ndim == 2:
            if out is None:
                out = cuda.device_array(b_dim, dtype=out_dtype, order='F')
            elif out.shape == b_dim:
                pass
            else:
                raise ValueError('matrices are not aligned')
            blockdim = (32,32)
            griddim = (int(ceil(float(b_dim[0])/blockdim[0])), int(ceil(float(b_dim[1])/blockdim[1])))
            if alpha != 1. or beta != 1.:
                ms_sadd_pointwise[griddim,blockdim, self.stream](b,a,beta,alpha,out)
            else:
                ms_add_pointwise[griddim,blockdim, self.stream](b,a,out)
        # Vector-scalar addition
        elif a.ndim == 1 and b.ndim == 0:
            if out is None:
                out = cuda.device_array(a_dim, dtype=out_dtype, order='F')
            elif out.shape == a_dim:
                pass
            else:
                raise ValueError('matrices are not aligned')
            blockdim = 32
            griddim = int(ceil(float(a_dim[0])/blockdim))
            if alpha != 1. or beta != 1.:
                vs_sadd_pointwise[griddim,blockdim, self.stream](a,b,alpha,beta,out)
            else:
                vs_add_pointwise[griddim,blockdim, self.stream](a,b,out)
        # Scalar-vector addition
        elif a.ndim == 0 and b.ndim == 1:
            if out is None:
                out = cuda.device_array(b_dim, dtype=out_dtype, order='F')
            elif out.shape == b_dim:
                pass
            else:
                raise ValueError('matrices are not aligned')
            blockdim = 32
            griddim = int(ceil(float(b_dim[0])/blockdim))
            if alpha != 1. or beta != 1.:
                vs_sadd_pointwise[griddim,blockdim, self.stream](b,a,beta,alpha,out)
            else:
                vs_add_pointwise[griddim,blockdim, self.stream](b,a,out)
        # Matrix-vector addition
        elif a.ndim == 2 and b.ndim == 1:
            if out is None:
                out = cuda.device_array(a_dim, dtype=out_dtype, order='F')
            elif out.shape == a_dim:
                pass
            else:
                raise ValueError('matrices are not aligned')
            blockdim = (32,32)
            griddim = (int(ceil(float(a_dim[0])/blockdim[0])), int(ceil(float(a_dim[1])/blockdim[1])))
            if b.shape[0] == a.shape[0]:
                if alpha != 1. or beta != 1.:
                    mv0_sadd_pointwise[griddim,blockdim, self.stream](a,b,alpha,beta,out)
                else:
                    mv0_add_pointwise[griddim,blockdim, self.stream](a,b,out)
            elif b.shape[0] == a.shape[1]:
                if alpha != 1. or beta != 1.:
                    mv1_sadd_pointwise[griddim,blockdim, self.stream](a,b,alpha,beta,out)
                else:
                    mv1_add_pointwise[griddim,blockdim, self.stream](a,b,out)
            else:
                raise ValueError('matrices are not aligned')
        # Vector-matrix addition
        elif a.ndim == 1 and b.ndim == 2:
            if out is None:
                out = cuda.device_array(b_dim, dtype=out_dtype, order='F')
            elif out.shape == b_dim:
                pass
            else:
                raise ValueError('matrices are not aligned')
            blockdim = (32,32)
            griddim = (int(ceil(float(b_dim[0])/blockdim[0])), int(ceil(float(b_dim[1])/blockdim[1])))
            if a.shape[0] == b.shape[0]:
                if alpha != 1. or beta != 1.:
                    mv0_sadd_pointwise[griddim,blockdim, self.stream](b,a,beta,alpha,out)
                else:
                    mv0_add_pointwise[griddim,blockdim, self.stream](b,a,out)
            elif a.shape[0] == b.shape[1]:
                if alpha != 1. or beta != 1.:
                    mv1_sadd_pointwise[griddim,blockdim, self.stream](b,a,beta,alpha,out)
                else:
                    mv1_add_pointwise[griddim,blockdim, self.stream](b,a,out)
            else:
                raise ValueError('matrices are not aligned')
        else:
            raise NotImplementedError
        return out
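A hypothetical usage of add (the owning class is not shown; `linalg` stands in for an instance of it):

import numpy as np

# `linalg` is assumed to be an instance of the (unshown) class defining add()
a = np.asfortranarray(np.random.rand(64, 32).astype(np.float32))
b = np.asfortranarray(np.random.rand(64, 1).astype(np.float32))

d_out = linalg.add(a, b, alpha=2.0)   # broadcasts b down the columns of a
print d_out.copy_to_host()            # should match 2*a + b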
Example No. 34
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon)**d))
    print(initNumSamples)
    maxSize = 6400

    ##actual algorithm
    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()

    # sorter = RadixSort(maxcount=Vd.shape[0], dtype=Vd.dtype, stream=custr,
    #                    descending=True)

    prng = curand.PRNG(stream=custr)
    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Prepare storage for vector A
        # print(Vd.dtype)
        # print('dA', (Vd.shape[0], numSamples))
        # print('dI', (k, numSamples))

        dA = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples),
                               order='F',
                               dtype=Vd.dtype)

        #GENERATE ALL RANDOM SAMPLES BEFORE
        # Also do normalization on the device
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)

        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)
        #C = dC.copy_to_host()

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel can be easily scaled due to the use of num of samples
        #       as the ncta
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominating the time
        nn = Vd.shape[0]
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * nn
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz), divup(dI.shape[1], blksz)),
                     (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # async_dA = dA.bind(custr)
        # async_dI = dI.bind(custr)

        # selnext = sorter.batch_argselect(dtype=dA.dtype,
        #                                  count=dA.shape[0],
        #                                  k=k,
        #                                  reverse=True)
        # for i in range(numSamples):
        #     dIi = selnext(async_dA[:, i])
        #     async_dI[:, i].copy_to_device(dIi, stream=custr)

        # for i in range(numSamples):
        #     # radix_argselect(async_dA[:, i], k=k, stream=custr,
        #     #                 storeidx=async_dI[:, i])
        #     dIi = sorter.argselect(k, async_dA[:, i])
        #     async_dI[:, i].copy_to_device(dIi, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # batch_scatter_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, dI,
        #                                                              daInorm)

        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)

        custr.synchronize()

        for i in xrange(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)

                # Only copy what we need
                Ik = dI[:, i].copy_to_host()
                aIk = dA[:, i].copy_to_host().reshape(k, 1)
                opt_x[Ik] = (aIk / val)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
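The grid-sizing helpers divup and calc_ncta1d used above are not included in this snippet; their assumed semantics are plain ceiling division:

def divup(a, b):
    # smallest integer >= a / b
    return (a + b - 1) // b

def calc_ncta1d(size, blksz):
    # number of thread blocks needed to cover `size` elements
    return divup(size, blksz)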
Example No. 35
def fista(I, Phi, lambdav, L=None, tol=10e-6, max_iterations=200, display=True, verbose=False):
	b = cublas.Blas()
	c = cusparse.Sparse()
	descr = c.matdescr()
	(m, n) = Phi.shape
	(m, batch) = I.shape

	if L is None:
		L = scipy.sparse.linalg.svds(Phi, 1, which='LM', return_singular_vectors=False)
		print "Max eigenvalue: " + str(L)

	L = (L**2)*4 # L = svd(Phi) -> eig(2*(Phi.T*Phi))
	invL = 1/L
	t = 1.

	#if sps.issparse(Phi):
	#	Phi = np.array(Phi.todense())

	d_I = cuda.to_device(np.array(I, dtype=np.float32, order='F'))
	# d_Phi = cuda.to_device(np.array(Phi, dtype=np.float32, order='F'))
	d_Phi =  cusparse.csr_matrix(Phi, dtype=np.float32)
	d_PhiT = cusparse.csr_matrix(Phi.T, dtype=np.float32) # hack because csrgemm issues with 'T'
	# d_Q = cuda.device_array((n, n), dtype=np.float32, order='F')
	d_c = cuda.device_array((n, batch), dtype=np.float32, order='F')
	d_x = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
	d_y = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
	d_x2 = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))

	# Temporary array variables
	d_t = cuda.device_array((m, batch), dtype=np.float32, order='F')
	d_t2 = cuda.device_array(n*batch, dtype=np.float32, order='F')

	#b.gemm('T', 'N', n, n, m, 1, d_Phi, d_Phi, 0, d_Q) 	# Q = Phi^T * Phi
	#b.gemm('T', 'N', n, batch, m, -2, d_Phi, d_I, 0, d_c) # c = -2*Phi^T * y
	# c.csrgemm('T', 'N', n, n, m, descr, d_Phi.nnz, d_Phi.data, d_Phi.indptr, d_Phi.indices,
	#	descr, d_Phi.nnz, d_Phi.data, d_Phi.indptr, d_Phi.indices, descr, d_Q.data, d_Q.indptr, d_Q.indices)
	d_Q = c.csrgemm_ez(d_PhiT, d_Phi, transA='N', transB='N')
	c.csrmm('T', m, batch, n, d_Phi.nnz, -2, descr, d_Phi.data, d_Phi.indptr, d_Phi.indices,
		d_I, m, 0, d_c, n)

	blockdim = 32, 32
	griddim = int(math.ceil(float(n)/blockdim[0])), int(math.ceil(float(batch)/blockdim[1]))

	blockdim_1d = 256
	griddim_1d = int(math.ceil(float(n*batch)/blockdim_1d))

	start = l2l1obj(b, c, descr, d_I, d_Phi, d_x, d_t, d_t2, lambdav, blockdim_1d, griddim_1d)
	obj2 = start

	for i in xrange(max_iterations):

		# x2 = 2*Q*y + c
		# b.symm('L', 'U', n, batch, 2, d_Q, d_y, 0, d_x2)
		c.csrmm('N', n, batch, n, d_Q.nnz, 2, descr, d_Q.data, d_Q.indptr, d_Q.indices,
			d_y, n, 0, d_x2, n)
		b.geam('N', 'N', n, batch, 1, d_c, 1, d_x2, d_x2)

		# x2 = y - invL * x2
		b.geam('N', 'N', n, batch, 1, d_y, -invL, d_x2, d_x2)

		# proxOp()
		l1prox[griddim, blockdim](d_x2, invL*lambdav, d_x2)
		t2 = (1+math.sqrt(1+4*(t**2)))/2.0

		# y = x2 + ((t-1)/t2)*(x2-x)
		b.geam('N', 'N', n, batch, 1+(t-1)/t2, d_x2, (1-t)/t2, d_x, d_y)

		# x = x2
		b.geam('N', 'N', n, batch, 1, d_x2, 0, d_x, d_x)
		t = t2

		# update objective
		obj = obj2
		obj2 = l2l1obj(b, c, descr, d_I, d_Phi, d_x2, d_t, d_t2, lambdav, blockdim_1d, griddim_1d)

		if verbose:
			x2 = d_x2.copy_to_host()
			print "L1 Objective: " + str(obj2)
			# print "L1 Objective: " +  str(lambdav*np.sum(np.abs(x2)) + np.sum((I-Phi.dot(x2))**2))

		if np.abs(obj-obj2)/float(obj) < tol:
			break

	x2 = d_x2.copy_to_host()

	if display:
		print "FISTA Iterations: " + str(i)
		# print "L1 Objective: " + str(obj2)
		print "L1 Objective: " +  str(lambdav*np.sum(np.abs(x2)) + np.sum((I-Phi.dot(x2))**2))
		print "Objective delta: " + str(obj2-start)

	return x2
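The l1prox kernel called above is the proximal operator of the scaled L1 norm, i.e. elementwise soft-thresholding. A minimal sketch, assuming the (input, threshold, output) signature seen at the call site:

from numbapro import cuda

@cuda.jit('void(float32[:,:], float32, float32[:,:])')
def l1prox(x, t, out):
    i, j = cuda.grid(2)
    if i < x.shape[0] and j < x.shape[1]:
        # shrink each entry toward zero by t
        v = x[i, j]
        if v > t:
            out[i, j] = v - t
        elif v < -t:
            out[i, j] = v + t
        else:
            out[i, j] = 0.0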
Example No. 36
pricePath = []

for j in range(paths):
    print "Generating path: %s" % j
    # plotting lists
    LogReturns, nLogReturns = [0], [0] # log returns, normalized log returns
    xchange, xcorrelation = [], [] # change in price P, and autocorrelation
    activeTraders = [] # number of active traders
    prices = [initialPrice]
    price = initialPrice

    for i in range(steps):

        # Allocate device side array
        enterProbs = cuda.device_array(w*h, dtype=np.double, stream=stream)
        activateProbs = cuda.device_array(w*h, dtype=np.double, stream=stream)
        choiceProbs = cuda.device_array(w*h, dtype=np.double, stream=stream)
        diffuseProbs = cuda.device_array(w*h, dtype=np.double, stream=stream)

        #calculate cluster info
        cluster, clusterSize, nClust, nClustOnes = calcCluster(A) # get cluster info

        xis = cuda.device_array(nClust, dtype=np.double, stream = stream)
        eta = cuda.device_array(w*h, dtype=np.double, stream = stream) # 1 -> w*h*w*h, how to index???


        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream) #update grid
            dB = cuda.to_device(B, stream) #upload new location
            dCluster = cuda.to_device(cluster, stream) #upload cluster grid to GPU
Example No. 37
def score_sequence(seq, pssm, verbose = False, keep_strands = True, benchmark = False, blocks_per_grid = -1, threads_per_block = -1):
    """
    This function will score a sequence of nucleotides based on a PSSM by using
    a sliding window parallelized on a GPU.
    
    Args:
        seq: This must be an integer representation of the nucleotide sequence,
            where the alphabet is (A = 0, C = 1, G = 2, T = 3). It must be a 
            vector (1D array) of integers that can be cast to int32 (See: 
            numpy.int32).
        pssm: This must be a vectorized PSSM where every four elements correspond 
            to one position. Make sure this can be cast to an array of float64.
        verbose: Set this to True to print performance information.
        benchmark: If set to True, the function will return information about
            the run in a dictionary at the third output variable.
        keep_strands: Whether memory should be allocated for storing which
            strand the scores come from. Set this to False if you just want the
            scores and the strands array will not be returned.
            NOTE: If this and benchmark are set to False, then the scores will
            not be returned in a tuple; the bare scores array is returned
            instead, e.g.:
                >>> scores = score_sequence(seq, pssm, keep_strands=False)
        blocks_per_grid: This is the blocks per grid that will be assigned to 
            the CUDA kernel. See this SO question for info on choosing this
            value: http://stackoverflow.com/questions/4391162/cuda-determining-threads-per-block-blocks-per-grid
            It defaults to the length of the sequence or the maximum number of
            blocks per grid supported by the GPU, whichever is lower.
            Set this to a negative number to use the default.
        threads_per_block: Threads per block. See above. It defaults to 55% of
            the maximum number of threads per block supported by the GPU, a
            value determined experimentally. Higher values will likely result
            in failure to allocate resources to the kernel (since there will
            not be enough register space for each thread).
        
    Returns:
        scores: 1D float64 array of length (n - w + 1), where n is the length
            of the sequence and w is the window size. The value at index i of
            this array corresponds to the score of the n-mer at position i in 
            the sequence.
        strands: 1D int32 array of length (n - w + 1). The value at position i
            is either 0 or 1 corresponding to the strand of the score at that
            position where 0 means the forward strand and 1 means reverse.
        run_info: This is a dictionary that is returned if the benchmark
            parameter is set to True. It contains the following:
            >>> run_info.keys()
            ['memory_used', 'genome_size', 'runtime', 'threads_per_block', 'blocks_per_grid']
            Note that the memory_used is rather misleading if running the
            function more than once. CUDA is optimized to not transfer the same
            data from the host to the device so it will not always change. It
            may also unload other assets from memory, so the memory changed can
            be negative.
            TODO: Find a better method of calculating memory usage.
            
    Example:
        >>> pssm = np.random.uniform(-7.5, 2.0, 4 * 16) # Window size of 16
        >>> seq = np.random.randint(0, 4, int(30e6)) # Generate random 30 million bp sequence
        >>> scores, strands, run_info = score_sequence(seq, pssm, benchmark=True, verbose=True)
        Threads per block = 563
        Blocks per grid = 53286
        Total threads = 30000018
        Scoring... Done.
        Genome size: 3e+07 bp
        Time: 605.78 ms
        Speed: 4.95229e+07 bp/sec
        >>> scores
        array([-16.97089798, -33.48925866, -21.80381526, ..., -10.27919401,
               -32.64575614, -23.97110103])
        >>> strands
        array([1, 1, 1, ..., 1, 1, 0])
        >>> run_info
        {'memory_used': 426508288L, 'genome_size': 30000000, 'runtime': 0.28268090518054123, 'threads_per_block': 563, 'blocks_per_grid': 53286}
        
    A more interesting interpretation of the run information for performance 
    analysis is the number of bases score per second:
        >>> print "%g bases/sec" % run_info["genome_size"] / run_info["runtime"]
        1.06127e+08 bases/sec
    """
    w = int(pssm.size / 4) # width of PSSM
    n = int(seq.size) # length of the sequence being scored
    
    # Calculate the reverse-complement of the PSSM
    pssm_r = np.array([pssm[(i / 4) * 4 + (3 - (i % 4))] for i in range(pssm.size)][::-1])

    # Calculate the appropriate threads per block and blocks per grid    
    if threads_per_block <= 0 or blocks_per_grid <= 0:
        # We don't use the max number of threads to avoid running out of
        # register space by saturating the streaming multiprocessors
        # ~55% was found empirically, but your mileage may vary with different GPUs
        threads_per_block = int(cuda.get_current_device().MAX_BLOCK_DIM_X * 0.55)
        
        # We saturate our grid and let the dynamic scheduler assign the blocks
        # to the discrete CUDA cores/streaming multiprocessors
        blocks_per_grid = int(math.ceil(float(n) / threads_per_block))
        if blocks_per_grid > cuda.get_current_device().MAX_GRID_DIM_X:
            blocks_per_grid = cuda.get_current_device().MAX_GRID_DIM_X
    
    if verbose:
        print "Threads per block = %d" % threads_per_block
        print "Blocks per grid = %d" % blocks_per_grid
        print "Total threads = %d" % (threads_per_block * blocks_per_grid)
    
    # Collect benchmarking info
    s = default_timer()
    start_mem = cuda.current_context().get_memory_info()[0]
    
    # Start a stream
    stream = cuda.stream()
    
    # Copy data to device
    d_pssm = cuda.to_device(pssm.astype(np.float64), stream)
    d_pssm_r = cuda.to_device(pssm_r.astype(np.float64), stream)
    d_seq = cuda.to_device(seq.astype(np.int32), stream)
    
    # Allocate memory on device to store results
    d_scores = cuda.device_array(n - w + 1, dtype=np.float64, stream=stream)
    if keep_strands:
        d_strands = cuda.device_array(n - w + 1, dtype=np.int32, stream=stream)
        
    # Run the kernel
    if keep_strands:
        cuda_score[blocks_per_grid, threads_per_block](d_pssm, d_pssm_r, d_seq, d_scores, d_strands)
    else:
        cuda_score_without_strands[blocks_per_grid, threads_per_block](d_pssm, d_pssm_r, d_seq, d_scores)
    
    # Copy results back to host
    scores = d_scores.copy_to_host(stream=stream)
    if keep_strands:
        strands = d_strands.copy_to_host(stream=stream)
    stream.synchronize()
    
    # Collect benchmarking info
    end_mem = cuda.current_context().get_memory_info()[0]
    t = default_timer() - s
    
    # Output info on the run if verbose parameter is true
    if verbose:
        print "Genome size: %g bp" % n
        print "Time: %.2f ms (using time.%s())" % (t * 1000, default_timer.__name__)
        print "Speed: %g bp/sec" % (n / t)
        print "Global memory: %d bytes used (%.2f%% of total)" % \
            (start_mem - end_mem, float(start_mem - end_mem) * 100 / cuda.get_current_device().get_context().get_memory_info()[1])
    
    # Return the run information for benchmarking
    run_info = {"genome_size": n, "runtime": t, "memory_used": start_mem - end_mem, \
                "blocks_per_grid": blocks_per_grid, "threads_per_block": threads_per_block}
                
    # I'm so sorry BDFL, please don't hunt me down for returning different size
    # tuples in my function
    if keep_strands:
        if benchmark:
            return (scores, strands, run_info)
        else:
            return (scores, strands)
    else:
        if benchmark:
            return (scores, run_info)
        else:
            # Careful! This won't return a tuple, so you don't need to do
            # score_sequence[0] to get the scores
            return scores
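The cuda_score kernel itself is not in this snippet. A hedged sketch of what it is assumed to do (one window per thread, ignoring the case where the grid is capped at MAX_GRID_DIM_X): each thread scores its window against the forward and reverse-complement PSSMs and keeps the larger score along with the strand it came from.

from numbapro import cuda

@cuda.jit('void(float64[:], float64[:], int32[:], float64[:], int32[:])')
def cuda_score(pssm, pssm_r, seq, scores, strands):
    i = cuda.grid(1)
    w = pssm.shape[0] // 4
    if i < seq.shape[0] - w + 1:
        fwd = 0.0
        rev = 0.0
        for j in range(w):
            # each position contributes the column for the observed base
            fwd += pssm[j * 4 + seq[i + j]]
            rev += pssm_r[j * 4 + seq[i + j]]
        if fwd >= rev:
            scores[i] = fwd
            strands[i] = 0
        else:
            scores[i] = rev
            strands[i] = 1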
Example No. 38
    def mult(self, a, b, out=None, alpha=None):
        """Pointwise multiplication of two 1D or 2D arrays.

        Parameters
        ----------
        a : array-like
            Array to multiply.
        b : array-like
            Array to multiply.
        out : DeviceNDArray (optional)
            Result will overwrite out if given.
        alpha : float
            Additional scale factor for multiplication.
        """

        if alpha is not None:
            raise NotImplementedError

        b, out_dtype = _check_array(b)
        a, out_dtype = _check_array(a)

        if out is not None and not isinstance(out, cuda.cudadrv.devicearray.DeviceNDArray):
            raise NotImplementedError

        if b.dtype != np.float32:
            raise NotImplementedError

        a_dim = a.shape
        b_dim = b.shape

        if a.ndim == 2 and b.ndim == 2:
            if a_dim[0] != b_dim[0] or a_dim[1] != b_dim[1]:
                raise ValueError('matrices are not aligned')

            if out is None:
                out = cuda.device_array((a_dim[0], a_dim[1]), dtype=out_dtype, order='F')
            elif out.shape[0] == a_dim[0] and out.shape[1] == a_dim[1]:
                pass
            else:
                raise ValueError('matrices are not aligned')

            blockdim2 = (32,32)
            griddim2 = (int(ceil(float(a_dim[0])/blockdim2[0])), int(ceil(float(a_dim[1])/blockdim2[1])))
            mmultiply_pointwise[griddim2,blockdim2, self.stream](a,b,out)

        elif a.ndim == 1 and b.ndim == 1:
            if a_dim[0] != b_dim[0]:
                raise ValueError('matrices are not aligned')
            if out is None:
                out = cuda.device_array(a_dim[0], dtype=out_dtype, order='F')
            elif out.shape[0] == a_dim[0]:
                pass
            else:
                raise ValueError('matrices are not aligned')
            blockdim = 32
            griddim = int(ceil(float(a_dim[0])/blockdim))
            vmultiply_pointwise[griddim,blockdim, self.stream](a,b,out)
        else:
            raise NotImplementedError

        return out
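The pointwise kernels referenced above are defined elsewhere; the 1D case is assumed to be a plain elementwise product:

from numbapro import cuda

@cuda.jit('void(float32[:], float32[:], float32[:])')
def vmultiply_pointwise(a, b, out):
    i = cuda.grid(1)
    if i < out.shape[0]:
        out[i] = a[i] * b[i]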
Example No. 39
def fista(I,
          Phi,
          lambdav,
          L=None,
          tol=10e-6,
          max_iterations=200,
          display=True,
          verbose=False):
    """
	I: Images 
 	Phi: Dictionary 
 	lambdav: Sparse Penalty 
 	L = Largest eigenvalue of Phi 
 	"""

    b = numbapro.cudalib.cublas.Blas()
    (m, n) = Phi.shape
    (m, batch) = I.shape

    if L is None:
        L = scipy.sparse.linalg.svds(Phi,
                                     1,
                                     which='LM',
                                     return_singular_vectors=False)
        print "Max eigenvalue: " + str(L)

    L = (L**2) * 2  # L = svd(Phi) -> eig(2*(Phi.T*Phi))
    invL = 1 / L
    t = 1.

    if sps.issparse(Phi):
        Phi = np.array(Phi.todense())

    d_I = cuda.to_device(np.array(I, dtype=np.float32, order='F'))
    d_Phi = cuda.to_device(np.array(Phi, dtype=np.float32, order='F'))
    d_Q = cuda.device_array((n, n), dtype=np.float32, order='F')
    d_c = cuda.device_array((n, batch), dtype=np.float32, order='F')
    d_x = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
    d_y = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
    d_x2 = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))

    # Temporary array variables
    d_t = cuda.device_array((m, batch), dtype=np.float32, order='F')
    d_t2 = cuda.device_array(n * batch, dtype=np.float32, order='F')

    b.gemm('T', 'N', n, n, m, 1, d_Phi, d_Phi, 0, d_Q)  # Q = Phi^T * Phi
    b.gemm('T', 'N', n, batch, m, -2, d_Phi, d_I, 0, d_c)  # c = -2*Phi^T * y

    blockdim = 32, 32
    griddim = int(math.ceil(float(n) / blockdim[0])), int(
        math.ceil(float(batch) / blockdim[1]))

    blockdim_1d = 256
    griddim_1d = int(math.ceil(float(n * batch) / blockdim_1d))

    start = l2l1obj(b, d_I, d_Phi, d_x, d_t, d_t2, lambdav, blockdim_1d,
                    griddim_1d)
    obj2 = start

    for i in xrange(max_iterations):

        # x2 = 2*Q*y + c
        b.symm('L', 'U', n, batch, 2, d_Q, d_y, 0, d_x2)
        b.geam('N', 'N', n, batch, 1, d_c, 1, d_x2, d_x2)

        # x2 = y - invL * x2
        b.geam('N', 'N', n, batch, 1, d_y, -invL, d_x2, d_x2)

        # proxOp()
        l1prox[griddim, blockdim](d_x2, invL * lambdav, d_x2)
        t2 = (1 + math.sqrt(1 + 4 * (t**2))) / 2.0

        # y = x2 + ((t-1)/t2)*(x2-x)
        b.geam('N', 'N', n, batch, 1 + (t - 1) / t2, d_x2, (1 - t) / t2, d_x,
               d_y)

        # x = x2
        b.geam('N', 'N', n, batch, 1, d_x2, 0, d_x, d_x)
        t = t2

        # update objective
        obj = obj2
        obj2 = l2l1obj(b, d_I, d_Phi, d_x2, d_t, d_t2, lambdav, blockdim_1d,
                       griddim_1d)

        if verbose:
            x2 = d_x2.copy_to_host()
            print "L1 Objective: " + str(obj2)

        if np.abs(obj - obj2) / float(obj) < tol:
            break

    x2 = d_x2.copy_to_host()

    if display:
        print "FISTA Iterations: " + str(i)
        print "L1 Objective: " + str(lambdav * np.sum(np.abs(x2)) +
                                     np.sum((I - Phi.dot(x2))**2))
        print "Objective delta: " + str(obj2 - start)

    return x2
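l2l1obj is not defined in this snippet. One plausible implementation of the objective lambdav*||x||_1 + ||I - Phi.x||_2**2 using only the cuBLAS handle already in scope (the unused block/grid arguments suggest the real version uses a custom kernel for part of this):

def l2l1obj(b, d_I, d_Phi, d_x, d_t, d_t2, lambdav, blockdim_1d, griddim_1d):
    (m, batch) = d_I.shape
    n = d_x.shape[0]
    b.geam('N', 'N', m, batch, 1., d_I, 0., d_t, d_t)        # t = I
    b.gemm('N', 'N', m, batch, n, -1., d_Phi, d_x, 1., d_t)  # t = I - Phi.x
    l2 = b.nrm2(d_t.reshape(m * batch)) ** 2
    l1 = b.asum(d_x.reshape(n * batch))
    return lambdav * l1 + l2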
Example No. 40
import numpy as np
from numbapro import cuda
from PIL import Image

@cuda.jit("void(float32[:], float32[:])", target="gpu")
def blur(input_img, blurred_img):
  index = cuda.grid(1)
  if(index >= input_img.shape[0]):
    return
  blurred_img[index] = 4.0


if __name__ == "__main__":
  # the kernel is compiled for float32, so allocate float32 buffers
  img = np.ones(100, dtype=np.float32)
  blurred_img = np.zeros(100, dtype=np.float32)
  d_input_img = cuda.to_device(img)
  d_blurred_img = cuda.device_array(img.shape[0], dtype=np.float32)

  threads_per_block = 256
  n_blocks = (img.shape[0] + threads_per_block-1) / threads_per_block

  for num in blurred_img:
    print num
  blur[n_blocks,threads_per_block](d_input_img, d_blurred_img)

  print("####################")

  d_blurred_img.copy_to_host(blurred_img)
  for num in blurred_img:
    print num
  print("Finished")
Example No. 41
def spca(Vd, epsilon=0.1, d=3, k=10):
    p = Vd.shape[0]
    initNumSamples = int((4. / epsilon) ** d)

    maxSize = 32000

    ##actual algorithm
    opt_x = np.zeros((p, 1))
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Prepare storage for vector A
        dA = cuda.device_array(shape=(Vd.shape[0], numSamples), order='F')
        dI = cuda.device_array(shape=(k, numSamples), dtype=np.int16, order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=np.float64)
        dC = cuda.device_array(shape=(d, numSamples), order='F')

        #GENERATE ALL RANDOM SAMPLES BEFORE
        # Also do normalization on the device
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)

        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)
        #C = dC.copy_to_host()

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel can be easily scaled due to the use of num of samples
        #       as the ncta
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominating the time
        batch_k_selection[numSamples, Vd.shape[0], custr](dA, dI, k)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        batch_scatter_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, dI,
                                                                     daInorm)

        aInorm = daInorm.copy_to_host(stream=custr)

        custr.synchronize()

        for i in xrange(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)

                # Only copy what we need
                a = gpu_slice(dA, i).reshape(p, 1)
                Ik = gpu_slice(dI, i).reshape(k, 1)
                aIk = a[Ik]
                opt_x[Ik] = (aIk / val)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
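gpu_slice is assumed to copy a single column of a Fortran-ordered device array back to the host, like the dI[:, i].copy_to_host() pattern used in the fuller variant of this function; a minimal sketch:

def gpu_slice(darr, i):
    # copy column i of a 2D (F-ordered) device array to the host
    return darr[:, i].copy_to_host()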
Example No. 42
def fista(I, Phi, lambdav, L=None, tol=10e-6, max_iterations=200, display=True, verbose=False):
	"""
	I: Images 
 	Phi: Dictionary 
 	lambdav: Sparse Penalty 
 	L = Largest eigenvalue of Phi 
 	"""

	b = numbapro.cudalib.cublas.Blas()
	(m, n) = Phi.shape
	(m, batch) = I.shape

	if L is None:
		L = scipy.sparse.linalg.svds(Phi, 1, which='LM', return_singular_vectors=False)
		print "Max eigenvalue: " + str(L)

	L = (L**2)*2 # L = svd(Phi) -> eig(2*(Phi.T*Phi))
	invL = 1/L
	t = 1.

	if sps.issparse(Phi):
		Phi = np.array(Phi.todense())

	d_I = cuda.to_device(np.array(I, dtype=np.float32, order='F'))
	d_Phi = cuda.to_device(np.array(Phi, dtype=np.float32, order='F'))
	d_Q = cuda.device_array((n, n), dtype=np.float32, order='F')
	d_c = cuda.device_array((n, batch), dtype=np.float32, order='F')
	d_x = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
	d_y = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
	d_x2 = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))

	# Temporary array variables
	d_t = cuda.device_array((m, batch), dtype=np.float32, order='F')
	d_t2 = cuda.device_array(n*batch, dtype=np.float32, order='F')

	b.gemm('T', 'N', n, n, m, 1, d_Phi, d_Phi, 0, d_Q) 	# Q = Phi^T * Phi
	b.gemm('T', 'N', n, batch, m, -2, d_Phi, d_I, 0, d_c) # c = -2*Phi^T * y

	blockdim = 32, 32
	griddim = int(math.ceil(float(n)/blockdim[0])), int(math.ceil(float(batch)/blockdim[1]))

	blockdim_1d = 256
	griddim_1d = int(math.ceil(float(n*batch)/blockdim_1d))

	start = l2l1obj(b, d_I, d_Phi, d_x, d_t, d_t2, lambdav, blockdim_1d, griddim_1d)
	obj2 = start

	for i in xrange(max_iterations):

		# x2 = 2*Q*y + c
		b.symm('L', 'U', n, batch, 2, d_Q, d_y, 0, d_x2)
		b.geam('N', 'N', n, batch, 1, d_c, 1, d_x2, d_x2)
		
		# x2 = y - invL * x2
		b.geam('N', 'N', n, batch, 1, d_y, -invL, d_x2, d_x2)

		# proxOp()						
		l1prox[griddim, blockdim](d_x2, invL*lambdav, d_x2)
		t2 = (1+math.sqrt(1+4*(t**2)))/2.0
		
		# y = x2 + ((t-1)/t2)*(x2-x)
		b.geam('N', 'N', n, batch, 1+(t-1)/t2, d_x2, (1-t)/t2, d_x, d_y)

		# x = x2
		b.geam('N', 'N', n, batch, 1, d_x2, 0, d_x, d_x)
		t = t2

		# update objective
		obj = obj2
		obj2 = l2l1obj(b, d_I, d_Phi, d_x2, d_t, d_t2, lambdav, blockdim_1d, griddim_1d)

		if verbose:
			x2 = d_x2.copy_to_host()
			print "L1 Objective: " + str(obj2)

		if np.abs(obj-obj2)/float(obj) < tol:
			break

	x2 = d_x2.copy_to_host()

	if display:
		print "FISTA Iterations: " + str(i)
		print "L1 Objective: " +  str(lambdav*np.sum(np.abs(x2)) + np.sum((I-Phi.dot(x2))**2))
		print "Objective delta: " + str(obj2-start)

	return x2
Example No. 43
def test_sort():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)  #4, 7, 2, 6, 3, 5, 1, 0
    #in_h = np.array([4, 7, 2, 6, 3, 5, 1, 0], dtype=np.uint32)
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    for i in range(0, NUM_ELEMENTS):
        in_h[i] = randint(0, 100)  #NUM_ELEMENTS - i - 1
    #in_h = np.array([6, 44, 71, 79, 94, 92, 12, 56, 47, 17, 81, 98, 84,  9, 85, 99], dtype=np.uint32)
    #in_h = np.array([85, 37, 50, 73, 51, 46, 62, 84, 65, 99, 76, 59, 73, 16, 27, 4, 75, 81, 80, 33, 73, 11, 29, 24, 81, 49, 27, 71, 74, 64, 60, 91], dtype=np.uint32)
    print in_h

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    tkg1 = time()

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(
        ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)

    RadixGPU[number_of_blocks, threads_per_block](in_d, out_d, NUM_ELEMENTS)
    out_d.copy_to_host(out_h)
    #print "Rad = ", list(out_h)

    stride = 4
    # while stride < NUM_ELEMENTS:
    #     number_of_blocks = (int(ceil(NUM_ELEMENTS / (stride * 1.0 * threads_per_block[0]))), 1)
    #     bitonicSort [number_of_blocks, threads_per_block] (out_d, NUM_ELEMENTS, stride)
    #     stride *= 2
    #     # number_of_blocks = (int(ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)
    #     # RadixGPU [number_of_blocks, threads_per_block] (out_d, in_d, NUM_ELEMENTS)
    #     # out_d = in_d
    #     out_d.copy_to_host(out_h)
    #     print "Str = ", list(out_h)
    #     break
    # # stride /= 2
    # while stride >= 4:
    #     number_of_blocks = (int(ceil(NUM_ELEMENTS / (stride * 1.0 * threads_per_block[0]))), 1)
    #     bitonicSort [number_of_blocks, threads_per_block] (out_d, NUM_ELEMENTS, stride)
    #     stride /= 2
    #     cuda.synchronize()
    #
    #     number_of_blocks = (int(ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)
    #     RadixGPU [number_of_blocks, threads_per_block] (out_d, in_d, NUM_ELEMENTS)
    #     out_d = in_d
    #
    # out_d.copy_to_host(out_h)
    # cuda.synchronize()
    #
    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    #
    # print line

    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()
    #print "GPU = ", list(out_h)
    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    #
    # print line

    in_cpu = list(in_h)  #[NUM_ELEMENTS - i -1 for i in range(0, NUM_ELEMENTS)]
    tc1 = time()
    in_cpu.sort()
    #print "CPU = ", in_cpu
    tc2 = time()

    print "GPU Time = ", tkg2 - tkg1
    print "CPU Time = ", tc2 - tc1
    print len(in_cpu)
Example No. 44
    def dot(self, a, b, out=None):
        """Takes the dot product of two 2D arrays or 1D vectors.

        Checks array type and shape. Should behave like numpy.dot(a, b).

        Parameters
        ----------
        a : array-like
            Numpy or DeviceNDArray
        b : array-like
            Numpy or DeviceNDArray
        out : DeviceNDArray (optional)
            Array will be filled with result if given.
        """

        b, out_dtype = _check_array(b)
        a, out_dtype = _check_array(a)

        if out is not None and not isinstance(out, cuda.cudadrv.devicearray.DeviceNDArray):
            raise NotImplementedError

        if b.dtype != np.float32:
            raise NotImplementedError

        a_dim = a.shape
        b_dim = b.shape

        if a.ndim == 2 and b.ndim == 2:
            if a_dim[1] != b_dim[0]:
                raise ValueError('matrices are not aligned')

            if out is None:
                out = cuda.device_array((a_dim[0], b_dim[1]), dtype=out_dtype, order='F')
            elif out.shape[0] == a_dim[0] and out.shape[1] == b_dim[1]:
                pass
            else:
                raise ValueError('matrices are not aligned')

            self.blas.gemm('N', 'N', a_dim[0], b_dim[1], a_dim[1], 1., a, b, 0., out)

        elif a.ndim == 2 and b.ndim == 1:
            if a_dim[1] != b_dim[0]:
                raise ValueError('matrices are not aligned')
            
            if out is None:
                out = cuda.device_array((a_dim[0]), dtype=out_dtype, order='F')
            elif out.shape[0] == a_dim[0]:
                pass
            else:
                raise ValueError('matrices are not aligned')

            self.blas.gemv('N', a_dim[0], a_dim[1], 1., a, b, 0., out)

        elif a.ndim == 1 and b.ndim == 2:
            if a_dim[0] != b_dim[0]:
                raise ValueError('matrices are not aligned')
            
            if out is None:
                out = cuda.device_array((b_dim[1]), dtype=out_dtype, order='F')
            elif out.shape[0] == b_dim[1]:
                pass
            else:
                raise ValueError('matrices are not aligned')

            self.blas.gemv('T', b_dim[0], b_dim[1], 1., b, a, 0., out)
        elif a.ndim == 1 and b.ndim == 1:
            if a_dim[0] != b_dim[0]:
                raise ValueError('matrices are not aligned')
            out = self.blas.dot(a,b)
        else:
            raise NotImplementedError

        return out
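A hypothetical usage of dot (`linalg` stands in for an instance of the owning class, which is not shown):

import numpy as np

a = np.asfortranarray(np.random.rand(8, 4).astype(np.float32))
v = np.random.rand(4).astype(np.float32)

d_out = linalg.dot(a, v)      # GEMV path; returns a device array
print d_out.copy_to_host()    # should match np.dot(a, v)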