Example #1
def f_second(i,pi,attribute_values,df):

    #-----------------CUDA
    #Cheetah Variables
    template = Template(f_source)
    template.DF = np.array(df[pi], dtype = np.int32)
    template.nrow = df[pi].shape[0]
    nfeat = df[pi].shape[1]
    template.ncol = nfeat
    template.ll = np.array(df[i], dtype=np.int32)

    phi_i_ = [attribute_values[item] for item in pi]
    combinations = np.array(list(itertools.product(*phi_i_)), dtype=np.int32)
    combinations = combinations.ravel()
    height = combinations.shape[0]
    template.H = height
    f_kernel = nvcc_compile(template, "my_f")

    ##Threads
    block_x = height // nfeat
    blocksize = (block_x,1,1)
    gridsize = (1,1)

    ##Kernel
    h_res = np.zeros(height // nfeat, dtype=np.float32)
    d_combinations = gpu.to_gpu(combinations)
    d_res = gpu.to_gpu(h_res)
    f_kernel(d_combinations,d_res,block=blocksize,grid=gridsize)
                
    ress = d_res.get()
    ress = list(ress)
 
    return sum(ress)
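The kernel's input here is just the flattened Cartesian product of the parents' attribute values; a small standalone illustration of that construction (the values are made up):

import itertools
import numpy as np

phi = [[0, 1], [0, 1, 2]]  # attribute values of two parent variables
combinations = np.array(list(itertools.product(*phi)), dtype=np.int32)
flat = combinations.ravel()  # height = combinations.size = 6 * 2
# combinations.shape == (6, 2): six joint configurations of the two parents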
Example #2
def NNMF_gpu(X,r,tol,V=v0,W=w0,verbose=1):
    Vr = V[:,0:r].copy()
    Wr = W[0:r,:].copy()
    X_gpu = gpuarray.to_gpu(X)
    V_gpu = gpuarray.to_gpu(Vr)
    W_gpu = gpuarray.to_gpu(Wr)
    #Frobenius norm (squared) at the previous step
    B_gpu = linalg.dot(V_gpu, W_gpu)
    L = linalg.norm(X_gpu-B_gpu)**2
    iteration = 0
    while 1: #update V
        V_gpu *= linalg.dot(X_gpu,linalg.transpose(W_gpu))
        V_gpu /= linalg.dot(B_gpu,linalg.transpose(W_gpu))
        B_gpu = linalg.dot(V_gpu, W_gpu)
        #update W
        W_gpu *= linalg.dot(linalg.transpose(V_gpu),X_gpu)
        W_gpu /= linalg.dot(linalg.transpose(V_gpu),B_gpu)
        B_gpu = linalg.dot(V_gpu, W_gpu)
        Lnew = linalg.norm(X_gpu-B_gpu)**2
        if abs(Lnew-L) <= tol*(L+1):
            break
        else:
            L = Lnew
            iteration += 1
            if(verbose and iteration%50==0):
                print "At iteration %i, the loss is %.2f" %(iteration, L)
    return V_gpu,W_gpu,iteration
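A minimal usage sketch for NNMF_gpu, assuming pycuda.autoinit is active, skcuda.linalg has been initialized, and v0/w0 are the module-level defaults; the sizes and tolerance are illustrative only:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from skcuda import linalg
linalg.init()

X = np.abs(np.random.rand(64, 48))   # nonnegative data matrix
v0 = np.abs(np.random.rand(64, 10))  # nonnegative initial factors
w0 = np.abs(np.random.rand(10, 48))
V_gpu, W_gpu, iters = NNMF_gpu(X, r=10, tol=1e-6, V=v0, W=w0, verbose=0)
approx = linalg.dot(V_gpu, W_gpu).get()  # V @ W approximates X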
Example #3
def inline_linear_interp(amps, phases, freqs, output, df, flow, imin, start_index):
    # Note that imin and start_index are ignored in the GPU code; they are only
    # needed for CPU.
    if output.precision == 'double':
        raise NotImplementedError("Double precision linear interpolation not currently supported on CUDA scheme")
    flow = numpy.float32(flow)
    texlen = numpy.int32(len(freqs))
    fmax = numpy.float32(freqs[texlen-1])
    hlen = numpy.int32(len(output))
    (fn1, fn2, ftex, atex, ptex, nt, nb) = get_dckernel(hlen)
    freqs_gpu = gpuarray.to_gpu(freqs)
    freqs_gpu.bind_to_texref_ext(ftex, allow_offset=False)
    amps_gpu = gpuarray.to_gpu(amps)
    amps_gpu.bind_to_texref_ext(atex, allow_offset=False)
    phases_gpu = gpuarray.to_gpu(phases)
    phases_gpu.bind_to_texref_ext(ptex, allow_offset=False)
    fn1 = fn1.prepared_call
    fn2 = fn2.prepared_call
    df = numpy.float32(df)
    g_out = output.data.gpudata
    lower = zeros(nb, dtype=numpy.int32).data.gpudata
    upper = zeros(nb, dtype=numpy.int32).data.gpudata
    fn1((1, 1), (nb, 1, 1), lower, upper, texlen, df, flow, fmax)
    fn2((nb, 1), (nt, 1, 1), g_out, df, hlen, flow, fmax, texlen, lower, upper)
    pycbc.scheme.mgr.state.context.synchronize()
    return output
Example #4
def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block,
                x.gpudata, y.gpudata, z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Example #5
    def test_cublasDgemmBatched(self):
        l, m, k, n = 11, 7, 5, 3
        A = np.random.rand(l, m, k).astype(np.float64)
        B = np.random.rand(l, k, n).astype(np.float64)

        C_res = np.einsum('nij,njk->nik',A,B)

        a_gpu = gpuarray.to_gpu(A)
        b_gpu = gpuarray.to_gpu(B)
        c_gpu = gpuarray.empty((l, m, n), np.float64)

        alpha = np.float64(1.0)
        beta = np.float64(0.0)

        a_arr = bptrs(a_gpu)
        b_arr = bptrs(b_gpu)
        c_arr = bptrs(c_gpu)

        cublas.cublasDgemmBatched(self.cublas_handle, 'n','n',
                                  n, m, k, alpha,
                                  b_arr.gpudata, n,
                                  a_arr.gpudata, k,
                                  beta, c_arr.gpudata, n, l)

        assert np.allclose(C_res, c_gpu.get())
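The bptrs helper used above builds an array of device pointers, one per matrix in the batch, which is what the batched cuBLAS routines expect. A common definition, similar to the one in scikit-cuda's test suite, looks like this (treat it as a sketch):

import numpy as np
from pycuda import gpuarray
from skcuda import cublas

def bptrs(a):
    # One device pointer per matrix along the leading (batch) dimension
    return gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0],
                           a.strides[0], dtype=cublas.ctypes.c_void_p)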
Example #6
 def compute_v_without_derivs(self, Xs, Yinvs, Ts):
     #Turn the parts of omega into gpuarrays
     Xs = np.require(Xs, dtype = np.double, requirements=['A', 'W', 'O', 'C'])
     Yinvs = np.require(Yinvs, dtype = np.double, requirements=['A', 'W', 'O', 'C'])
     Ts = np.require(Ts, dtype = np.double, requirements=['A', 'W', 'O', 'C'])
     Xs_d = gpuarray.to_gpu(Xs)
     Yinvs_d = gpuarray.to_gpu(Yinvs)
     Ts_d = gpuarray.to_gpu(Ts)
     #Determine N = the number of integer points to sum over
     #          K = the number of different omegas to compute the function at
     N = self.Sd.size // self.g
     K = Xs.size // (self.g**2)
     #Create room on the gpu for the real and imaginary finite sum calculations
     fsum_reald = gpuarray.zeros(N*K, dtype=np.double)
     fsum_imagd = gpuarray.zeros(N*K, dtype=np.double)
     #Turn all scalars into numpy data types
     Nd = np.int32(N)
     Kd = np.int32(K)
     gd = np.int32(self.g)
     blocksize = (self.tilewidth, self.tileheight, 1)
     gridsize = (N//self.tilewidth + 1, K//self.tileheight + 1, 1)
     self.finite_sum_without_derivs(fsum_reald, fsum_imagd, Xs_d, Yinvs_d, Ts_d,
                                    self.Sd, gd, Nd, Kd,
                                    block = blocksize,
                                    grid = gridsize)
     cuda.Context.synchronize()
     fsums_real = self.sum_reduction(fsum_reald, N, K, Kd, Nd)
     fsums_imag = self.sum_reduction(fsum_imagd, N, K, Kd, Nd)
     return fsums_real + 1.0j*fsums_imag
Example #7
def calculate_circuit_graph_vertex_data_device(d_D, d_C, length):
    logger = logging.getLogger('eulercuda.pyeulertour.calculate_circuit_graph_vertex_data_device')
    logger.info("started.")
    mod = SourceModule("""
    __global__ void calculateCircuitGraphVertexData( unsigned int * D,unsigned int * C,unsigned int ecount){

        unsigned int tid=(blockDim.x*blockDim.y * gridDim.x*blockIdx.y) + (blockDim.x*blockDim.y*blockIdx.x)+(blockDim.x*threadIdx.y)+threadIdx.x;
        if( tid <ecount)
        {
            unsigned int c=D[tid];
            atomicExch(C+c,1);
        }
    }
    """)
    calculate_circuit_graph_vertex_data = mod.get_function('calculateCircuitGraphVertexData')
    block_dim, grid_dim = getOptimalLaunchConfiguration(length, 512)
    logger.info('block_dim = %s, grid_dim = %s' % (block_dim, grid_dim))
    np_d_D = gpuarray.to_gpu(d_D)
    np_d_C = gpuarray.to_gpu(d_C)
    calculate_circuit_graph_vertex_data(
        np_d_D,
        np_d_C,
        np.uintc(length),
        block=block_dim, grid=grid_dim
    )
    np_d_D.get(d_D)
    np_d_C.get(d_C)
    # devdata = pycuda.tools.DeviceData()
    # orec = pycuda.tools.OccupancyRecord(devdata, block_dim[0] * grid_dim[1])
    # logger.info("Occupancy = %s" % (orec.occupancy * 100))
    logger.info("Finished. Leaving.")
    return d_D, d_C
Example #8
    def compare_sampling(self, hist, reps=10):
        nbins = hist.GetNbinsX()
        xaxis = hist.GetXaxis()
        intg = hist.GetIntegral()
        cdf_y = np.empty(nbins+1, dtype=float)
        cdf_x = np.empty_like(cdf_y)

        cdf_x[0] = xaxis.GetBinLowEdge(1)
        cdf_y[0] = 0.0
        for i in range(1, len(cdf_x)):
            cdf_y[i] = intg[i]
            cdf_x[i] = xaxis.GetBinUpEdge(i)

        cdf_x_gpu = gpuarray.to_gpu(cdf_x.astype(np.float32))
        cdf_y_gpu = gpuarray.to_gpu(cdf_y.astype(np.float32))
        block =(128,1,1)
        grid = (128, 1)
        out_gpu = gpuarray.empty(shape=int(block[0]*grid[0]), dtype=np.float32)

        out_h = ROOT.TH1D('out_h', '', hist.GetNbinsX(), 
                          xaxis.GetXmin(),
                          xaxis.GetXmax())
        out_h.SetLineColor(ROOT.kGreen)

        for i in range(reps):
            self.test_sample_cdf(np.int32(i),
                                 np.int32(len(cdf_x_gpu)), 
                                 cdf_x_gpu, cdf_y_gpu, out_gpu, block=block, grid=grid)
            out = out_gpu.get()
            for v in out:
                out_h.Fill(v)

        prob = out_h.KolmogorovTest(hist)
        return prob, out_h 
Example #9
 def cache_z(self, z):
     x = np.require(z.real, dtype = np.double, requirements = ['A','W','O','C'])
     y = np.require(z.imag, dtype = np.double, requirements = ['A','W','O','C'])
     xd = gpuarray.to_gpu(x)
     yd = gpuarray.to_gpu(y)
     cuda.memcpy_dtod(self.xd, xd.ptr, xd.nbytes)
     cuda.memcpy_dtod(self.yd, yd.ptr, yd.nbytes)
Example #10
def gpu_apply_row_max(X):
    """ 
    max(X) = y across the rows 

    returns the gpuarray, y
    """
    if type(X)==GPUArray:
        gX = X
    else:
        gX = to_gpu(np.asarray(X, dtype=np.float32))

    dims = np.asarray(X.shape, dtype=np.int32)

    gy = to_gpu(np.zeros(dims[0], dtype=np.float32))
    giy = to_gpu(np.zeros(dims[0], dtype=np.int32))

    if devinfo.max_block_threads >= 1024:
        blocksize = 32
    else:
        blocksize = 16

    gridsize = int(dims[0] / blocksize) + 1

    if gX.flags.c_contiguous:
        func = CUDA_Kernels.get_function("apply_rows_max")
        shared = 4*blocksize*(blocksize+2) # pad for bank conflicts
    else:
        func = CUDA_Kernels.get_function("apply_rows_max_cm")
        shared = 4*blocksize*(blocksize+1)

    func(gX, gy, giy, dims[0], dims[1], block=(blocksize, blocksize,1),
         grid = (gridsize,1), shared = shared)

    return gy, giy
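A hypothetical call, assuming a CUDA context is active and the module-level CUDA_Kernels and devinfo objects used above have been set up:

import numpy as np

X = np.random.rand(1000, 64).astype(np.float32)
gy, giy = gpu_apply_row_max(X)
row_max = gy.get()      # per-row maxima, shape (1000,)
row_argmax = giy.get()  # per-row argmax indices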
Example #11
def gpu_sweep_col_mult(X, y):
    """ X * y = X across the columns """
    if type(X)==GPUArray:
        gX = X
    else:
        gX = to_gpu(np.asarray(X, dtype=np.float32))

    if type(y)==GPUArray:
        gy = y
    else:
        gy = to_gpu(np.asarray(y, dtype=np.float32))

    dims = np.asarray(X.shape, dtype=np.int32)
    if devinfo.max_block_threads >= 1024:
        blocksize = 32
    else:
        blocksize = 16

    gridsize = int(dims[0] / blocksize) + 1
    shared = 4*blocksize

    if gX.flags.c_contiguous:
        func = CUDA_Kernels.get_function("sweep_columns_mult")
    else:
        func = CUDA_Kernels.get_function("sweep_columns_mult_cm")

    func(gX, gy, dims[0], dims[1], block=(blocksize, blocksize,1),
         grid = (gridsize,1), shared = shared)

    # Copy the result back to the host when X was not already on the GPU
    if type(X) != GPUArray:
        return gX.get()
    return gX
Example #12
  def __init__(self, name, type, epsW, epsB, initW, initB, momW, momB, wc, weight, bias,
      weightIncr , biasIncr, disableBprop = False):
    Layer.__init__(self, name, type, disableBprop)

    self.epsW = F(epsW)
    self.epsB = F(epsB)
    self.initW = initW
    self.initB = initB
    self.momW = F(momW)
    self.momB = F(momB)
    self.wc = F(wc)

    if weight is not None:
      self.weight = gpuarray.to_gpu(weight)#.astype(np.float32)
    else:
      self.weight = None

    if bias is not None:
      self.bias = gpuarray.to_gpu(bias).astype(np.float32)
    else:
      self.bias = None

    if self.momW > 0.0:
      if weightIncr is not None:
        self.weightIncr = gpuarray.to_gpu(weightIncr)
      else:
        self.weightIncr = None
      
      if biasIncr is not None:
        self.biasIncr = gpuarray.to_gpu(biasIncr)
      else:
        self.biasIncr = None
Example #13
def gpu_sweep_row_div(X, y):
    """ X / y = X down the rows """
    if type(X)==GPUArray:
        gX = X
    else:
        gX = to_gpu(np.asarray(X, dtype=np.float32))

    if type(y)==GPUArray:
        gy = y
    else:
        gy = to_gpu(np.asarray(y, dtype=np.float32))

    dims = np.asarray(X.shape, dtype=np.int32)
    if devinfo.max_block_threads >= 1024:
        blocksize = 32
    else:
        blocksize = 16

    gridsize = int(dims[0] / blocksize) + 1
    shared = int(4*dims[1])

    if gX.flags.c_contiguous:
        func = CUDA_Kernels.get_function("sweep_rows_div")
    else:
        func = CUDA_Kernels.get_function("sweep_rows_div_cm")

    func(gX, gy, dims[0], dims[1], block=(blocksize, blocksize,1),
         grid = (gridsize,1), shared = shared)

    # Copy the result back to the host when X was not already on the GPU
    if type(X) != GPUArray:
        return gX.get()
    return gX
Example #14
 def _pre_run(self):
     assert(self.LPU_obj)
     assert(all([var in self.memory_manager.variables
                 for var in self.variables.keys()]))
     for var, d in self.variables.items():
         v_dict =  self.memory_manager.variables[var]
         if not d['uids']:
             uids = list(v_dict['uids'].keys())
             inds = list(v_dict['uids'].values())
             o = np.argsort(inds)
             d['uids'] = [uids[i] for i in o]
             self.src_inds[var] = garray.to_gpu(np.arange(len(d['uids'])))
         else:
             uids = []
             inds = []
             for uid in d['uids']:
                 try:
                     inds.append(v_dict['uids'][uid])
                     uids.append(uid)
                 except KeyError:
                     pass
             inds = np.array(inds,np.int32)
             o = np.argsort(inds)
             self.src_inds[var] = garray.to_gpu(inds[o])
             d['uids'] = [uids[i] for i in o]
         self._d_output[var] = garray.empty(len(d['uids']),
                                            v_dict['buffer'].dtype)
         d['output']=np.zeros(len(d['uids']), v_dict['buffer'].dtype)
     self.pre_run()
Example #15
  def _init_weights(self, weight_shape, bias_shape):
    if self.weight is None:
      if self.name == 'noise':
        assert(weight_shape[0] == weight_shape[1])
        self.weight = gpuarray.to_gpu(np.eye(weight_shape[0], dtype = np.float32))
      else:
        self.weight = gpuarray.to_gpu(randn(weight_shape, np.float32) * self.initW)

    if self.bias is None:
      if self.initB > 0.0:
        self.bias = gpuarray.to_gpu((np.ones(bias_shape, dtype=np.float32) * self.initB))
      else:
        self.bias = gpuarray.zeros(bias_shape, dtype=np.float32)

    Assert.eq(self.weight.shape, weight_shape) 
    Assert.eq(self.bias.shape, bias_shape) 
    
    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)
    
    if self.momW > 0.0:
      if self.weightIncr is None:
        self.weightIncr = gpuarray.zeros_like(self.weight)
      if self.biasIncr is None:
        self.biasIncr = gpuarray.zeros_like(self.bias)
      
      Assert.eq(self.weightIncr.shape, weight_shape) 
      Assert.eq(self.biasIncr.shape, bias_shape)
Example #16
 def compute_v_without_derivs(self, Z):
     #Turn the numpy set Z into gpuarrays
     x = Z.real
     y = Z.imag
     x = np.require(x, dtype = np.double, requirements=['A','W','O','C'])
     y = np.require(y, dtype = np.double, requirements=['A','W','O','C'])
     xd = gpuarray.to_gpu(x)
     yd = gpuarray.to_gpu(y)
     self.yd = yd
     #Determine N = the number of integer points to sum over and
     #          K = the number of values to compute the function at
     N = self.Sd.size // self.g
     K = Z.size // self.g
     #Create room on the gpu for the real and imaginary finite sum calculations
     fsum_reald = gpuarray.zeros(N*K, dtype=np.double)
     fsum_imagd = gpuarray.zeros(N*K, dtype=np.double)
     #Make all scalars into numpy data types
     Nd = np.int32(N)
     Kd = np.int32(K)
     gd = np.int32(self.g)
     blocksize = (self.tilewidth, self.tileheight, 1)
     gridsize = (N//self.tilewidth + 1, K//self.tileheight + 1, 1)
     self.finite_sum_without_derivs(fsum_reald, fsum_imagd, xd, yd, 
                  self.Sd, gd, Nd, Kd,
                  block = blocksize,
                  grid = gridsize)
     cuda.Context.synchronize()
     fsums_real = self.sum_reduction(fsum_reald, N, K, Kd, Nd)
     fsums_imag = self.sum_reduction(fsum_imagd, N, K, Kd, Nd)
     return fsums_real + 1.0j*fsums_imag
Example #17
    def test_neural_net_regression(self):
        for _ in range(20):
            N = 10000    # Number of data points
            D = 100      # Dimensionality of exogenous data
            P = 50       # Dimensionality of endogenous data

            W_true = 10 * np.random.rand(D, P) - 5
            b_true = 100 * np.random.rand(P) - 50

            X = np.random.randn(N, D)
            Y = np.dot(X, W_true) + b_true[np.newaxis, :] + np.random.randn(N, P)        

            W_lstsq = np.linalg.lstsq(np.c_[np.ones((N, 1)), X], Y)[0]
            b_lstsq = W_lstsq[0]
            W_lstsq = W_lstsq[1:]

            data_provider = BatchDataProvider(gpuarray.to_gpu(X.astype(np.float32),
                                                              allocator=memory_pool.allocate),
                                              gpuarray.to_gpu(Y.astype(np.float32),
                                                              allocator=memory_pool.allocate))

            model = NeuralNetRegression([], n_in=D, n_out=P)
            optimizer = SGD(model, SimpleSGDUpdate, 
                            data_provider, data_provider,
                            learning_rate_schedule=constant_scheduler(10.),
                            early_stopping=True)
            optimizer.run(100)

            self.assertLess(np.abs(W_lstsq - model.top_layer.W.get()).max(),
                            1e-5)
Example #18
  def project(self, fs_gpu, lamda=1e2, eta=0., alpha = 2/3,
              sparsity=None, maxfun=20, mode=None, show=False):
    """
    Estimates weighting vector (w s.t. w>=0) by minimizing
    min |B*w - f|^2 + lamda * |w|^0.66 + eta * |grad(w)|^2
    """

    if fs_gpu.__class__ != np.ndarray:
      fs = fs_gpu.get()
    else:
      fs = fs_gpu

    # Naive projection
    wproj = self.basis_host.dot(fs.reshape((fs.size,1))).astype(np.float32)
    #wproj /= float(self._intern_shape[1])
    wproj = np.reshape(wproj, self.shape)
    
    if sparsity:
      wproj = imagetools.sparsify(wproj, sparsity)

    if mode is None:
      return cua.to_gpu(wproj)

    elif mode == 'HPQ_L1':
      if fs_gpu.__class__ == np.ndarray:
        self.data_gpu = cua.to_gpu(fs_gpu)
      else:
        self.data_gpu = fs_gpu
Example #19
    def _init(self, pn, qn, wf_qn):
        super()._init(pn, qn, wf_qn)

        self._wf_q_grid_gpu = gpuarray.to_gpu(N.ascontiguousarray(self._wf_q_grid))
        self._wf_gpu = gpuarray.to_gpu(N.ascontiguousarray(self._wf))

        mod = SourceModule("""
            __global__ void transform(double *ps, double *qs, double *wf_q_grid, double *wf, double *out_real, double *out_imag) {{
                int idx_x = threadIdx.x + blockIdx.x * blockDim.x;
                int idx_y = threadIdx.y + blockIdx.y * blockDim.y;
                int idx = idx_x + idx_y * {qn};
                double qdiff, prefactor, s, c;

                if (idx_x >= {qn} || idx_y >= {pn})
                    return;

                for (int j = 0; j < {wf_qn}; j++) {{
                    qdiff = wf_q_grid[j] - qs[idx];
                    prefactor = exp({g} * qdiff * qdiff) * wf[j];
                    sincos({h} * ps[idx] * qdiff, &s, &c);

                    out_real[idx] += prefactor * c;
                    out_imag[idx] += prefactor * s;
                }}
            }}
        """.format(g=-0.5*self._gamma, h=1./HBAR, pn=pn, qn=qn, wf_qn=wf_qn))
        self._kernel = mod.get_function('transform')
        self._kernel.prepare('PPPPPP')

        self._gpu_grid, self._gpu_block = carve_array(qn, pn)
Example #20
    def add_batch(self, X, T, wc=None):
        """Add a batch of training data to an iterative solution, weighted if neeed.

        The batch is processed as a whole, the training data is splitted in `ELM.add_data()` method.
        With parameters HH_out, HT_out, the output will be put into these matrices instead of model.

        Args:
            X (matrix): input data matrix size (N * `inputs`)
            T (matrix): output data matrix size (N * `outputs`)
            wc (vector): vector of weights for data samples, one weight per sample, size (N * 1)
            HH_out, HT_out (matrix, optional): output matrices to add batch result into, always given together
        """
        devH = self._project(X, dev=True)
        T = np.array(T, order="C", dtype=self.precision)
        devT = gpuarray.to_gpu(T)
        if wc is not None:  # apply weights if given
            w = np.array(wc**0.5, dtype=self.precision)[:, None]  # re-shape to column matrix
            devWC = gpuarray.to_gpu(w)
            misc.mult_matvec(devH, devWC, axis=0, out=devH)
            misc.mult_matvec(devT, devWC, axis=0, out=devT)

        if self.HH is None:  # initialize space for self.HH, self.HT
            self.HT = misc.zeros((self.L, self.outputs), dtype=self.precision)
            self.HH = linalg.eye(self.L, dtype=self.precision)
            self.HH *= self.norm

        linalg.add_dot(devH, devT, self.HT, transa='T')
        if self.precision is np.float64:
            linalg.add_dot(devH, devH, self.HH, transa='T')
        else:
            cublas.cublasSsyrk(self.handle, 'L', 'N', self.L, X.shape[0], 1, devH.ptr, self.L, 1, self.HH.ptr, self.L)
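For reference, a NumPy sketch of what this batch update accumulates (my reading of the GPU calls above, stated as an assumption): HH gathers H^T H and HT gathers H^T T, with optional per-sample weights folded in as sqrt(wc):

import numpy as np

H = np.random.rand(100, 20)    # projected batch, N x L
T = np.random.rand(100, 3)     # targets, N x outputs
wc = np.random.rand(100)       # per-sample weights
Hw = H * np.sqrt(wc)[:, None]  # weight each row by sqrt(wc)
Tw = T * np.sqrt(wc)[:, None]
HH = Hw.T @ Hw                 # accumulated across batches on the GPU
HT = Hw.T @ Tw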
Example #21
 def parameters(self, value):
     """Update the parameters. ``value`` must have the shape
     ``(weights, biases)``"""
     self.W = value[0] if isinstance(value[0], GPUArray) else \
       gpuarray.to_gpu(value[0])
     self.b = value[1] if isinstance(value[1], GPUArray) else \
       gpuarray.to_gpu(value[1])
Example #22
    def generate(self, width, height, real_axis_range, imag_axis_range, tasks):
        if not is_gpu_accelerated():
            self._logger.error(
                'No GPU acceleration is available, please use CPU.')
            return

        iterations = np.empty(width * height, np.int32)
        iterations_gpu = gpuarray.to_gpu(iterations)

        z_values = np.empty(width * height, np.float32)
        z_values_gpu = gpuarray.to_gpu(z_values)

        cmin = complex(real_axis_range[0], imag_axis_range[0])
        cmax = complex(real_axis_range[1], imag_axis_range[1])
        dc = cmax - cmin

        dx, mx = divmod(width, self._block_size[0])
        dy, my = divmod(height, self._block_size[1])
        grid_size = ((dx + (mx > 0)), (dy + (my > 0)))

        self._get_pixel_iterations(
            iterations_gpu, z_values_gpu,
            np.int32(width), np.int32(height),
            np.complex64(cmin), np.complex64(dc),
            block=self._block_size, grid=grid_size)

        return (iterations_gpu, z_values_gpu, abs(dc))
Example #23
def cuda_dot3(A, b):
    print("cuda_dot3", A.shape, b.shape)
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    # transpose b on GPU
    bt_gpu = linalg.transpose(b_gpu)
    #remove b for now
    b_gpu.gpudata.free()
    del(b_gpu)
    # send A to GPU    
    A_gpu = gpuarray.to_gpu(A)
    
    temp_gpu = linalg.dot(bt_gpu, A_gpu)
    
    bt_gpu.gpudata.free()
    del(bt_gpu)
    A_gpu.gpudata.free()
    del(A_gpu)
    
    # send b to GPU
    b_gpu = gpuarray.to_gpu(b)
    
    c_gpu = linalg.dot(temp_gpu, b_gpu)
    
    temp_gpu.gpudata.free()
    del(temp_gpu)
    b_gpu.gpudata.free()
    del(b_gpu)
        
    #theoretically possible to move into RAM, force cleanup on GPU and then return from RAM
    #but most likely not necessary
    return c_gpu.get()
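Taken together, the staged products compute b.T @ A @ b; a hypothetical check, assuming pycuda.autoinit and an initialized skcuda.linalg:

import numpy as np

A = np.random.rand(256, 256)
b = np.random.rand(256, 16)
C = cuda_dot3(A, b)
assert np.allclose(C, b.T @ A @ b)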
Example #24
    def set_by_inds(self, inds, data):
        """
        Set mapped data by integer indices.

        Parameters
        ----------
        inds : sequence of int
            Integer indices of data elements to update.
        data : numpy.ndarray
            Data to assign.
        """

        assert len(np.shape(inds)) == 1
        assert issubclass(inds.dtype.type, numbers.Integral)
        N = len(inds)
        assert N == len(data)

        if not isinstance(inds, gpuarray.GPUArray):
            inds = gpuarray.to_gpu(inds)
        if not isinstance(data, gpuarray.GPUArray):
            data = gpuarray.to_gpu(data)

        # Allocate data array if it doesn't exist:
        if self.data is None:
            self.data = gpuarray.empty(N, data.dtype)
        else:
            assert self.data.dtype == data.dtype
        try:
            func = self.set_by_inds.cache[inds.dtype]
        except KeyError:
            inds_ctype = tools.dtype_to_ctype(inds.dtype)
            v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(data_ctype=self.data_ctype, inds_ctype=inds_ctype)        
            func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]")
            self.set_by_inds.cache[inds.dtype] = func
        func(self.data, inds, data, range=slice(0, N, 1))
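The cached kernel above is an elementwise scatter; a minimal standalone sketch of the same pattern (float data and int32 indices assumed), using the same range=slice launch as the method:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel

scatter = ElementwiseKernel(
    "float *dest, int *inds, float *src",
    "dest[inds[i]] = src[i]")

dest = gpuarray.zeros(8, np.float32)
inds = gpuarray.to_gpu(np.array([1, 3, 5], dtype=np.int32))
src = gpuarray.to_gpu(np.array([10., 20., 30.], dtype=np.float32))
scatter(dest, inds, src, range=slice(0, 3, 1))  # dest becomes [0,10,0,20,0,30,0,0]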
Example #25
def main():
    import numpy as np
    import pycuda.autoinit
    from pycuda import gpuarray
    from skdata import toy
    from hebel import memory_pool
    from hebel.data_providers import BatchDataProvider
    from hebel.models import NeuralNetRegression
    from hebel.optimizers import SGD
    from hebel.parameter_updaters import SimpleSGDUpdate
    from hebel.monitors import SimpleProgressMonitor
    from hebel.schedulers import exponential_scheduler

    # Get data
    data_cpu, targets_cpu = toy.Boston().regression_task()
    data = gpuarray.to_gpu(data_cpu.astype(np.float32), allocator=memory_pool.allocate)
    targets = gpuarray.to_gpu(targets_cpu.astype(np.float32), allocator=memory_pool.allocate)
    data_provider = BatchDataProvider(data, targets)

    # Create model object
    model = NeuralNetRegression(n_in=data_cpu.shape[1], n_out=targets_cpu.shape[1],
                                layers=[100], activation_function='relu')
    
    # Create optimizer object
    optimizer = SGD(model, SimpleSGDUpdate, data_provider, data_provider,
                    learning_rate_schedule=exponential_scheduler(.1, .9999),
                    early_stopping=True)
    optimizer.run(3000)
Example #26
    def _initialize_gpu_ds(self):
        """
        Setup GPU arrays.
        """

        self.synapse_state = garray.zeros(int(self.total_synapses) + \
                                    len(self.input_neuron_list), np.float64)
        if self.my_num_gpot_neurons>0:
            self.V = garray.zeros(int(self.my_num_gpot_neurons), np.float64)
        else:
            self.V = None

        if self.my_num_spike_neurons>0:
            self.spike_state = garray.zeros(int(self.my_num_spike_neurons), np.int32)

        if len(self.public_gpot_list)>0:
            self.public_gpot_list_g = garray.to_gpu(self.public_gpot_list)
            self.projection_gpot = garray.zeros(len(self.public_gpot_list), np.double)
            self._extract_gpot = self._extract_projection_gpot_func()

        if len(self.public_spike_list)>0:
            self.public_spike_list_g = garray.to_gpu( \
                (self.public_spike_list-self.spike_shift).astype(np.int32))
            self.projection_spike = garray.zeros(len(self.public_spike_list), np.int32)
            self._extract_spike = self._extract_projection_spike_func()
Example #27
def cuda_ageSols(sols):
    """ ages the solutions by incrementing their age gene """

    #get num sols
    num_sols = len(sols)

    #convert to numpy arrays
    sols_arr = numpy.array(sols, numpy.float32)
    ones_arr = numpy.zeros_like(sols_arr)
    ones_arr[:, constants.AGE_GENE] = 1

    #copy each to gpu
    sols_gpu = gpuarray.to_gpu(sols_arr)
    mask_gpu = gpuarray.to_gpu(ones_arr)

    #debug
    if debug:
        print(mask_gpu.view())

    #apply mask
    aged_sols_gpu = sols_gpu + mask_gpu

    sols = aged_sols_gpu.get().tolist()
    return sols
Example #28
 def test_set_by_inds_from_inds(self):
     dest_gpu = gpuarray.to_gpu(np.zeros(5, dtype=np.float32))
     ind_dest = gpuarray.to_gpu(np.array([0, 2, 4]))
     src_gpu =  gpuarray.to_gpu(np.arange(5, 10, dtype=np.float32))
     ind_src =  gpuarray.to_gpu(np.array([2, 3, 4]))
     gpu.set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src)
     assert np.allclose(dest_gpu.get(), np.array([7, 0, 8, 0, 9], dtype=np.float32))
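From the assertion, set_by_inds_from_inds evidently implements dest[ind_dest[i]] = src[ind_src[i]]; an assumed elementwise sketch of that gather/scatter (the real gpu module may differ):

from pycuda.elementwise import ElementwiseKernel

set_from_inds = ElementwiseKernel(
    "float *dest, long *ind_dest, long *ind_src, float *src",
    "dest[ind_dest[i]] = src[ind_src[i]]")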
Example #29
def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10,26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            lc_kernel.prepared_call(x._grid, x._block,
                a.gpudata, x.gpudata,
                b.gpudata, y.gpudata,
                z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Example #30
    def add_neurons(self, number, func, W, B):
        """Add prepared neurons to the SLFN, merge with existing ones.

        Adds a number of specific neurons to SLFN network. Weights and biases
        must be provided for that function.

        If neurons of such type already exist, they are merged together.

        Args:
            number (int): the number of new neurons to add
            func (str): transformation function of hidden layer. Linear function creates a linear model.
            W (matrix): a 2-D matrix of neuron weights, size (`inputs` * `number`)
            B (vector): a 1-D vector of neuron biases, size (`number` * 1)
        """
        ntypes = [nr[1] for nr in self.neurons]  # existing types of neurons
        if func in ntypes:
            # add to an existing neuron type
            i = ntypes.index(func)
            nn0, _, devW, devB = self.neurons[i]
            number = nn0 + number
            devW = gpuarray.to_gpu(np.hstack((devW.get(), W)))
            devB = gpuarray.to_gpu(np.hstack((devB.get(), B)))
            self.neurons[i] = (number, func, devW, devB)
        else:
            # create a new neuron type
            devW = gpuarray.to_gpu(W)
            devB = gpuarray.to_gpu(B)
            self.neurons.append((number, func, devW, devB))
        self.reset()
        self.B = None
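A hypothetical call, assuming an SLFN-style object net with 10 inputs and an active CUDA context (the function name 'sigm' is an assumption):

import numpy as np

W = np.random.randn(10, 32)  # `inputs` x `number` weight matrix
B = np.random.randn(32)      # one bias per neuron
net.add_neurons(32, 'sigm', W, B)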
Example #31
 def copy_to_GPU(self):
     self.data_on_GPU = gpuarray.to_gpu(self.data.astype(np.float32))
Example #32
def dpLSA(lamda, theta, p, X, denominators, shape_image, N, M, K, iters, r1,
          r2):
    lamda_gpu = to_gpu(lamda)
    theta_gpu = to_gpu(theta)
    p_gpu = to_gpu(p)
    X_gpu = to_gpu(X)
    den_gpu = to_gpu(denominators)
    steps = int(ceil(N / 1024.0))
    stepsP = int(ceil(K / 50.0))

    EStep = kernels.get_function("EStepDPLSA")
    LamdaComputing = kernels.get_function("Lamda")
    ThetaComputing = kernels.get_function("Theta1")
    CalculeDiv = kernels.get_function("Theta2")
    ThetaDivision = kernels.get_function("Theta3")

    pycuda.driver.Context.synchronize()
    inicio = time.time()

    for i in range(iters):
        EStep(p_gpu,
              theta_gpu,
              lamda_gpu,
              np.uint32(stepsP),
              np.uint32(K),
              block=(1, 1, 50),
              grid=(N, M, 1))
        ThetaComputing(theta_gpu,
                       X_gpu,
                       p_gpu,
                       np.float32(r1),
                       np.uint32(steps),
                       np.uint32(N),
                       np.uint32(M),
                       den_gpu,
                       block=(1024, 1, 1),
                       grid=(1, M, K))
        CalculeDiv(theta_gpu, den_gpu, block=(1, M, 1), grid=(1, 1, K))
        ThetaDivision(theta_gpu, den_gpu, block=(1, 1, 1), grid=(1, M, K))
        LamdaComputing(lamda_gpu,
                       X_gpu,
                       p_gpu,
                       np.uint32(K),
                       np.float32(r2),
                       block=(1, M, 1),
                       grid=(N, 1, K))

    pycuda.driver.Context.synchronize()

    endTime = time.time() - inicio

    abundances = lamda_gpu.get().reshape(shape_image[0], shape_image[1], K)
    endmembers = theta_gpu.get().T

    lamda_gpu.gpudata.free()
    theta_gpu.gpudata.free()
    X_gpu.gpudata.free()
    p_gpu.gpudata.free()
    den_gpu.gpudata.free()

    return (abundances, endmembers, endTime)
Example #33
 def create_from_numpy(self, arr):
     return gpuarray.to_gpu(arr.astype(self.dtype))
Example #34
 def get_seeds(n):
     return gpuarray.to_gpu(np.ones(n, np.int32) * seed)
Example #35
from time import time
from pycuda.compiler import SourceModule


#%%
def set_module():
    # Index with the global thread id so that every element is covered when
    # the launch uses more than one block; indexing with threadIdx.x alone
    # only writes the first blockDim.x elements.
    return SourceModule(
        "__global__ void scalar_multiply_kernel(int *outvect, int scalar, int *vec){int j = threadIdx.x + blockIdx.x * blockDim.x; outvect[j] = scalar*vec[j];}"
    )


#%%
ker = set_module()
smg = ker.get_function("scalar_multiply_kernel")
t_vec = np.random.randint(low=0, high=3, size=1500).astype(np.int32)
t_vec_gpu = gpuarray.to_gpu(t_vec)
o_vec = gpuarray.empty_like(t_vec_gpu)
smg(o_vec,
    np.int32(2),
    t_vec_gpu,
    block=(150, 1, 1),
    grid=(int(1500 / 150), 1, 1))
print(np.allclose(o_vec.get(), 2 * t_vec))

#%%
print(o_vec.get())

print(2 * t_vec)

#%%
Example #36
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)
h_c = np.empty_like(h_a)

mod = SourceModule("""
__global__ void deviceElementwise(float * __restrict__ d_c, const float * __restrict__ d_a, 
                                                    const float * __restrict__ d_b,
                                                    const int N)
{
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid >= N) return;
  d_c[tid] = sqrt(fabs(d_a[tid])) + exp(d_b[tid]);
}
""")

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)
d_c = gpuarray.zeros_like(d_a)

# --- Define a reference to the __global__ function and call it
deviceElementwise = mod.get_function("deviceElementwise")
blockDim = (BLOCKSIZE, 1, 1)
gridDim = (iDivUp(N, BLOCKSIZE), 1, 1)
start.record()
deviceElementwise(d_c, d_a, d_b, np.int32(N), block=blockDim, grid=gridDim)
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Processing time = %fs" % (secs))

h_c = d_c.get()
Example #37
            bestcost = c;
            viable = via1;
        }
    }

    V[di] = bestcost;
}
""" % (STEER_LIMIT_K, NANGLES, NSPEEDS, GRID_RES, xsize, ysize, VMIN, VMAX,
       AxMAX, AyMAX, TIMESTEP, homex, homey))

valueiter = mod.get_function("valueiter")
V0_gpu = cuda.mem_alloc(NSPEEDS * NANGLES * xsize * ysize * 4)
V = np.zeros((NSPEEDS, NANGLES, ysize, xsize), np.float32) + 1000.
cuda.memcpy_htod(V0_gpu, V)

ye_in = gpuarray.to_gpu(ye)
del ye

s = trange(20)
v0 = np.sum(V, dtype=np.float64)
for j in s:
    for i in range(20):
        valueiter(V0_gpu,
                  V0_gpu,
                  ye_in,
                  block=(16, 8, 1),
                  grid=(xsize // 16, ysize // 8, NANGLES * NSPEEDS))
    cuda.memcpy_dtoh(V, V0_gpu)
    v1 = np.sum(V, dtype=np.float64)
    dv = v1 - v0
    v0 = v1
Example #38
    def dwt2d(self,
              input_array,
              depth=1,
              gpu_output=False,
              gpu_allocator=cuda.mem_alloc):
        """
        dwt2d

        Perform a 2D forward discrete wavelet transform.
        If an array has more than 2 dimensions, this function will perform a batched transform.

        Parameters
        ----------
        input_array: Numpy array or GPUArray object
            Input array of data to be transformed.

            If the array is 2D, this function will perform a 2D forward DWT on the data.

            If the array has more than 2 dimensions, this function will perform a batched 2D forward DWT
            for the last 2 dimensions of the array.
        depth: int, optional
            Depth level of transform; must be greater than or equal to 0 (1 is the default)
        gpu_output: bool, optional
            If True, then return the coefficients as a list of GPUArray objects. (False is the default)
        gpu_allocator: callable, optional
            Allocator used by GPUArray. (pycuda.driver.mem_alloc is the default)

        Returns
        -------
        list of Numpy arrays or GPUArray objects
            A list of numpy arrays of the DWT coefficients either in float32 or float64,
            depending on whether use_float32 was set to True or False for PycudaWaveletTransform.
            The coefficients are organized as
            [c_lo_lo, (c_lo_hi_depth, c_hi_lo_depth, c_hi_hi_depth), ... , (c_lo_hi_1, c_hi_lo_1, c_hi_hi_1)]
        """
        if not isinstance(input_array, numpy.ndarray) and not isinstance(
                input_array, gpuarray.GPUArray):
            raise TypeError('invalid input array type')

        if depth < 0:
            raise RuntimeError('invalid depth value')
        elif depth == 0:
            return [input_array]

        in_shape = input_array.shape
        if len(in_shape) < 2:
            raise RuntimeError('input array must have 2 or more dimensions')

        num_cols = int(in_shape[-1])
        num_rows = int(in_shape[-2])
        num_slices = int(1)
        if len(in_shape) > 2:
            for i in in_shape[:-2]:
                num_slices *= i

        # Get size of output arrays
        out_cols = [
            int((num_cols + 1) / 2 if num_cols % 2 else (num_cols / 2))
        ]
        out_rows = [
            int((num_rows + 1) / 2 if num_rows % 2 else (num_rows / 2))
        ]
        for i in range(depth - 1):
            pc = out_cols[-1]
            nc = int((pc + 1) / 2 if pc % 2 else (pc / 2))
            out_cols.append(int(nc))
            pr = out_rows[-1]
            nr = int((pr + 1) / 2 if pr % 2 else (pr / 2))
            out_rows.append(int(nr))

        # If input is in Numpy array, then copy it to a GPUArray object.
        if isinstance(input_array, numpy.ndarray):
            cont_input_array = numpy.ascontiguousarray(input_array,
                                                       dtype=self._dtype)
            input_device_array = gpuarray.to_gpu(cont_input_array)
        else:
            if input_array.dtype != self._dtype:
                input_device_array = input_array.astype(self._dtype)
            else:
                input_device_array = input_array
            if not input_device_array.flags.c_contiguous:
                input_device_array = input_device_array.reshape(in_shape,
                                                                order='C')

        # Allocate device arrays
        row_approx_device_array = gpuarray.GPUArray(
            [num_slices, num_rows, out_cols[0]],
            dtype=self._dtype,
            allocator=gpu_allocator)
        row_detail_device_array = gpuarray.GPUArray(
            [num_slices, num_rows, out_cols[0]],
            dtype=self._dtype,
            allocator=gpu_allocator)
        output_device_arrays = []
        for r, c in zip(out_rows, out_cols):
            output_device_arrays.append(
                dict(ll=gpuarray.GPUArray([num_slices, r, c],
                                          dtype=self._dtype,
                                          allocator=gpu_allocator),
                     hl=gpuarray.GPUArray([num_slices, r, c],
                                          dtype=self._dtype,
                                          allocator=gpu_allocator),
                     lh=gpuarray.GPUArray([num_slices, r, c],
                                          dtype=self._dtype,
                                          allocator=gpu_allocator),
                     hh=gpuarray.GPUArray([num_slices, r, c],
                                          dtype=self._dtype,
                                          allocator=gpu_allocator)))

        # Transform
        row_block = (256, 1, 1)
        row_grid_x = int(out_cols[0] / row_block[0]) + (1 if out_cols[0] %
                                                        row_block[0] else 0)
        row_grid_y = int(
            num_rows / row_block[1]) + (1 if num_rows % row_block[1] else 0)
        row_grid_z = int(num_slices / row_block[2]) + (1 if num_slices %
                                                       row_block[2] else 0)
        row_grid = (row_grid_x, row_grid_y, row_grid_z)
        shared_mem_size = (row_block[2] * row_block[1] *
                           (self._dec_length + 2 *
                            (row_block[0] - 1)) * self._dtype().itemsize)
        self._dwt_row(input_device_array,
                      row_approx_device_array,
                      row_detail_device_array,
                      numpy.int32(num_rows),
                      numpy.int32(num_cols),
                      numpy.int32(num_slices),
                      block=row_block,
                      grid=row_grid,
                      shared=shared_mem_size)

        col_block = (256, 1, 1)
        col_grid_x = int(out_cols[0] / col_block[0]) + (1 if out_cols[0] %
                                                        col_block[0] else 0)
        col_grid_y = int(out_rows[0] / col_block[1]) + (1 if out_rows[0] %
                                                        col_block[1] else 0)
        col_grid_z = int(num_slices / col_block[2]) + (1 if num_slices %
                                                       col_block[2] else 0)
        col_grid = (col_grid_x, col_grid_y, col_grid_z)
        self._dwt_col(row_approx_device_array,
                      row_detail_device_array,
                      output_device_arrays[0]['ll'],
                      output_device_arrays[0]['lh'],
                      output_device_arrays[0]['hl'],
                      output_device_arrays[0]['hh'],
                      numpy.int32(num_rows),
                      numpy.int32(out_cols[0]),
                      numpy.int32(num_slices),
                      block=col_block,
                      grid=col_grid,
                      shared=0)

        for d in range(1, depth):
            row_grid_x = int(out_cols[d] / row_block[0]) + (
                1 if out_cols[d] % row_block[0] else 0)
            row_grid_y = int(out_rows[d - 1] / row_block[1]) + (
                1 if out_rows[d - 1] % row_block[1] else 0)
            row_grid = (row_grid_x, row_grid_y, row_grid_z)
            self._dwt_row(output_device_arrays[d - 1]['ll'],
                          row_approx_device_array,
                          row_detail_device_array,
                          numpy.int32(out_rows[d - 1]),
                          numpy.int32(out_cols[d - 1]),
                          numpy.int32(num_slices),
                          block=row_block,
                          grid=row_grid,
                          shared=shared_mem_size)

            col_grid_x = int(out_cols[d] / col_block[0]) + (
                1 if out_cols[d] % col_block[0] else 0)
            col_grid_y = int(out_rows[d] / col_block[1]) + (
                1 if out_rows[d] % col_block[1] else 0)
            col_grid = (col_grid_x, col_grid_y, col_grid_z)
            self._dwt_col(row_approx_device_array,
                          row_detail_device_array,
                          output_device_arrays[d]['ll'],
                          output_device_arrays[d]['lh'],
                          output_device_arrays[d]['hl'],
                          output_device_arrays[d]['hh'],
                          numpy.int32(out_rows[d - 1]),
                          numpy.int32(out_cols[d]),
                          numpy.int32(num_slices),
                          block=col_block,
                          grid=col_grid,
                          shared=0)

        # Get results from device
        approx_array = output_device_arrays[-1][
            'll'] if gpu_output else output_device_arrays[-1]['ll'].get()
        if len(in_shape) > 2:
            new_shape = list(in_shape[:-2])
            new_shape.append(out_rows[-1])
            new_shape.append(out_cols[-1])
            approx_array = approx_array.reshape(new_shape)
        results = [approx_array]
        for d, r, c in zip(output_device_arrays[::-1], out_rows[::-1],
                           out_cols[::-1]):
            detail_lh_array = d['lh'] if gpu_output else d['lh'].get()
            detail_hl_array = d['hl'] if gpu_output else d['hl'].get()
            detail_hh_array = d['hh'] if gpu_output else d['hh'].get()
            if len(in_shape) > 2:
                new_shape = list(in_shape[:-2])
                new_shape.append(r)
                new_shape.append(c)
                detail_lh_array = detail_lh_array.reshape(new_shape)
                detail_hl_array = detail_hl_array.reshape(new_shape)
                detail_hh_array = detail_hh_array.reshape(new_shape)
            results.append((detail_lh_array, detail_hl_array, detail_hh_array))

        return results
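A hypothetical usage sketch; the docstring only confirms the class name PycudaWaveletTransform and its use_float32 flag, so the constructor call here is an assumption:

import numpy

data = numpy.random.rand(8, 128, 128)          # batch of eight 128x128 images
wt = PycudaWaveletTransform(use_float32=True)  # wavelet choice configured elsewhere
coeffs = wt.dwt2d(data, depth=2)
c_ll = coeffs[0]                 # coarsest approximation band
lh2, hl2, hh2 = coeffs[1]        # depth-2 detail bands
lh1, hl1, hh1 = coeffs[2]        # depth-1 detail bands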
Example #39
    def dwt1d(self,
              input_array,
              depth=1,
              gpu_output=False,
              gpu_allocator=cuda.mem_alloc):
        """
        dwt1d

        Perform a 1D forward discrete wavelet transform.
        If an array has multiple rows, this function will perform a batched transform.

        Parameters
        ----------
        input_array: Numpy array or GPUArray object
            Input array of data to be transformed.

            If the array is 1D, this function will perform a 1D forward DWT on the data.

            If the array is multidimensional, this function will perform a batched 1D forward DWT
            for all rows of the array.

            The rows being the last dimension of the array.
        depth: int, optional
            Depth level of transform; must be greater than or equal to 0 (1 is the default)
        gpu_output: bool, optional
            If True, then return the coefficients as a list of GPUArray objects. (False is the default)
        gpu_allocator: callable, optional
            Allocator used by GPUArray. (pycuda.driver.mem_alloc is the default)

        Returns
        -------
        list of Numpy arrays or GPUArray objects
            A list of numpy arrays of the DWT coefficients either in float32 or float64,
            depending on whether use_float32 was set to True or False for PycudaWaveletTransform.
            The coefficients are organized as [c_lo, c_hi_depth, c_hi_depth-1, ... , c_hi_2, c_hi_1]
        """
        if not isinstance(input_array, numpy.ndarray) and not isinstance(
                input_array, gpuarray.GPUArray):
            raise TypeError('invalid input array type')

        if depth < 0:
            raise RuntimeError('invalid depth value')
        elif depth == 0:
            return [input_array]

        in_shape = input_array.shape
        row_size = int(in_shape[-1])
        num_rows = int(1)
        if len(in_shape) > 1:
            for i in in_shape[:-1]:
                num_rows *= i

        # If input is in Numpy array, then copy it to a GPUArray object.
        if isinstance(input_array, numpy.ndarray):
            cont_input_array = numpy.ascontiguousarray(input_array,
                                                       dtype=self._dtype)
            input_device_array = gpuarray.to_gpu(cont_input_array)
        else:
            if input_array.dtype != self._dtype:
                input_device_array = input_array.astype(self._dtype)
            else:
                input_device_array = input_array
            if not input_device_array.flags.c_contiguous:
                input_device_array = input_device_array.reshape(in_shape,
                                                                order='C')

        # Get size of output arrays
        out_sizes = [
            int((row_size + 1) / 2 if row_size % 2 else (row_size / 2))
        ]
        for i in range(depth - 1):
            ps = out_sizes[-1]
            ns = int((ps + 1) / 2 if ps % 2 else (ps / 2))
            out_sizes.append(int(ns))

        # Allocate device arrays
        detail_device_arrays = []
        approx_device_arrays = []
        for s in out_sizes:
            detail_device_arrays.append(
                gpuarray.GPUArray([num_rows, s],
                                  dtype=self._dtype,
                                  allocator=gpu_allocator))
            approx_device_arrays.append(
                gpuarray.GPUArray([num_rows, s],
                                  dtype=self._dtype,
                                  allocator=gpu_allocator))

        # Transform
        block = (256, 1, 1)
        grid_x = int(
            out_sizes[0] / block[0]) + (1 if out_sizes[0] % block[0] else 0)
        grid_y = int(num_rows / block[1]) + (1 if num_rows % block[1] else 0)
        grid = (grid_x, grid_y, 1)
        shared_mem_size = block[2] * block[1] * (
            self._dec_length + 2 * (block[0] - 1)) * self._dtype().itemsize
        self._dwt_row(input_device_array,
                      approx_device_arrays[0],
                      detail_device_arrays[0],
                      numpy.int32(num_rows),
                      numpy.int32(row_size),
                      numpy.int32(1),
                      block=block,
                      grid=grid,
                      shared=shared_mem_size)

        for d in range(1, depth):
            grid_x = int(out_sizes[d] / block[0]) + (1 if out_sizes[d] %
                                                     block[0] else 0)
            grid = (grid_x, grid_y, 1)
            self._dwt_row(approx_device_arrays[d - 1],
                          approx_device_arrays[d],
                          detail_device_arrays[d],
                          numpy.int32(num_rows),
                          numpy.int32(out_sizes[d - 1]),
                          numpy.int32(1),
                          block=block,
                          grid=grid,
                          shared=shared_mem_size)

        # Get results from device
        approx_array = approx_device_arrays[
            -1] if gpu_output else approx_device_arrays[-1].get()
        if len(in_shape) > 1:
            new_shape = list(in_shape[:-1])
            new_shape.append(out_sizes[-1])
            approx_array = approx_array.reshape(new_shape)
        results = [approx_array]
        for i, (d, s) in enumerate(
                zip(detail_device_arrays[::-1], out_sizes[::-1])):
            detail_array = d if gpu_output else d.get()
            if len(in_shape) > 1:
                new_shape = list(in_shape[:-1])
                new_shape.append(s)
                detail_array = detail_array.reshape(new_shape)
            results.append(detail_array)

        return results
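A matching sketch for the 1D path, under the same assumptions about the constructor:

import numpy

signals = numpy.random.rand(16, 1024)          # sixteen rows -> batched 1D DWT
wt = PycudaWaveletTransform(use_float32=True)
coeffs = wt.dwt1d(signals, depth=3)
c_lo = coeffs[0]                  # coarsest approximation per row
c_hi3, c_hi2, c_hi1 = coeffs[1:]  # detail coefficients, deepest level first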
Example #40
    def idwt2d(self,
               input_list,
               gpu_output=False,
               gpu_allocator=cuda.mem_alloc):
        """
        idwt2d

        Perform a 2D inverse discrete wavelet transform.
        If the arrays have more than 2 dimensions, this function will perform a batched transform.

        Parameters
        ----------
        input_list: list of Numpy arrays
            A list of numpy arrays of the DWT coefficients to be reconstructed.

            The coefficients are organized as
            [c_lo_lo, (c_lo_hi_depth, c_hi_lo_depth, c_hi_hi_depth), ... , (c_lo_hi_1, c_hi_lo_1, c_hi_hi_1)].

            The detail array can have dimensions that are one less than that of the approximation array for each level.

            If the arrays have more than 2 dimensions, this function will perform a batched 2D inverse DWT
            for the last 2 dimensions of the arrays.  All arrays must have the same dimensions
            except for the last 2 dimensions.
        gpu_output: bool, optional
            If True, then return the reconstructed signal as a GPUArray object. (False is the default)
        gpu_allocator: callable, optional
            Allocator used by GPUArray. (pycuda.driver.mem_alloc is the default)
        Returns
        -------
        Numpy array or GPUArray object
            A Numpy array of the reconstructed signal either in float32 or float64,
            depending on whether use_float32 was set to True or False for PycudaWaveletTransform.
        """
        if not isinstance(input_list, list):
            raise TypeError('invalid input_list type')
        if not isinstance(input_list[0], numpy.ndarray) and not isinstance(
                input_list[0], gpuarray.GPUArray):
            raise TypeError('invalid input_list type')
        if len(input_list) > 1:
            approx_shape = list(input_list[0].shape)
            for dl in input_list[1:]:
                if not isinstance(dl, list) and not isinstance(dl, tuple):
                    raise TypeError('invalid input_list type')
                elif len(dl) != 3:
                    raise TypeError('invalid input_list type')
                if any([
                        not isinstance(x, numpy.ndarray)
                        and not isinstance(x, gpuarray.GPUArray) for x in dl
                ]):
                    raise TypeError('invalid input_list type')
                detail_shape = list(dl[0].shape)
                if list(dl[1].shape) != detail_shape or list(
                        dl[2].shape) != detail_shape:
                    raise RuntimeError('arrays have incompatible shapes')
                if detail_shape[-1] < approx_shape[-1] - 1 or approx_shape[
                        -1] < detail_shape[-1]:
                    raise RuntimeError('arrays have incompatible shapes')
                if detail_shape[-2] < approx_shape[-2] - 1 or approx_shape[
                        -2] < detail_shape[-2]:
                    raise RuntimeError('arrays have incompatible shapes')
                if len(detail_shape) > 2:
                    if approx_shape[:-2] != detail_shape[:-2]:
                        raise RuntimeError('arrays have incompatible shapes')
                approx_shape[-1] = 2 * detail_shape[-1]
                approx_shape[-2] = 2 * detail_shape[-2]

        depth = len(input_list) - 1

        if depth == 0:
            return input_list[0]

        in_shape = input_list[0].shape
        num_cols = int(in_shape[-1])
        num_rows = int(in_shape[-2])
        num_slices = int(1)
        if len(in_shape) > 2:
            for i in in_shape[:-2]:
                num_slices *= i

        # Get size of input_list arrays
        in_rows = []
        in_cols = []
        out_rows = []
        out_cols = []
        for i in range(depth):
            c = input_list[i + 1][0].shape[-1]
            r = input_list[i + 1][0].shape[-2]
            in_rows.append(int(r))
            in_cols.append(int(c))
            out_rows.append(int(2 * r))
            out_cols.append(int(2 * c))

        # Allocate device arrays
        approx_device_array = gpuarray.GPUArray(
            [num_slices, out_rows[-1], out_cols[-1]],
            dtype=self._dtype,
            allocator=gpu_allocator)
        row_approx_device_array = gpuarray.GPUArray(
            [num_slices, out_rows[-1], in_cols[-1]],
            dtype=self._dtype,
            allocator=gpu_allocator)
        row_detail_device_array = gpuarray.GPUArray(
            [num_slices, out_rows[-1], in_cols[-1]],
            dtype=self._dtype,
            allocator=gpu_allocator)

        # If input is in Numpy arrays, then copy it to GPUArray objects.
        input_device_list = []
        for input_index, input_array in enumerate(input_list):
            if input_index == 0:
                if isinstance(input_array, numpy.ndarray):
                    cont_input_array = numpy.ascontiguousarray(
                        input_array, dtype=self._dtype)
                    input_device_array = gpuarray.to_gpu(cont_input_array)
                else:
                    if input_array.dtype != self._dtype:
                        input_device_array = input_array.astype(self._dtype)
                    else:
                        input_device_array = input_array
                    if not input_device_array.flags.c_contiguous:
                        input_device_array = input_device_array.reshape(
                            input_array.shape, order='C')
                input_device_list.append(input_device_array)
            else:
                detail_list = []
                for detail_array in input_array:
                    if isinstance(detail_array, numpy.ndarray):
                        cont_input_array = numpy.ascontiguousarray(
                            detail_array, dtype=self._dtype)
                        input_device_array = gpuarray.to_gpu(cont_input_array)
                    else:
                        if detail_array.dtype != self._dtype:
                            input_device_array = detail_array.astype(
                                self._dtype)
                        else:
                            input_device_array = detail_array
                        if not input_device_array.flags.c_contiguous:
                            input_device_array = input_device_array.reshape(
                                detail_array.shape, order='C')
                    detail_list.append(input_device_array)
                input_device_list.append(detail_list)

        # Transform
        col_block = (256, 1, 1)
        col_grid_x = int(in_cols[0] / col_block[0]) + (1 if in_cols[0] %
                                                       col_block[0] else 0)
        col_grid_y = int(in_rows[0] / col_block[1]) + (1 if in_rows[0] %
                                                       col_block[1] else 0)
        col_grid_z = int(num_slices / col_block[2]) + (1 if num_slices %
                                                       col_block[2] else 0)
        col_grid = (col_grid_x, col_grid_y, col_grid_z)
        self._idwt_col(input_device_list[0],
                       input_device_list[1][0],
                       input_device_list[1][1],
                       input_device_list[1][2],
                       row_approx_device_array,
                       row_detail_device_array,
                       numpy.int32(num_rows),
                       numpy.int32(num_cols),
                       numpy.int32(in_rows[0]),
                       numpy.int32(in_cols[0]),
                       numpy.int32(num_slices),
                       block=col_block,
                       grid=col_grid,
                       shared=0)

        row_block = (256, 1, 1)
        row_grid_x = int(in_cols[0] / row_block[0]) + (1 if in_cols[0] %
                                                       row_block[0] else 0)
        row_grid_y = int(out_rows[0] / row_block[1]) + (1 if out_rows[0] %
                                                        row_block[1] else 0)
        row_grid_z = int(num_slices / row_block[2]) + (1 if num_slices %
                                                       row_block[2] else 0)
        row_grid = (row_grid_x, row_grid_y, row_grid_z)
        shared_mem_size = (row_block[2] * row_block[1] *
                           (2 *
                            (int(self._dec_length / 2) + row_block[0] - 1)) *
                           self._dtype().itemsize)
        self._idwt_row(row_approx_device_array,
                       row_detail_device_array,
                       approx_device_array,
                       numpy.int32(out_rows[0]),
                       numpy.int32(in_cols[0]),
                       numpy.int32(in_cols[0]),
                       numpy.int32(num_slices),
                       block=row_block,
                       grid=row_grid,
                       shared=shared_mem_size)

        for d in range(1, depth):
            col_grid_x = int(in_cols[d] / col_block[0]) + (1 if in_cols[d] %
                                                           col_block[0] else 0)
            col_grid_y = int(in_rows[d] / col_block[1]) + (1 if in_rows[d] %
                                                           col_block[1] else 0)
            col_grid = (col_grid_x, col_grid_y, col_grid_z)
            self._idwt_col(approx_device_array,
                           input_device_list[d + 1][0],
                           input_device_list[d + 1][1],
                           input_device_list[d + 1][2],
                           row_approx_device_array,
                           row_detail_device_array,
                           numpy.int32(out_rows[d - 1]),
                           numpy.int32(out_cols[d - 1]),
                           numpy.int32(in_rows[d]),
                           numpy.int32(in_cols[d]),
                           numpy.int32(num_slices),
                           block=col_block,
                           grid=col_grid,
                           shared=0)

            row_grid_x = int(in_cols[d] / row_block[0]) + (1 if in_cols[d] %
                                                           row_block[0] else 0)
            row_grid_y = int(out_rows[d] / row_block[1]) + (
                1 if out_rows[d] % row_block[1] else 0)
            row_grid = (row_grid_x, row_grid_y, row_grid_z)
            self._idwt_row(row_approx_device_array,
                           row_detail_device_array,
                           approx_device_array,
                           numpy.int32(out_rows[d]),
                           numpy.int32(in_cols[d]),
                           numpy.int32(in_cols[d]),
                           numpy.int32(num_slices),
                           block=row_block,
                           grid=row_grid,
                           shared=shared_mem_size)

        # Get results from device
        approx_array = approx_device_array if gpu_output else approx_device_array.get()
        if len(in_shape) > 2:
            new_shape = list(in_shape[:-2])
            new_shape.append(approx_array.shape[-2])
            new_shape.append(approx_array.shape[-1])
            approx_array = approx_array.reshape(new_shape)

        return approx_array
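
A minimal usage sketch for the 2D inverse transform above. The class and method names (PycudaWaveletTransform, dwt2d, idwt2d) are assumed from the docstrings in this collection rather than a verified release, and a matching forward transform is assumed to exist:

import numpy as np

# Hypothetical usage; names taken from the surrounding docstrings.
wt = PycudaWaveletTransform(wavelet='db2', use_float32=True)
signal = np.random.rand(4, 64, 64).astype(np.float32)  # batch of 4 images
coeffs = wt.dwt2d(signal, depth=2)   # [approx, (h, v, d), (h, v, d)]
recon = wt.idwt2d(coeffs)            # back to shape (4, 64, 64)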
Exemple #41
0
    def _process_j(self):
        """ Use PyCuda """

        nElements = self.width * self.height
        nBlocks = nElements / BLOCK_SIZE + 1
        #print 'No. elements:', nElements
        #print 'No. blocks:', nBlocks
        grid_dimensions = (nBlocks, 1)
        block_dimensions = (BLOCK_SIZE, 1, 1)

        tilde_im_mapping = self.pycuda_y_tilde_pbo.map()
        tilde_fx_mapping = self.pycuda_y_fx_tilde_pbo.map()
        tilde_fy_mapping = self.pycuda_y_fy_tilde_pbo.map()
        tilde_m_mapping = self.pycuda_y_m_tilde_pbo.map()

        p_tilde_mapping = self.pycuda_yp_tilde_pbo.map()
        p_tilde_fx_mapping = self.pycuda_yp_fx_tilde_pbo.map()
        p_tilde_fy_mapping = self.pycuda_yp_fy_tilde_pbo.map()
        p_tilde_m_mapping = self.pycuda_yp_m_tilde_pbo.map()

        pp_tilde_mapping = self.pycuda_ypp_tilde_pbo.map()
        pp_tilde_fx_mapping = self.pycuda_ypp_fx_tilde_pbo.map()
        pp_tilde_fy_mapping = self.pycuda_ypp_fy_tilde_pbo.map()
        pp_tilde_m_mapping = self.pycuda_ypp_m_tilde_pbo.map()

        partialsum = np.zeros((nBlocks, 1), dtype=np.float32)
        partialsum_gpu = gpuarray.to_gpu(partialsum)
        partialsum_fx = np.zeros((nBlocks, 1), dtype=np.float32)
        partialsum_fx_gpu = gpuarray.to_gpu(partialsum_fx)
        partialsum_fy = np.zeros((nBlocks, 1), dtype=np.float32)
        partialsum_fy_gpu = gpuarray.to_gpu(partialsum_fy)
        partialsum_m = np.zeros((nBlocks, 1), dtype=np.float32)
        partialsum_m_gpu = gpuarray.to_gpu(partialsum_m)

        #CUDA definition
        #__global__ void j(unsigned char *y_im_t, float *y_fx_t, float *y_fy_t, unsigned char *y_m_t,
        #					unsigned char *yp_im_t, float *yp_fx_t, float *yp_fy_t, unsigned char *yp_m_t,
        #					unsigned char *ypp_im_t, float *ypp_fx_t, float *ypp_fy_t, unsigned char *ypp_m_t,
        #					float *output, float *output_fx, float *output_fy, float *output_m,
        #					int len)

        #Make the call...
        self.cuda_j.prepared_call(grid_dimensions, block_dimensions,\
          tilde_im_mapping.device_ptr(),tilde_fx_mapping.device_ptr(),\
          tilde_fy_mapping.device_ptr(),tilde_m_mapping.device_ptr(),\
          p_tilde_mapping.device_ptr(),p_tilde_fx_mapping.device_ptr(),\
          p_tilde_fy_mapping.device_ptr(),p_tilde_m_mapping.device_ptr(),\
          pp_tilde_mapping.device_ptr(),pp_tilde_fx_mapping.device_ptr(),\
          pp_tilde_fy_mapping.device_ptr(),pp_tilde_m_mapping.device_ptr(),\
          partialsum_gpu.gpudata, partialsum_fx_gpu.gpudata,\
          partialsum_fy_gpu.gpudata, partialsum_m_gpu.gpudata,\
          np.uint32(nElements))
        cuda_driver.Context.synchronize()

        tilde_im_mapping.unmap()
        tilde_fx_mapping.unmap()
        tilde_fy_mapping.unmap()
        tilde_m_mapping.unmap()

        p_tilde_mapping.unmap()
        p_tilde_fx_mapping.unmap()
        p_tilde_fy_mapping.unmap()
        p_tilde_m_mapping.unmap()

        pp_tilde_mapping.unmap()
        pp_tilde_fx_mapping.unmap()
        pp_tilde_fy_mapping.unmap()
        pp_tilde_m_mapping.unmap()

        #Read out the answer...
        partialsum = partialsum_gpu.get()
        partialsum_fx = partialsum_fx_gpu.get()
        partialsum_fy = partialsum_fy_gpu.get()
        partialsum_m = partialsum_m_gpu.get()
        sum_gpu = np.sum(partialsum[0:int(np.ceil(nBlocks / 2.))])
        sum_fx_gpu = np.sum(partialsum_fx[0:int(np.ceil(nBlocks / 2.))])
        sum_fy_gpu = np.sum(partialsum_fy[0:int(np.ceil(nBlocks / 2.))])
        sum_m_gpu = np.sum(partialsum_m[0:int(np.ceil(nBlocks / 2.))])
        scale = 1
        #print 'j (GPU) components'
        #print sum_gpu, sum_fx_gpu/scale/scale, sum_fy_gpu/scale/scale
        return sum_gpu + sum_fx_gpu / scale / scale + sum_fy_gpu / scale / scale + sum_m_gpu
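
A side note on the launch geometry above: `nElements / BLOCK_SIZE + 1` launches one extra block whenever nElements is an exact multiple of BLOCK_SIZE. The usual ceiling-division idiom launches exactly as many blocks as needed; a minimal sketch:

def grid_1d(n_elements, block_size=256):
    # exactly ceil(n_elements / block_size) blocks
    return ((n_elements + block_size - 1) // block_size, 1)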
Exemple #42
0
    def idwt1d(self,
               input_list,
               gpu_output=False,
               gpu_allocator=cuda.mem_alloc):
        """
        idwt1d

        Perform a 1D inverse discrete wavelet transform.
        If an array has multiple rows, this function will perform a batched transform.

        Parameters
        ----------
        input_list: list of Numpy arrays or GPUArray objects
            A list of numpy arrays of the DWT coefficients to be reconstructed.

            The coefficients are organized as [c_lo, c_hi_depth, c_hi_depth-1, ... , c_hi_2, c_hi_1].

            The detail row size can be one less than the approximation row size for each level.

            If the arrays are multidimensional, this function will perform a batched 1D inverse DWT
            for all rows of the arrays, where the rows are the last dimension of the array.
            All arrays must have the same dimensions except for the last dimension.
        gpu_output: bool, optional
            If True, then return the reconstructed signal as a GPUArray object. (False is the default)
        gpu_allocator: callable, optional
            Allocator used by GPUArray. (pycuda.driver.mem_alloc is the default)

        Returns
        -------
        Numpy array or GPUArray object
            An array of the reconstructed signal either in float32 or float64,
            depending on whether use_float32 was set to True or False for PycudaWaveletTransform.
        """

        if not isinstance(input_list, list):
            raise TypeError('invalid input_list type')
        if any([
                not isinstance(x, numpy.ndarray)
                and not isinstance(x, gpuarray.GPUArray) for x in input_list
        ]):
            raise TypeError('invalid input_list type')
        if len(input_list) > 1:
            approx_shape = list(input_list[0].shape)
            for d in input_list[1:]:
                detail_shape = list(d.shape)
                if detail_shape[-1] < approx_shape[-1] - 1 or approx_shape[
                        -1] < detail_shape[-1]:
                    raise RuntimeError('arrays have incompatible shapes')
                if len(detail_shape) > 1:
                    if approx_shape[:-1] != detail_shape[:-1]:
                        raise RuntimeError('arrays have incompatible shapes')
                approx_shape[-1] = 2 * detail_shape[-1]

        depth = len(input_list) - 1

        if depth == 0:
            return input_list[0]

        in_shape = input_list[0].shape
        num_rows = int(1)
        if len(in_shape) > 1:
            for i in in_shape[:-1]:
                num_rows *= i

        # If input is in Numpy arrays, then copy it to GPUArray objects.
        input_device_list = []
        for input_array in input_list:
            if isinstance(input_array, numpy.ndarray):
                cont_input_array = numpy.ascontiguousarray(input_array,
                                                           dtype=self._dtype)
                input_device_array = gpuarray.to_gpu(cont_input_array)
            else:
                if input_array.dtype != self._dtype:
                    input_device_array = input_array.astype(self._dtype)
                else:
                    input_device_array = input_array
                if not input_device_array.flags.c_contiguous:
                    input_device_array = input_device_array.reshape(
                        input_array.shape, order='C')
            input_device_list.append(input_device_array)

        # Get size of input_list arrays
        in_sizes = []
        out_sizes = []
        for i in range(depth):
            s = input_list[i + 1].shape[-1]
            in_sizes.append(int(s))
            out_sizes.append(int(2 * s))

        # Allocate device arrays
        approx_device_arrays = []
        for s in out_sizes:
            approx_device_arrays.append(
                gpuarray.GPUArray([num_rows, s],
                                  dtype=self._dtype,
                                  allocator=gpu_allocator))

        # Transform
        block = (256, 1, 1)
        grid_x = int(
            in_sizes[0] / block[0]) + (1 if in_sizes[0] % block[0] else 0)
        grid_y = int(num_rows / block[1]) + (1 if num_rows % block[1] else 0)
        grid = (grid_x, grid_y, 1)
        shared_mem_size = block[2] * block[1] * (2 * (
            int(self._dec_length / 2) + block[0] - 1)) * self._dtype().itemsize
        self._idwt_row(input_device_list[0],
                       input_device_list[1],
                       approx_device_arrays[0],
                       numpy.int32(num_rows),
                       numpy.int32(in_shape[-1]),
                       numpy.int32(in_sizes[0]),
                       numpy.int32(1),
                       block=block,
                       grid=grid,
                       shared=shared_mem_size)

        for d in range(1, depth):
            grid_x = int(
                in_sizes[d] / block[0]) + (1 if in_sizes[d] % block[0] else 0)
            grid = (grid_x, grid_y, 1)
            self._idwt_row(approx_device_arrays[d - 1],
                           input_device_list[d + 1],
                           approx_device_arrays[d],
                           numpy.int32(num_rows),
                           numpy.int32(out_sizes[d - 1]),
                           numpy.int32(in_sizes[d]),
                           numpy.int32(1),
                           block=block,
                           grid=grid,
                           shared=shared_mem_size)

        # Get results from device
        result = approx_device_arrays[
            -1] if gpu_output else approx_device_arrays[-1].get()
        if len(in_shape) > 1:
            new_shape = list(in_shape[:-1])
            new_shape.append(result.shape[-1])
            result = result.reshape(new_shape)

        return result
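
A usage sketch for idwt1d. The constructor arguments are hypothetical; the coefficient layout follows the docstring above ([c_lo, c_hi_depth, ..., c_hi_1]):

import numpy as np

wt = PycudaWaveletTransform(wavelet='haar', use_float32=True)  # hypothetical
c_lo = np.random.rand(8, 16).astype(np.float32)    # 8 batched rows
c_hi2 = np.random.rand(8, 16).astype(np.float32)   # depth-2 details
c_hi1 = np.random.rand(8, 32).astype(np.float32)   # depth-1 details
recon = wt.idwt1d([c_lo, c_hi2, c_hi1])            # reconstructed, shape (8, 64)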
Exemple #43
0
 def initialize_delays(self, spikedelays):
     self.spikedelay_arr = gpuarray.to_gpu(
         array(rint(spikedelays / self.dt), dtype=int32))
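
The helper above quantizes continuous spike delays to integer simulation steps with rint; numerically:

import numpy as np

dt = 0.0001                                   # hypothetical 0.1 ms timestep
spikedelays = np.array([0.0003, 0.0007])      # delays in seconds
steps = np.array(np.rint(spikedelays / dt), dtype=np.int32)  # -> [3, 7]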
Exemple #44
0
"""
Demonstrates how to extract the lower triangle of a matrix.
"""

import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
import pycuda.gpuarray as gpuarray

import scikits.cuda.linalg as culinalg
import scikits.cuda.misc as cumisc
culinalg.init()

# Double precision is only supported by devices with compute
# capability >= 1.3:
import string
demo_types = [np.float32, np.complex64]
if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
    demo_types.extend([np.float64, np.complex128])

for t in demo_types:
    print 'Testing lower triangle extraction for type ' + str(np.dtype(t))
    N = 10
    if np.iscomplexobj(t()):
        a = np.asarray(np.random.rand(N, N)+1j*np.random.rand(N, N), t)
    else:
        a = np.asarray(np.random.rand(N, N), t)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = culinalg.tril(a_gpu, False)
    print 'Success status: ', np.allclose(b_gpu.get(), np.tril(a))
Exemple #45
0
 def initialize_spikes(self, spiketimes, spiketimes_indices):
     self.spiketimes = gpuarray.to_gpu(
         array(rint(spiketimes / self.dt), dtype=int32))
     self.spiketime_indices = gpuarray.to_gpu(
         array(spiketimes_indices, dtype=int32))
Exemple #46
0
    def trace_gpu(self, ray_array):
        """
        Core method to do ray tracing

        change CPU recursive version -> GPU iterative version

        for every depth, call gpu functions

        Then do stack process to get final result
        """

        stack = []

        ray_array_cpu = np.array(ray_array, np.float32)
        scene_cpu = np.array(self.__scene_gpu, np.float32)
        print(ray_array_cpu.shape)
        print(scene_cpu.shape)

        scene_size = np.int32(scene_cpu.shape[0])

        # loop for every depth
        for i in range(self.__max_recursion_depth):
            num_ray = np.int32(ray_array_cpu.shape[0])

            num_thread = 256
            num_block = int(np.ceil(float(num_ray) / float(num_thread)))
            ray_array_gpu = gpuarray.to_gpu(ray_array_cpu)
            scene_gpu = gpuarray.to_gpu(scene_cpu)

            output_intersect_gpu = gpuarray.empty((num_ray, 6), np.float32)
            output_obj_index_gpu = gpuarray.empty((num_ray, 1), np.int32)
            output_color = gpuarray.empty((num_ray, 3), np.float32)

            # call parallel intersect function
            self.kernel_fun_intersect(ray_array_gpu,
                                      scene_gpu,
                                      num_ray,
                                      scene_size,
                                      output_intersect_gpu,
                                      output_obj_index_gpu,
                                      block=(num_thread, 1, 1),
                                      grid=(num_block, 1, 1))

            intersect_cpu = output_intersect_gpu.get()
            obj_index_cpu = output_obj_index_gpu.get()

            intersect_gpu = gpuarray.to_gpu(intersect_cpu)
            obj_index_gpu = gpuarray.to_gpu(obj_index_cpu)

            # the final depth will treat every objects as diffuse one
            # flag here is used to control this
            if i == self.__max_recursion_depth - 1:
                flag = np.int32(1)
            else:
                flag = np.int32(0)

            # trace diffuse
            self.kernel_fun_trace_diffuse(intersect_gpu,
                                          obj_index_gpu,
                                          num_ray,
                                          scene_gpu,
                                          scene_size,
                                          flag,
                                          output_color,
                                          block=(num_thread, 1, 1),
                                          grid=(num_block, 1, 1))

            color_cpu = output_color.get()

            if flag == 1:
                stack.append([color_cpu])
                break

            intersect_gpu = gpuarray.to_gpu(intersect_cpu)
            obj_index_gpu = gpuarray.to_gpu(obj_index_cpu)
            output_reflect_ray_gpu = gpuarray.empty((num_ray, 7), np.float32)
            output_refract_ray_gpu = gpuarray.empty((num_ray, 7), np.float32)
            output_fresnel_gpu = gpuarray.empty((num_ray, 1), np.float32)

            # trace non diffuse
            self.kernel_fun_trace_non_diffuse(ray_array_gpu,
                                              intersect_gpu,
                                              obj_index_gpu,
                                              num_ray,
                                              scene_gpu,
                                              scene_size,
                                              flag,
                                              output_reflect_ray_gpu,
                                              output_refract_ray_gpu,
                                              output_fresnel_gpu,
                                              block=(num_thread, 1, 1),
                                              grid=(num_block, 1, 1))

            reflect_ray_cpu = output_reflect_ray_gpu.get()
            refract_ray_cpu = output_refract_ray_gpu.get()
            fresnel_cpu = output_fresnel_gpu.get()

            # prepare for next depth iteration
            ray_array_cpu_next, to_stack = self.ray_batch_filter(
                reflect_ray_cpu, refract_ray_cpu, fresnel_cpu, obj_index_cpu)

            to_stack.append(color_cpu)

            stack.append(to_stack)

            ray_array_cpu = np.array(ray_array_cpu_next, np.float32)

        color = self.process_stack(stack, scene_gpu)

        return color
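
One thing worth noting in the loop above: the intersection results are copied to the host with get() and then immediately re-uploaded with to_gpu(), twice per depth. The host copies are still needed by ray_batch_filter, but the re-uploads could simply reuse the existing device buffers, roughly:

# instead of round-tripping through the host:
#   intersect_cpu = output_intersect_gpu.get()
#   intersect_gpu = gpuarray.to_gpu(intersect_cpu)
# the device buffers can be reused directly:
intersect_gpu = output_intersect_gpu
obj_index_gpu = output_obj_index_gpu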
Exemple #47
0
 def to_gpu(array):
     return gpuarray.to_gpu(array)
Exemple #48
0
 def initialize_traces(self, traces, traces_offset):
     self.traces = gpuarray.to_gpu(array(traces, dtype=double))
     self.traces_offset = gpuarray.to_gpu(array(traces_offset, dtype=int32))
Exemple #49
0
import skcuda.fft as cu_fft

print 'Testing fft/ifft..'
N = 256
batch_size = 16

x = np.empty((batch_size, N, N), np.float32)
xf = np.empty((batch_size, N, N), np.complex64)
y = np.empty((batch_size, N, N), np.float32)
for i in xrange(batch_size):
    x[i, :, :] = np.asarray(np.random.rand(N, N), np.float32)
    xf[i, :, :] = np.fft.fft2(x[i, :, :])
    y[i, :, :] = np.real(np.fft.ifft2(xf[i, :, :]))

x_gpu = gpuarray.to_gpu(x)
xf_gpu = gpuarray.empty((batch_size, N, N / 2 + 1), np.complex64)
plan_forward = cu_fft.Plan((N, N), np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan((N, N), np.complex64, np.float32, batch_size)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

print 'Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6)

print 'Testing in-place fft..'
x = np.empty((batch_size, N, N), np.complex64)
x_gpu = gpuarray.to_gpu(x)

plan = cu_fft.Plan((N, N), np.complex64, np.complex64, batch_size)
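
The (batch_size, N, N/2+1) shape of xf_gpu above reflects the Hermitian symmetry of a real-to-complex transform: only half of the last axis needs to be stored. The same convention in NumPy:

import numpy as np

a = np.random.rand(8, 8).astype(np.float32)
af = np.fft.rfft2(a)                 # real-to-complex 2D FFT
assert af.shape == (8, 8 // 2 + 1)   # only N/2 + 1 columns are stored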
Exemple #50
0
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
device_data = gpuarray.to_gpu(host_data)
device_data_x2 = 2 * device_data

host_data_x2 = device_data_x2.get()

print(host_data_x2)
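
The `2 * device_data` expression compiles and launches an elementwise kernel under the hood; the explicit equivalent with pycuda.elementwise.ElementwiseKernel looks roughly like this:

from pycuda.elementwise import ElementwiseKernel

double_kernel = ElementwiseKernel(
    "float *out, float *in",
    "out[i] = 2.0f * in[i]",
    "double_kernel")

device_data_x2 = gpuarray.empty_like(device_data)
double_kernel(device_data_x2, device_data)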
Exemple #51
0
def cuda_gridvis(sub_array, f, settings, plan, chan):
    """
    Grid the visibilities parallelized by pixel.
    References:
      - Chapter 10 in "Interferometry and Synthesis in Radio Astronomy"
          by Thompson, Moran, & Swenson
      - Daniel Briggs' PhD Thesis: http://www.aoc.nrao.edu/dissertations/dbriggs/
    """
    print "Gridding the visibilities"
    t_start = time.time()
    if sub_array==1:
        Antennas = 40
    else:
        Antennas = 60

    # unpack parameters
    vfile = settings['vfile']
    briggs = settings['briggs']
    imsize = settings['imsize']
    cell = settings['cell']
    nx = np.int32(2 * imsize)
    noff = np.int32((nx - imsize) / 2)

    ## constants
    arc2rad = np.float32(np.pi / 180. / 3600.)
    du = np.float32(1. / (arc2rad * cell * nx))

    # determine the file type (uvfits or fitsidi)
    h_u = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64')
    h_v = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float64')
    h_re = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32')
    h_im = np.ndarray(shape=(Antennas*(Antennas-1)//2, 1), dtype='float32')

    #Get Visibility Data and values of UVW
    if settings['vfile'].find('.uvfits') != -1:
        freq = 3.45E11 #np.float32(f[0].header['CRVAL4'])
        light_speed = 299792458.
        good = np.where(f[0].data.data[:, 0, 0, chan, 0, 0] != 0)

        h_u = np.float32(light_speed * f[0].data.par('uu')[good])
        print "h_u", h_u.shape
        h_v = np.float32(light_speed * f[0].data.par('vv')[good])
        gcount = np.int32(np.size(h_u))
        ## assume data is unpolarized
        h_re = np.float32(f[0].data.data[good, 0, 0, chan, 0, 0])
        h_im = np.float32(f[0].data.data[good, 0, 0, chan, 0, 1])

        freq = 1702500000.
        light_speed = 299792458.  # Speed of light


        ## assume data is unpolarized
        #print chan
        print 'GCOUNT', gcount

        # h_ : host,  d_ : device
        h_grd = np.zeros((nx, nx), dtype=np.complex64)
        h_cnt = np.zeros((nx, nx), dtype=np.int32)
        d_u = gpu.to_gpu(np.array(h_u,dtype='float32'))
        d_v = gpu.to_gpu(np.array(h_v,dtype='float32'))
        d_re = gpu.to_gpu(np.array(h_re,dtype='float32'))
        d_im = gpu.to_gpu(np.array(h_im,dtype='float32'))
        d_cnt = gpu.zeros((np.int(nx), np.int(nx)), np.int32)
        d_grd = gpu.zeros((np.int(nx), np.int(nx)), np.complex64)
        d_ngrd = gpu.zeros_like(d_grd)
        d_bm = gpu.zeros_like(d_grd)
        d_nbm = gpu.zeros_like(d_grd)
        d_fim = gpu.zeros((np.int(imsize), np.int(imsize)), np.float32)

        ## define kernel parameters
        if imsize == 1024:
            blocksize2D = (8, 16, 1)
            gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
            blocksizeF2D = (16, 16, 1)
            gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
            blocksize1D = (256, 1, 1)
        else:
            blocksize2D = (16, 32, 1)
            gridsize2D = (np.int(np.ceil(1. * nx / blocksize2D[0])), np.int(np.ceil(1. * nx / blocksize2D[1])))
            blocksizeF2D = (32, 32, 1)
            gridsizeF2D = (np.int(np.ceil(1. * imsize / blocksizeF2D[0])), np.int(np.ceil(1. * imsize / blocksizeF2D[1])))
            blocksize1D = (512, 1, 1)

        gridsize1D = (np.int(np.ceil(1. * gcount / blocksize1D[0])), 1)

        # ------------------------
        # make gridding kernels
        # ------------------------
        ## make spheroidal convolution kernel (don't mess with these!)
        width = 6.
        ngcf = 24.
        h_cgf = gcf(ngcf, width)
        ## make grid correction
        h_corr = corrfun(nx, width)
        d_cgf = module.get_global('cgf')[0]
        d_corr = gpu.to_gpu(h_corr)
        cu.memcpy_htod(d_cgf, h_cgf)

        # ------------------------
        # grid it up
        # ------------------------
        d_umax = gpu.max(cumath.fabs(d_u))
        d_vmax = gpu.max(cumath.fabs(d_v))
        umax = np.int32(np.ceil(d_umax.get() / du))
        vmax = np.int32(np.ceil(d_vmax.get() / du))

        ## grid ($$)
        #  This should be improvable via:
        #    - shared memory solution? I tried...
        #    - better coalesced memory access? I tried...
        #    - reorganizing and indexing UV data beforehand?
        #       (i.e. http://www.nvidia.com/docs/IO/47905/ECE757_Project_Report_Gregerson.pdf)
        #    - storing V(u,v) in texture memory?

        # Each pixel in the uv plane goes through the data and checks whether the pixel is included in the convolution.
        # This kernel also calculates the point spread function and the local sampling
        # from the data (for applying the weights later).
        gridVis_wBM_kernel(d_grd, d_bm, d_cnt, d_u, d_v, d_re, d_im, nx, du, gcount, umax, vmax, \
                           block=blocksize2D, grid=gridsize2D)

        ## apply weights
        wgtGrid_kernel(d_bm, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
        hfac = np.int32(1)
        dblGrid_kernel(d_bm, nx, hfac, block=blocksize2D, grid=gridsize2D)
        shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
        ## normalize

        wgtGrid_kernel(d_grd, d_cnt, briggs, nx, block=blocksize2D, grid=gridsize2D)
        ## Reflect grid about v axis
        hfac = np.int32(-1)
        dblGrid_kernel(d_grd, nx, hfac, block=blocksize2D, grid=gridsize2D)
        ## Shift both
        shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)

        # ------------------------
        # Make the beam
        # ------------------------
        ## Transform to image plane
        fft.fft(d_nbm, d_bm, plan)
        ## Shift
        shiftGrid_kernel(d_bm, d_nbm, nx, block=blocksize2D, grid=gridsize2D)
        ## Correct for C
        corrGrid_kernel(d_nbm, d_corr, nx, block=blocksize2D, grid=gridsize2D)
        # Trim
        trimIm_kernel(d_nbm, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Normalize
        d_bmax = gpu.max(d_fim)
        bmax = d_bmax.get()
        bmax = np.float32(1. / bmax)
        nrmBeam_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Pull onto CPU
        dpsf = d_fim.get()

        # ------------------------
        # Make the map
        # ------------------------
        ## Transform to image plane
        fft.fft(d_ngrd, d_grd, plan)
        ## Shift
        shiftGrid_kernel(d_grd, d_ngrd, nx, block=blocksize2D, grid=gridsize2D)
        ## Correct for C
        corrGrid_kernel(d_ngrd, d_corr, nx, block=blocksize2D, grid=gridsize2D)
        ## Trim
        trimIm_kernel(d_ngrd, d_fim, noff, nx, imsize, block=blocksizeF2D, grid=gridsizeF2D)
        ## Normalize (Jy/beam)
        nrmGrid_kernel(d_fim, bmax, imsize, block=blocksizeF2D, grid=gridsizeF2D)

        ## Finish timers
        t_end = time.time()
        t_full = t_end - t_start
        print "Gridding execution time %0.5f" % t_full + ' s'
        print "\t%0.5f" % (t_full / gcount) + ' s per visibility'

        ## Return dirty psf (CPU) and dirty image (GPU)
        return dpsf, d_fim
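
For reference, the uv-cell size du used in the gridding above follows directly from the image cell size and the padded grid; a standalone numeric sketch (the cell size and nx are hypothetical values, the formula is the one from the code):

import numpy as np

arc2rad = np.float32(np.pi / 180. / 3600.)   # arcsec -> radians
cell, nx = 10.0, 2048                        # hypothetical cell size / grid
du = np.float32(1. / (arc2rad * cell * nx))  # uv-cell size in wavelengths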
Exemple #52
0
    def bench(self, data, plot=True, get=False):
        """ Function to test drift rate search
			note this bench mark contains compilation times and multiple kernel calls. 
			larger input would be faster. 
		"""
        start = cu.Event()
        copy_htod = cu.Event()
        compute = cu.Event()
        stop = cu.Event()

        start.record()
        self.spectr_d = gpuarray.to_gpu(data)
        copy_htod.record()
        copy_htod.synchronize()
        self.sweep_kernel(self.spectr_d,
                          self.output_d,
                          self.delay_table_d,
                          self.nfreqs,
                          self.ntimes,
                          self.ndelays,
                          block=self.block_size,
                          grid=self.grid_size)
        compute.record()
        compute.synchronize()

        if get:
            out = self.output_d.get()
        else:
            operand = self.output_d[self.ndelays // 2]
            mean = gpuarray.sum(operand / np.float32(self.nfreqs)).get()
            var = gpuarray.sum(
                (operand - mean) * (operand - mean) / np.float32(self.nfreqs))
            std = np.sqrt(var.get())
            self.threshold_kernel(self.output_d,
                                  self.mask_d,
                                  np.float32(3 * std + mean),
                                  self.nfreqs,
                                  self.ndelays,
                                  block=self.block_size,
                                  grid=self.grid_size)
            out = (self.output_d * self.mask_d).get()

        stop.record()
        stop.synchronize()

        print "{} seconds to load data".format(
            start.time_till(copy_htod) * 1.e-3)
        print "{} seconds to compute {} channels, with {} delays".format(
            copy_htod.time_till(compute) * 1.e-3, self.nfreqs, self.ndelays)
        print copy_htod.time_till(compute) / (
            self.nfreqs *
            self.ndelays) * 1.e6, "nanoseconds per channel, per delay"
        if plot:
            import pylab as plt
            f, axes = plt.subplots(2, 1)
            axes[0].imshow(
                data,
                aspect='auto',
                extent=[0, self.nfreqs, 0, self.ntimes * self.delta_t],
                origin='lower')
            axes[0].set_xlabel('Freq [Hz]')
            axes[0].set_ylabel('Time [s]')
            axes[1].imshow(out,
                           aspect='auto',
                           extent=[0, self.nfreqs, -0.1, 0.1],
                           origin='lower')
            axes[1].set_xlabel('Freq [Hz]')
            axes[1].set_ylabel('Drift [Hz/s]')
            plt.tight_layout()
            plt.show()
        return out
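
A CPU cross-check of the statistics behind the 3-sigma cut above (population mean/std, matching the gpuarray sums), sketched on random data:

import numpy as np

operand = np.random.randn(4096).astype(np.float32)
mean = operand.mean()                    # matches gpuarray.sum(operand / n)
std = operand.std()                      # population std, as in the GPU path
mask = operand > (3 * std + mean)        # same cut as threshold_kernel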
Exemple #53
0
def optimize_layer(orig_nodes):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n / 2, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n / 2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block, 1, 1),
                                    grid=(nblocks_this_iter, 1))

            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (
                i * 2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)

        test_index = i * 2

        blocks = 0
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      Mapped(flag),
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index + 1),
                       np.uint32(better_i),
                       nodes,
                       block=(1, 1, 1),
                       grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block, 1, 1),
                            grid=(nblocks_this_iter, 1))

    areas_host = areas.get()

    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)

    return nodes.get()
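
chunk_iterator is not shown in this excerpt; from its call sites it appears to split a large element range into launch-sized chunks. A hypothetical implementation consistent with how it is used above:

def chunk_iterator(nelements, nthreads_per_block, max_blocks=10000):
    """Yield (first_index, elements_this_iter, nblocks_this_iter) chunks."""
    first = 0
    while first < nelements:
        elements = min(nelements - first, nthreads_per_block * max_blocks)
        nblocks = (elements + nthreads_per_block - 1) // nthreads_per_block
        yield first, elements, nblocks
        first += elements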
Exemple #54
0
def run(xHfile=None, IO_DIR=None):

    if IO_DIR is None:
        IO_DIR = parent_folder
    if not os.path.exists(IO_DIR + "/Outfiles"):
        os.makedirs(IO_DIR + "/Outfiles")
    if xHfile is None:
        xHfile = find_files(IO_DIR + "/Boxes/",
                            pattern="xH*{0:06.2f}_{1:i}_{2:.0f}*".format(
                                Z, HII_DIM, BOX_LEN))[0]
    if xHfile.endswith('.npy'):
        xH = np.load(xHfile)
        p_dict = boxio.parse_filename(os.path.splitext(xHfile)[0])
    else:
        b = boxio.readbox(xHfile)
        xH = b.box_data
        p_dict = b.param_dict
    Z = p_dict['z']
    #growth_factor = pb.fgrowth(Z, COSMO['omega_M_0'], unnormed=True)
    #overwrite global variables
    HII_DIM = p_dict['dim']
    BOX_LEN = np.float32(p_dict['BoxSize'])
    DELTA_K = np.float32(2 * np.pi / BOX_LEN)
    VOLUME = (BOX_LEN * BOX_LEN * BOX_LEN)
    HII_TOT_NUM_PIXELS = HII_DIM**3
    try:
        deltax = np.load(
            IO_DIR +
            "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc.npy".
            format(Z, HII_DIM, BOX_LEN))
    except IOError:
        #deltax = boxio.readbox(IO_DIR+"/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(Z, HII_DIM, BOX_LEN)).box_data
        deltax = boxio.readbox(
            IO_DIR +
            "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc".format(
                Z, HII_DIM *
                2, BOX_LEN)).box_data[:HII_DIM, :HII_DIM, :HII_DIM]

    kernel_source = open(cmd_folder + "/delta_T.cu").read()
    kernel_code = kernel_source % {
        'DELTAK': DELTA_K,
        'VOLUME': VOLUME,
        'NUM_BINS': NUM_BINS
    }
    main_module = nvcc.SourceModule(kernel_code)
    pbox_kernel = main_module.get_function("pbox_kernel")
    #pixel_deltax_d = gpuarray.to_gpu(deltax)
    #pixel_xH_d = gpuarray.to_gpu(xH)

    _const_factor = np.float32(
        27 * (COSMO['omega_b_0'] * COSMO['h'] * COSMO['h'] / 0.023) * np.sqrt(
            (0.15 / COSMO['omega_M_0'] / COSMO['h'] / COSMO['h']) *
            (1 + Z) / 10.0))
    delta_T = np.float32(_const_factor * xH * (1.0 + deltax))  #in mK
    ave = np.mean(delta_T)
    np.save(
        IO_DIR +
        "/Boxes/delta_T_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc.npy"
        .format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN,
                ave, HII_DIM, int(BOX_LEN)), delta_T)

    deldel_T = (delta_T / ave - 1) * VOLUME / HII_TOT_NUM_PIXELS
    if DIMENSIONAL_T_POWER_SPEC:
        deldel_T *= ave

    plan = Plan(HII_shape, dtype=np.complex64)
    deldel_T_d = gpuarray.to_gpu(deldel_T.astype(np.complex64))
    plan.execute(deldel_T_d)
    K = np.float32(
        np.logspace(np.log10(DELTA_K), np.log10(DELTA_K * np.sqrt(3.) * DIM),
                    NUM_BINS))
    K_d = gpuarray.to_gpu(K)
    k_ave_d = gpuarray.zeros_like(K_d)
    in_bin_ct_d = gpuarray.zeros_like(K_d)
    ps_d = gpuarray.zeros_like(K_d)

    pbox_kernel(deldel_T_d,
                DIM,
                ps_d,
                k_ave_d,
                in_bin_ct_d,
                K_d,
                block=block_size,
                grid=HII_grid_size)
    ps = ps_d.get()
    in_bin_ct = in_bin_ct_d.get()
    k_ave = k_ave_d.get()
    k_ave = np.where(in_bin_ct > 0, k_ave / in_bin_ct, 0.)
    ps_ave = np.where(in_bin_ct > 0, ps / in_bin_ct, 0.)
    #ps_fname = "/ps_nov_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc".format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN, ave, HII_DIM, np.int32(BOX_LEN))
    #np.savez(IO_DIR+ps_fname, k_ave=k_ave, ps_ave=ps_ave)

    return K, k_ave, ps_ave
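
The np.where guards above compute a binned average (sum per k-bin divided by bin occupancy, with empty bins set to zero); the same pattern on the CPU, as a sketch with synthetic data:

import numpy as np

k_flat = np.random.rand(10000).astype(np.float32)   # |k| per mode
ps_flat = np.random.rand(10000).astype(np.float32)  # power per mode
bins = np.logspace(-2, 0, 21)
idx = np.digitize(k_flat, bins)
cnt = np.bincount(idx, minlength=bins.size + 1)
ps_sum = np.bincount(idx, weights=ps_flat, minlength=bins.size + 1)
ps_ave = np.where(cnt > 0, ps_sum / np.maximum(cnt, 1), 0.)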
Exemple #55
0
# (The CUDA source strings kernel_smooth_source, kernel_norm_source and
#  kernel_out_source are truncated in this excerpt; only the tail of the
#  last kernel survived extraction.)

# Initialize kernel
smoothing_kernel = nvcc.SourceModule(kernel_smooth_source).get_function("smoothingFilter")
normalize_kernel = nvcc.SourceModule(kernel_norm_source).get_function("normalizeFilter")
out_kernel = nvcc.SourceModule(kernel_out_source).get_function("outFilter")



# Allocate memory and constants
smem_size   = int(TPBx*TPBy*4)

# Copy arrays to device once
IMG_device          = gpuarray.to_gpu(IMG)
BOX_device          = gpuarray.to_gpu(BOX)
NORM_device         = gpuarray.to_gpu(NORM)
OUT_device          = gpuarray.to_gpu(OUT)

setup_stop_time = time.time()
smth_kernel_start_time   = cu.Event()
smth_kernel_stop_time    = cu.Event()
norm_kernel_start_time   = cu.Event()
norm_kernel_stop_time    = cu.Event()
out_kernel_start_time    = cu.Event()
out_kernel_stop_time     = cu.Event()

##########
# The kernel will convolve the image with a gaussian weighted sum
# determine the BOX size that allows the sum to reach either the maxRad or 
Exemple #56
0
    imsize = settings['imsize']

    # nx = 2 * imsize, i.e. 2048 when imsize=1024
    nx = np.int32(2 * imsize)
    # create fft plan nx*nx
    plan = fft.Plan((np.int(nx), np.int(nx)), np.complex64, np.complex64)

    f = pyfits.open(settings['vfile'])
    channel = f[0].data.data.shape[3]  # Read channel from uvfits file

    for chan in range(0, channel):
        #dpsf, gpu_im = cuda_gridvis(f, settings, plan, chan)
        dpsf, gpu_im = cuda_gridvis(sub_array, f, settings, plan, chan)

        gpu_dpsf = gpu.to_gpu(dpsf)
        print "**********************************************"

        if PLOTME:
            dirty = np.roll(np.fliplr(gpu_im.get()), 1, axis=1)

        ## Clean the PSF
        if imsize >= 1024:
            cpsf = serial_clean_beam(dpsf, imsize / 50.)
        elif imsize >= 512:
            cpsf = serial_clean_beam(dpsf, imsize / 25.)
        elif imsize >= 256:
            cpsf = serial_clean_beam(dpsf, imsize / 12.)

        gpu_cpsf = gpu.to_gpu(cpsf)
Exemple #57
0
 def ifft(self, ff_fft):
     xf_gpu = gpuarray.to_gpu(ff_fft)
     x_gpu = gpuarray.empty(self.shapeX, np.float32)
     skfft.ifft(xf_gpu, x_gpu, self.ifftplan, False)
     return x_gpu.get()
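
Note that skfft.ifft is called with scale=False, so the inverse transform is left unnormalized (cuFFT convention); a round trip of fft followed by ifft must be divided by the number of points. A sketch, where `op` is a hypothetical object with these fft/ifft methods and shapeX is the real-array shape:

x_rec = op.ifft(op.fft(x)) / np.prod(op.shapeX)  # manual normalization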
Exemple #58
0
# (The opening of this snippet was truncated in extraction; the SourceModule
#  call and a rowcol_dot device function are reconstructed as a plausible sketch.)
ker = SourceModule('''
__device__ float rowcol_dot(float * matrix_a, float * matrix_b, int row, int col, int N)
{
    // dot product of row `row` of matrix_a with column `col` of matrix_b
    float val = 0;
    for (int k = 0; k < N; k++)
        val += matrix_a[row*N + k] * matrix_b[col + k*N];
    return val;
}

__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    int col = blockIdx.y*blockDim.y + threadIdx.y;

    output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
}
''')

matrix_ker = ker.get_function('matrix_mult_ker')

test_a = np.float32([xrange(1, 5)] * 4)
test_b = np.float32([xrange(14, 10, -1)] * 4)

output_mat = np.matmul(test_a, test_b)

test_a_gpu = gpuarray.to_gpu(test_a)
test_b_gpu = gpuarray.to_gpu(test_b)
output_mat_gpu = gpuarray.empty_like(test_a_gpu)

matrix_ker(test_a_gpu,
           test_b_gpu,
           output_mat_gpu,
           np.int32(4),
           block=(2, 2, 1),
           grid=(2, 2, 1))

assert (np.allclose(output_mat_gpu.get(), output_mat))
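
The fixed (2, 2, 1) block and (2, 2, 1) grid exactly tile the 4x4 matrices here; for general N, a ceiling-division helper keeps the launch correct (a sketch):

def launch_config(N, tpb=2):
    # tpb x tpb threads per block, ceil(N / tpb) blocks per grid axis
    nblocks = (N + tpb - 1) // tpb
    return (tpb, tpb, 1), (nblocks, nblocks, 1)

block, grid = launch_config(4)  # -> (2, 2, 1), (2, 2, 1)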
Exemple #59
0
 def fft(self, ff):
     x_gpu = gpuarray.to_gpu(ff)
     xf_gpu = gpuarray.empty(self.shapeK, np.complex64)
     skfft.fft(x_gpu, xf_gpu, self.fftplan, False)
     return xf_gpu.get()
Exemple #60
0
# (An opening ''' appears to have been lost in extraction; the count_facs
#  kernel below is commented out in the original source, as the stray
#  closing ''' after it suggests.)
'''
count_facs = SourceModule("""
    __global__ void count_facs(int x, unsigned int *y, int *z) {
        if (x % y[threadIdx.x]) {
            z[0] += 1;
        }
    }
""").get_function("count_facs")
'''

limit = 5000  # limit = int(input('Limit: '))
start_time = time()
with open('primes1.txt') as f:
    primes = np.fromiter(map(int,
                             f.read().strip().split(',')),
                         dtype=np.uint32)
p_gpu = gpuarray.to_gpu(primes)

antiprimes = []
most = 0
for x in gpuarray.arange(2, limit + 1, dtype=np.uint32):
    x_local = x.get()
    fac = [[], []]
    x_temp = x.copy()
    for prime in p_gpu:
        more = cumath.fmod(x_temp, prime).get() == np.uintc(0)
        more_app = more
        if more:
            power = 0
        while more:
            power += 1
            x_temp = x_temp / prime