Example #1
 def execute(self):
     sender_ready = drv.from_device(self.sender_ready, (1, ), np.int8)
     while (sender_ready == 0):
         sender_ready = drv.from_device(self.sender_ready, (1, ), np.int8)
     drv.memcpy_dtod(self.tensor.tensor.gpudata, self.sender_buf,
                     self.tensor.tensor.size * self.op.dtype.itemsize)
     drv.memset_d8(self.sender_ready, 0, 1)
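Example #1 above is the hand-off idiom shared by several of these snippets: a one-byte flag in device memory is polled with from_device, the payload is then moved with memcpy_dtod, and the flag is cleared with memset_d8. Below is a minimal, self-contained sketch of just the flag part, assuming only numpy and pycuda; the ready_flag buffer and the fake producer step are my own, for illustration.

import numpy as np
import pycuda.autoinit          # creates a context on the default device
import pycuda.driver as drv

# A one-byte "ready" flag living in device memory, initially 0.
ready_flag = drv.to_device(np.zeros(1, dtype=np.int8))

# In the real code another process or kernel raises the flag; fake it here.
drv.memset_d8(ready_flag, 1, 1)

# Poll the flag exactly as the snippet does: from_device copies the single
# byte back to the host as a (1,) int8 array.
while drv.from_device(ready_flag, (1,), np.int8)[0] == 0:
    pass

# Reset the flag so the next hand-off starts from a known state.
drv.memset_d8(ready_flag, 0, 1)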
Example #2
def setTotalDensity(data_dir, mf, global_vars):
    global RhoField, PhaseField, xSize
    blockX, blockY, blockZ = global_vars["blockX"], global_vars[
        "blockY"], global_vars["blockZ"]
    gridX, gridY, gridZ = global_vars["gridX"] * global_vars[
        "num_GPUs"], global_vars["gridY"], global_vars["gridZ"]
    QuantumState = np.load(data_dir)
    xSize, ySize, zSize = QuantumState.shape[0], QuantumState.shape[
        1], QuantumState.shape[2]
    Lattice[0], Lattice[1], Lattice[2], Lattice[3] = xSize, ySize, zSize, mf
    RhoField = np.zeros((xSize, ySize), dtype=DTYPE)
    PhaseField = np.zeros((xSize, ySize), dtype=np.float64)
    gpuQuantumState = drv.to_device(QuantumState)
    gpuPhaseField = drv.to_device(PhaseField)
    gpuRhoField = drv.to_device(RhoField)
    gpuLattice = drv.to_device(Lattice)
    getTotalDensity(gpuQuantumState,
                    gpuRhoField,
                    gpuPhaseField,
                    gpuLattice,
                    block=(blockX, blockY, blockZ),
                    grid=(gridX, gridY))
    RhoField = drv.from_device(gpuRhoField, RhoField.shape, DTYPE)
    PhaseField = drv.from_device(gpuPhaseField, PhaseField.shape, np.float64)
    gpuQuantumState.free()
    gpuPhaseField.free()
    gpuRhoField.free()
    gpuLattice.free()
Example #3
def get_phir_gpu (XK, XV, surface, field, par_reac, kernel):

    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    MV = numpy.zeros(len(XK))
    L = numpy.sqrt(2*surface.Area) # Representative length
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w    = getWeights(K)
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        X_V[i]   = XV[i/K]*w[i%K]*surface.Area[i/K]
        X_Kx[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,0]
        X_Ky[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,1]
        X_Kz[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,2]
        X_Kc[i]  = XK[i/K]
        X_Vc[i]  = XV[i/K]

    toc = time.time()
    time_set = toc - tic
    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu   = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu  = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu  = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu  = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))


    get_phir = kernel.get_function("get_phir")
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu, 
            surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev, 
            numpy.int32(len(surface.xj)), numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    AI_aux = numpy.zeros(Nq, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
Example #4
def make_tracks(fig, file_name):
    global rhoMin, rhoMax
    quantum_state = np.load(file_name)
    VortField = np.zeros((xSize, ySize, spinComps), dtype=DTYPE)
    VxField = np.zeros((xSize, ySize), dtype=DTYPE)
    VyField = np.zeros((xSize, ySize), dtype=DTYPE)
    PhaseField = np.zeros((xSize, ySize), dtype=np.float64)
    VFieldAverage = np.zeros((xSize, ySize), dtype=DTYPE)
    RhoField = np.zeros((xSize, ySize), dtype=DTYPE)
    boson_field = np.zeros((xSize, ySize, spinComps), dtype=DTYPE)
    Lattice = np.zeros(4, dtype=np.int_)
    Lattice[0], Lattice[1], Lattice[2] = xSize, ySize, zSize
    gpuQField = drv.to_device(quantum_state)
    quantum_state = drv.from_device(gpuQField, quantum_state.shape, DTYPE)
    gpuVField = drv.to_device(VortField)
    gpuVxField = drv.to_device(VxField)
    gpuVyField = drv.to_device(VyField)
    gpuPhaseField = drv.to_device(PhaseField)
    gpuVFieldAverage = drv.to_device(VFieldAverage)
    gpuRhoField = drv.to_device(RhoField)
    gpuBosonField = drv.to_device(boson_field)
    gpuLattice = drv.to_device(Lattice)
    getPlotDetailsVorticity(gpuQField,
                            gpuVField,
                            gpuRhoField,
                            gpuBosonField,
                            gpuPhaseField,
                            gpuVxField,
                            gpuVyField,
                            gpuLattice,
                            block=(blockX, blockY, blockZ),
                            grid=(gridX, gridY))
    VortField = drv.from_device(gpuVField, VortField.shape, DTYPE)
    boson_field = drv.from_device(gpuBosonField, boson_field.shape, DTYPE)
    #vortex_centers = find_dark_vortex_from_boson(boson_field)
    vortex_centers = find_dark_vortex_from_vorticity(VortField)
    x = vortex_centers[:, 0]
    y = vortex_centers[:, 1]
    colors = [color_code[c] for c in vortex_centers[:, 2]]
    plt.subplot(111)
    plt.scatter(x, y, c=colors, alpha=.25, s=1)
    #putLabels('', r'$y\ \ (\ell)$', r'$\rho \ \ (\frac{1}{\ell^2})$')
    ax = plt.gca()
    ax.set_aspect('equal')
    ax.set_ylim(0, ySize)
    ax.set_xlim(0, xSize)
    # Screen density    fig.tight_layout(pad=0.4, w_pad=5.0, h_pad=1.0, rect = [.05, .05, .95, .95])

    #Free GPU memory
    gpuQField.free()
    gpuVField.free()
    gpuVxField.free()
    gpuVyField.free()
    gpuPhaseField.free()
    gpuVFieldAverage.free()
    gpuRhoField.free()
    gpuBosonField.free()
    gpuLattice.free()
    return fig
Example #5
 def execute(self):
     for i in range(len(self.op.from_id)):
         sender_ready = drv.from_device(self.sender_ready[i], (1, ),
                                        np.int8)
         while (sender_ready == 0):
             sender_ready = drv.from_device(self.sender_ready[i], (1, ),
                                            np.int8)
         drv.memset_d8(self.sender_ready[i], 0, 1)
Example #6
def exchange(nx, ny, a_gpu, b_gpu):
    nof = np.nbytes['float32']  # nbytes of float
    cuda.memcpy_htod(
        int(b_gpu),
        cuda.from_device(int(a_gpu) + (nx - 2) * ny * nof, (ny, ), np.float32))
    cuda.memcpy_htod_async(
        int(a_gpu) + (nx - 1) * ny * nof,
        cuda.from_device(int(b_gpu) + ny * nof, (ny, ), np.float32))
Example #7
def send(target, nx, ny, nz, fx_gpu, fy_gpu):
	if target < myrank: 
		offset_fx = int(fx_gpu)
		offset_fy = int(fy_gpu)
	else: 
		offset_fx = int(fx_gpu) + nx*ny*(nz-1)*nof
		offset_fy = int(fy_gpu) + nx*ny*(nz-1)*nof

	mpi.world.send(target, 0, cuda.from_device(offset_fx, (nx,ny), np.float32))
	mpi.world.send(target, 1, cuda.from_device(offset_fy, (nx,ny), np.float32))
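The exchange and send snippets above work because from_device accepts a raw integer address, so a sub-block of a larger device array can be read back by adding a byte offset to int(ptr). A small sketch of that address arithmetic for a C-ordered (nx, ny) float32 array; the array and its contents are invented for illustration.

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

nx, ny = 8, 5
nof = np.nbytes['float32']                 # bytes per element

a = np.arange(nx * ny, dtype=np.float32).reshape(nx, ny)
a_gpu = drv.to_device(a)                   # rows stay contiguous on the device

# Row i starts i*ny elements, i.e. i*ny*nof bytes, past the base pointer.
row = drv.from_device(int(a_gpu) + (nx - 2) * ny * nof, (ny,), np.float32)
assert np.allclose(row, a[nx - 2])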
Example #8
def get_heartbeat(d_lead, length, sampling_rate):
    # Kernel Parameters
    threads_per_block = 200
    num_blocks = length / threads_per_block

    # Get RR
    reduce_by = 32
    edge_signal = cuda.mem_alloc(4 * length)

    edge_detect(edge_signal,
                d_lead,
                grid=(num_blocks, 1),
                block=(threads_per_block, 1, 1))

    indecies = numpy.zeros(length / reduce_by).astype(numpy.int32)
    masks = cuda.to_device(numpy.zeros(length / reduce_by).astype(numpy.int32))
    d_index = cuda.to_device(indecies)
    index_of_peak(d_index,
                  masks,
                  edge_signal,
                  grid=(num_blocks, 1),
                  block=(threads_per_block, 1, 1))

    cd_index, c_length = compact_sparse_with_mask(d_index, masks,
                                                  length / reduce_by)

    # Allocate output
    # full_rr_signal = numpy.zeros(c_length).astype(numpy.int32)
    dev_rr = cuda.mem_alloc(c_length * 4)

    num_blocks = (c_length / threads_per_block) + 1
    get_compact_rr(dev_rr,
                   cd_index,
                   numpy.int32(sampling_rate),
                   numpy.int32(c_length),
                   grid=(num_blocks, 1),
                   block=(threads_per_block, 1, 1))

    clean_result(dev_rr,
                 numpy.int32(120),
                 numpy.int32(40),
                 numpy.int32(1),
                 numpy.int32(c_length),
                 grid=(num_blocks, 1),
                 block=(threads_per_block, 1, 1))

    moving_average_filter(dev_rr, c_length, 250)

    index = cuda.from_device(cd_index, (c_length, ), numpy.int32)
    rr = cuda.from_device(dev_rr, (c_length, ), numpy.int32)
    index[0] = index[1]

    return rr, index / float(sampling_rate * 3600)
Example #9
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
	ctx1 = cuda.Device(dev1).make_context()
	a = cuda.from_device(int(a_gpu)+(nx-2)*ny*nof, (ny,), np.float32)
	ctx1.pop()

	ctx2 = cuda.Device(dev2).make_context()
	cuda.memcpy_htod(int(b_gpu), a)
	b = cuda.from_device(int(b_gpu)+ny*nof, (ny,), np.float32)
	ctx2.pop()

	ctx1 = cuda.Device(dev1).make_context()
	cuda.memcpy_htod_async(int(a_gpu)+(nx-1)*ny*nof, b)
	ctx1.pop()
Example #10
def exchange(nx, ny, a_gpu, b_gpu, dev1, dev2):
    ctx1 = cuda.Device(dev1).make_context()
    a = cuda.from_device(int(a_gpu) + (nx - 2) * ny * nof, (ny, ), np.float32)
    ctx1.pop()

    ctx2 = cuda.Device(dev2).make_context()
    cuda.memcpy_htod(int(b_gpu), a)
    b = cuda.from_device(int(b_gpu) + ny * nof, (ny, ), np.float32)
    ctx2.pop()

    ctx1 = cuda.Device(dev1).make_context()
    cuda.memcpy_htod_async(int(a_gpu) + (nx - 1) * ny * nof, b)
    ctx1.pop()
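Examples #9 and #10 touch memory owned by two different devices, so every from_device and memcpy call is wrapped in make_context()/pop() to make the owning device current first (Example #13 below does the same with push()/pop() on a stored context). A sketch of that discipline as a small helper, assuming a single GPU is present; the read_row helper is my own.

import numpy as np
import pycuda.driver as cuda

cuda.init()
nof = np.nbytes['float32']

# Create a context on device 0, upload a small array, then pop the context
# so that nothing is current (as the multi-GPU code above leaves things).
ctx0 = cuda.Device(0).make_context()
a = np.arange(20, dtype=np.float32).reshape(4, 5)
a_gpu = cuda.to_device(a)
ctx0.pop()

def read_row(ctx, a_gpu, row, ny):
    # Make the owning device current, copy one ny-long row back, restore.
    ctx.push()
    data = cuda.from_device(int(a_gpu) + row * ny * nof, (ny,), np.float32)
    ctx.pop()
    return data

print(read_row(ctx0, a_gpu, 2, 5))   # row 2 of a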
Example #11
 def send(s, rank, tag_mark, direction):
     if direction == 'f': offset_gpu = int(s.arr_gpu) + s.ny * nof
     elif direction == 'b':
         offset_gpu = int(s.arr_gpu) + (s.nx - 2) * s.ny * nof
     print type(offset_gpu)
     mpi.world.send(rank, tag_mark,
                    cuda.from_device(offset_gpu, (s.ny, ), s.dtype))
Example #12
  def evaluate(self, params, returnOutputs=False):
    """Evaluate several networks (with given params) on training set.
    
    @param params: network params
    @type params: list of Parameters
    @param returnOutputs: return network output values (debug)
    @type returnOutputs: bool, default False
    
    @return output matrix if returnOutputs=True, else None
    """
    if self.popSize != len(params):
      raise ValueError("Need %d Parameter structures (provided %d)" % (
        self.popSize, len(params)))
    
    paramArrayType = Parameters * len(params)
    driver.memcpy_htod(self.params, paramArrayType(*params))

    # TODO: remove
    driver.memset_d8(self.outputs, 0, self.popSize * self.trainSet.size * 4)
    
    self.evaluateKernel.prepared_call(self.evaluateGridDim,
                                      self.trainSetDev,
                                      self.trainSet.size,
                                      self.params,
                                      self.popSize,
                                      self.outputs)

    driver.Context.synchronize()

    self.outputsMat = driver.from_device(self.outputs,
                                         shape=(self.popSize, self.trainSet.size),
                                         dtype=np.float32)
    
    if returnOutputs:
      return self.outputsMat
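Example #12 reads a popSize x trainSet.size block of float32 outputs back as a 2D array simply by passing a 2-tuple shape to from_device, after zeroing the buffer with memset_d8 (the count is in bytes, hence the factor of 4 per float32). A stripped-down sketch of that readback with made-up sizes:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

pop_size, n_samples = 4, 6
outputs = drv.mem_alloc(pop_size * n_samples * 4)        # float32 buffer

# Zero the buffer byte-wise; all-zero bytes are also 0.0f when read as floats.
drv.memset_d8(outputs, 0, pop_size * n_samples * 4)

# ... a kernel would fill the buffer here ...

outputs_mat = drv.from_device(outputs,
                              shape=(pop_size, n_samples),
                              dtype=np.float32)
print(outputs_mat.shape)                                 # (4, 6)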
Example #13
 def pullVort(self):
     self.context.push()
     Vort = drv.from_device(self.QField,
                            (self.xSize, self.ySize, self.zSize, 10),
                            dtype=np.int_)
     self.context.pop()
     return Vort
Example #14
        def test_stub(shift, trials=10, rounds=1):
            # Run once so that evt_a doesn't include initialization time
            sorter.multisort(dout_a, dout_b, dkeys, count, shift,
                             rounds, stream=stream)
            evt_a = cuda.Event().record(stream)
            for i in range(trials):
                buf = sorter.multisort(dout_a, dout_b, dkeys, count, shift,
                             rounds, stream=stream)
            evt_b = cuda.Event().record(stream)
            evt_b.synchronize()
            dur = evt_b.time_since(evt_a) / (rounds * trials)
            print '%6.1f,\t%4.0f,\t%4.0f' % (dur, count / (dur * 1000),
                    count * sorter.radix_bits / (dur * 32 * 1000))

            if shift == 0 and correctness:
                print '\nTesting correctness'
                out = cuda.from_device(buf, (count,), np.uint32)
                sort = np.sort(keys)
                if np.all(out == sort):
                    print 'Correct'
                else:
                    nz = np.nonzero(out != sort)[0]
                    print sorted(set(nz >> 13))
                    for i in nz:
                        print i, out[i-1:i+2], sort[i-1:i+2]
                    assert False, 'Oh no'
Example #15
    def get_from_device(self, index_list=None):
        '''
        Copy array data from GPU device and wrap in a numpy arrays.

        If index_list is None, return list of numpy arrays (one/array).
        If index_list is a single integer, return single numpy array.
        If index_list is an iterable, list of numpy arrays
                (one/selected array).
        '''
        single = False
        if index_list is None:
            index_list = range(len(self.data))
        else:
            try:
                int(index_list)
                index_list = [index_list]
                single = True
            except TypeError:
                pass
        results = []
        try:
            for i in index_list:
                results.append(cuda.from_device(self.data[i], self.shapes[i],
                            self.dtypes[i]))
        except cuda.LaunchError:
            import traceback
            traceback.print_exc()
            traceback.print_stack()
            raise ValueError, 'Invalid device pointer: %d' % i
        if single:
            return results[0]
        else:
            return results
Example #16
 def send(s, rank, tag_mark, direction):
     if direction == "f":
         offset_gpu = int(s.arr_gpu) + s.ny * nof
     elif direction == "b":
         offset_gpu = int(s.arr_gpu) + (s.nx - 2) * s.ny * nof
     print type(offset_gpu)
     comm.send(rank, tag_mark, cuda.from_device(offset_gpu, (s.ny,), s.dtype))
Example #17
    def train_gpu(self, num_iter, model_file_path):
        if self.batch == 0:
            # Prepare to send the numpy array to gpu
            self.syn1_gpu = cuda.to_device(self.syn1)
            # Create word idx and related data-structure.
            self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
            word_rep_ptr = int(self.base_word_rep)
            self.word_reps = {}
            for w_idx, word in sorted(self.dictionary.items()):
                word_code = 1-2*self.words_rep[word][0].astype(dtype=np.int32)
                word_point = self.words_rep[word][1].astype(dtype=np.int32)
                self.word_reps[w_idx] = WordRep(word_code, word_point, word_rep_ptr)
                word_rep_ptr += WordRep.memsize
            print "GPU transfers done."


        self.sent_reps_gpu = cuda.to_device(self.sent_reps)
        # Prepare sentences for GPU transfer.
        idx_sentences = [[self.dictionary.token2id[word] for word in sentence if word in self.dictionary]
                         for sentence in self.sentences]

        # Prepare the kernel function
        kernel = self.kernel_str.get_function("train_sg")
        words = np.empty(self.num_sents, dtype=np.int32)
        # sent_reps = np.copy(self.sent_reps)
        for iter in range(num_iter):
            # Sample words for each sentence and transfer to GPU
            for s_idx in range(self.num_sents):
                words[s_idx] = random.choice(idx_sentences[s_idx])
            words_gpu = cuda.to_device(words)
            kernel(self.sent_reps_gpu, np.float32(self.alpha), words_gpu, self.base_word_rep, self.syn1_gpu,
                   block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
            # autoinit.context.synchronize()
        self.sent_reps = cuda.from_device(self.sent_reps_gpu, self.sent_reps.shape, self.sent_reps.dtype)
        pickle_dump(self.sent_reps, model_file_path)
Example #18
def print_arr_gpus(s):
	s.send_result()
	if mpi.rank == 0: 
		result = cuda.from_device(s.arr_gpu,s.shape,s.dtype)
		for i in range(1,ngpu): 
			result = np.concatenate((result,mpi.world.recv(i,10)))
		for i in xrange(s.ny):
			print result[:s.nx,i],'\t',result[s.nx:2*s.nx,i],'\t',result[2*s.nx:,i]
Example #20
def P2P_gpu(surfSrc, surfTar, m, mx, my, mz, mKc, mVc, K_gpu, V_gpu, 
            surf, LorY, K_diag, IorE, L, w, param, timing, kernel):

    tic = cuda.Event() 
    toc = cuda.Event() 

    tic.record()
    REAL = param.REAL
    mDev   = cuda.to_device(m.astype(REAL))
    mxDev  = cuda.to_device(mx.astype(REAL))
    myDev  = cuda.to_device(my.astype(REAL))
    mzDev  = cuda.to_device(mz.astype(REAL))
    mKcDev = cuda.to_device(mKc.astype(REAL))
    mVcDev = cuda.to_device(mVc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3


    tic.record()
    GSZ = int(ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    direct_gpu = kernel.get_function("P2P")
    AI_int = cuda.to_device(zeros(param.Nround, dtype=int32))

    # GPU arrays are flattened, need to point to first element 
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays 
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays

    # Check if internal or external to send correct singular integral
    if IorE==1:
        sglInt = surfSrc.sglInt_intDev
    else:
        sglInt = surfSrc.sglInt_extDev


    direct_gpu(K_gpu, V_gpu, surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mxDev, myDev, mzDev, 
                mKcDev, mVcDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev, sglInt,
                surfSrc.vertexDev, int32(ptr_offset), int32(ptr_list), 
                int32(LorY), REAL(param.kappa), REAL(param.threshold),
                int32(param.BlocksPerTwig), int32(param.NCRIT), REAL(K_diag), AI_int, 
                surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3


    tic.record()
    AI_aux = zeros(param.Nround, dtype=int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
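Example #20 brackets both the host-to-device transfers and the kernel launch with pairs of cuda.Event objects; time_till returns milliseconds, hence the 1e-3 factor when accumulating seconds. The timing idiom in isolation, with a placeholder transfer standing in for the timed work:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

tic = cuda.Event()
toc = cuda.Event()

tic.record()
# --- timed section: transfers and/or kernel launches go here ---
dummy = cuda.to_device(np.zeros(1 << 20, dtype=np.float32))
# ----------------------------------------------------------------
toc.record()
toc.synchronize()                      # wait until toc has actually happened

elapsed_s = tic.time_till(toc) * 1e-3  # time_till reports milliseconds
print(elapsed_s)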
Example #21
def print_arr_gpus(ngpu, nx, ny, a_gpu):
	send_result(nx, ny, a_gpu)
	if mpi.rank == 0: 
		result = cuda.from_device(a_gpu, (nx,ny), 'float32')
		print ngpu
		for i in range(1,ngpu): 
			result = np.concatenate((result, mpi.world.recv(i,10)))
		for i in xrange(ny):
			print result[:nx,i],'\t',result[nx:2*nx,i],'\t',result[2*nx:,i]
Example #22
def print_arr_gpus(ngpu, nx, ny, a_gpu):
    send_result(nx, ny, a_gpu)
    if mpi.rank == 0:
        result = cuda.from_device(a_gpu, (nx, ny), 'float32')
        print ngpu
        for i in range(1, ngpu):
            result = np.concatenate((result, mpi.world.recv(i, 10)))
        for i in xrange(ny):
            print result[:nx, i], '\t', result[nx:2 * nx,
                                               i], '\t', result[2 * nx:, i]
Example #23
def saveVorticityGPU():
  num_GPUs, directory_name = global_vars["num_GPUs"], global_vars["base_directory_name"]
  vorticity = drv.from_device(gpu[0].gpuVortField, gpu[0].vortField.shape, dtype = np.int_)
  for i in xrange(1,global_vars["num_GPUs"]):
    vortI = gpu[i].pullVort()
    vorticity = np.concatenate((vorticity,vortI))
  direct = directory_name + "/Extra/"
  if not os.path.exists(direct):
      os.makedirs(direct)
  np.save(direct+"vorticity.npy", vorticity)
Example #24
def get_heartbeat(d_lead, length, sampling_rate):
    # Kernel Parameters
    threads_per_block = 200
    num_blocks = length / threads_per_block


    # Get RR
    reduce_by = 32
    edge_signal = cuda.mem_alloc(4 * length)
    
    edge_detect(edge_signal, d_lead,
                grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    indecies = numpy.zeros(length / reduce_by).astype(numpy.int32)
    masks = cuda.to_device(numpy.zeros(length / reduce_by).astype(numpy.int32))
    d_index = cuda.to_device(indecies)
    index_of_peak(d_index, masks, edge_signal,
                  grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    cd_index, c_length = compact_sparse_with_mask(d_index, masks, length / reduce_by)

    # Allocate output
    # full_rr_signal = numpy.zeros(c_length).astype(numpy.int32)
    dev_rr = cuda.mem_alloc(c_length * 4)

    num_blocks = (c_length / threads_per_block) + 1
    get_compact_rr(dev_rr,
                   cd_index,
                   numpy.int32(sampling_rate),
                   numpy.int32(c_length),
                   grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    clean_result(dev_rr, numpy.int32(120), numpy.int32(40),
                 numpy.int32(1), numpy.int32(c_length),
                 grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    moving_average_filter(dev_rr, c_length, 250)

    index = cuda.from_device(cd_index, (c_length,), numpy.int32)
    rr = cuda.from_device(dev_rr, (c_length,), numpy.int32)
    index[0] = index[1]

    return rr, index / float(sampling_rate * 3600)
Example #25
  def lift(self, n):
    """Returns (positive rate within n largest) / (overall positive rate) for
       each individual.
    
    @return list of counts, in order of individuals
    """
    self.countKernel.prepared_call(self.countGridDim,
                                   self.outputs,
                                   self.trainSet.size,
                                   len(self.trainSet.positives),
                                   self.popSize,
                                   self.thresholds,
                                   self.counts)
    
    driver.Context.synchronize()

    countsMat = driver.from_device(self.counts,
                                   shape=(self.popSize, self.countBlockDim[0]),
                                   dtype=np.uint32)
    #log.debug("counts %r: %s", countsMat.shape, str(countsMat))
    log.debug("count sum over threads: %s", str(countsMat.sum(axis=1)))
    
    self.countSums = countsMat.sum(axis=1)
    
    self.nlargestPositiveRate = np.float32(self.countSums) / n
    log.debug("positive rate (n largest outputs): %s", str(self.nlargestPositiveRate))
    
    overallPositiveRate = float(len(self.trainSet.positives)) / float(self.trainSet.size)
    log.debug("positive rate (overall): %.04f", overallPositiveRate)
    
    lifts = self.nlargestPositiveRate / overallPositiveRate
    
    sortedLifts = sorted(enumerate(lifts), key=lambda (i, l): l, reverse=True)
    topIndex, topLift = sortedLifts[0]
    
    topOutputs = self.outputsMat[topIndex]
    
    nans = np.sum(np.isnan(topOutputs))
    neginfs = np.sum(np.isneginf(topOutputs))
    posinfs = np.sum(np.isposinf(topOutputs))
    omin = np.nanmin(topOutputs)
    omax = np.nanmax(topOutputs)
    threshold = self.thresholdsMat[topIndex]
    
    """
    log.info("The top ANN's outputs are:")
    log.info(
      "  %.02f%% NaN, %.02f%% -inf, %.02f%% +inf, min %.02e, max %.02e, thresh %.02e",
      100.0 * nans / len(topOutputs),
      100.0 * neginfs / len(topOutputs),
      100.0 * posinfs / len(topOutputs),
      omin, omax, threshold)
    """
    
    return lifts
Example #26
 def get(self):
     """
     Returns
     -------
     numpy.array
         nx5 of d=1 simplices
         containing: [index1, index2, dist, sigma1, sigma2] with sigma 1 < sigma 2
     """
     self.result = drv.from_device(self.k_simplices_ptr,
                                   self.k_simplices.shape, np.float32)
     return self.result
Example #27
def task1(grid, grid_width, grid_height):
    list_ptr = grid2list(grid, grid_width, grid_height)
    grid_size = grid_width * grid_height
    shp = (grid_size, )
    typ = np.float32
    xsum_ptr = cuda.to_device(np.zeros(shp, dtype=typ))
    ysum_ptr = cuda.to_device(np.zeros(shp, dtype=typ))
    print "xsum initialized to ", cuda.from_device(xsum_ptr, shp, typ)
    print "ysum initialized to ", cuda.from_device(ysum_ptr, shp, typ)
    func = mod.get_function("task1")
    func(list_ptr,
         xsum_ptr,
         ysum_ptr,
         grid=(grid_size, 1, 1),
         block=(32, 1, 1))
    pycuda.autoinit.context.synchronize()
    res_xsum = cuda.from_device(xsum_ptr, shp, typ)
    res_ysum = cuda.from_device(ysum_ptr, shp, typ)
    #xsum_ptr.free(), ysum_ptr.free()
    return xsum_ptr, ysum_ptr, res_xsum, res_ysum
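Example #27 shows the whole allocate, launch, read-back cycle: zero-filled buffers go up with to_device, the compiled kernel writes into them, and from_device brings the results home after a context synchronize. A complete, runnable sketch of that cycle built around a trivial kernel of my own (the scale kernel and the sizes are not from task1):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(float *out, const float *in, float factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i] * factor;
}
""")
scale = mod.get_function("scale")

n = 1024
host_in = np.arange(n, dtype=np.float32)
d_in = cuda.to_device(host_in)            # copies host_in into a new allocation
d_out = cuda.mem_alloc(host_in.nbytes)    # raw, uninitialized output buffer

scale(d_out, d_in, np.float32(2.0), np.int32(n),
      block=(256, 1, 1), grid=((n + 255) // 256, 1))

result = cuda.from_device(d_out, (n,), np.float32)
assert np.allclose(result, host_in * 2.0)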
Example #28
def nlargest_cpu(ann, n):
  """CPU implementation of nlargest."""
  outputs = driver.from_device(ann.outputs,
                               shape=(ann.popSize, ann.trainSize),
                               dtype=np.float32)

  thresholds = []
  for row in outputs:
    sortedRow = sorted(row, reverse=True)
    thresholds.append(sortedRow[n])

  return thresholds
Example #29
def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu, 
            surf, LorY, w, param, timing, kernel):

    if param.GPU==1:
        tic = cuda.Event() 
        toc = cuda.Event() 
    else:
        tic = Event()
        toc = Event()

    tic.record()
    REAL = param.REAL
    mDev   = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3


    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element 
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays 
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays


    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu, 
                surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mKtcDev, 
                surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev, 
                surfSrc.vertexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list), 
                numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), AI_int, 
                surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3


    tic.record()
    AI_aux = numpy.zeros(param.Nround, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
Example #30
def task2(grid, grid_width, grid_height, scaling):
    mod = SourceModule(sourcestr.format(grid_width, grid_height))
    xsum_ptr, ysum_ptr, res_xsum, res_ysum= task1(grid, grid_width, grid_height)
    print "task 1 xsum", res_xsum
    print "task 1 ysum", res_ysum
    grid_size = grid_width * grid_height
    shp = (grid_size,)
    typ = np.float32
    xsum_out_ptr = cuda.to_device(np.zeros(shp, dtype=typ))
    ysum_out_ptr = cuda.to_device(np.zeros(shp, dtype=typ))
    func = mod.get_function("task2")
    func(xsum_ptr, ysum_ptr, xsum_out_ptr, ysum_out_ptr, grid=(grid_size, 1, 1), block=(1024,1,1))
    pycuda.autoinit.context.synchronize()
    res_xsum = cuda.from_device(xsum_out_ptr, shp, typ)
    res_ysum = cuda.from_device(ysum_out_ptr, shp, typ)
    for y in range(grid_height):
        for x in range(grid_width):
            cur_avgx, cur_avgy = res_xsum[x+y*grid_width], res_ysum[x+y*grid_width]
            for car in grid[(x,y)]:
                # car.vx = -car.vx
                # car.vy = -car.vy
                # car.vx += cur_avgx
                # car.vy += cur_avgy
                car.add_velocity( scale((cur_avgx, cur_avgy), scaling) )
Example #31
def go_sort_old(count, stream=None):
    data = np.fromstring(np.random.bytes(count), dtype=np.uint8)
    ddata = cuda.to_device(data)
    print 'Done seeding'

    grids = count / 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)

    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    #dsplit = cuda.to_device(pfxs)
    #launch('crappy_split', dpfxs, dsplit,
            #block=(32, 8, 1), grid=(grids / 256, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)
    #if not stream:
        #split = cuda.from_device_like(dsplit, pfxs)
        #split_ = cuda.from_device_like(dsplit_, pfxs)
        #print np.all(split == split_)

    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            dshortseg_pfxs, dshortseg_sums,
            block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)

    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8', ddata, dsorted, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
            block=(32, 32, 1), grid=(grids, 1), stream=stream)
    if not stream:
        sorted = cuda.from_device(dsorted, (count,), np.int32)
        f = lambda r: ''.join(['\n\t%3d %4d %4d' % v for v in r])
        sort_stat = f(rle(sorted))
        with open('dev.txt', 'w') as fp: fp.write(sort_stat)

        sorted_np = np.sort(data)
        np_stat = f(rle(sorted_np))
        with open('cpu.txt', 'w') as fp: fp.write(np_stat)

        print 'is_sorted?', np.all(sorted == sorted_np)
Example #32
    def train_gpu(self, num_iter, model_file_path):
        if self.batch == 0:
            # Prepare to send the numpy array to gpu
            self.syn1_gpu = cuda.to_device(self.syn1)

            # Create word idx and related data-structure.
            self.base_word_rep = cuda.mem_alloc(
                len(self.dictionary) * WordRep.memsize)
            word_rep_ptr = int(self.base_word_rep)
            self.word_reps = {}
            for w_idx, word in sorted(self.dictionary.items()):
                word_code = 1 - 2 * self.words_rep[word][0].astype(
                    dtype=np.int32)
                word_point = self.words_rep[word][1].astype(dtype=np.int32)
                self.word_reps[w_idx] = WordRep(word_code, word_point,
                                                word_rep_ptr)
                word_rep_ptr += WordRep.memsize
            print "GPU transfers done."

        self.sent_reps_gpu = cuda.to_device(self.sent_reps)
        # Prepare sentences for GPU transfer.
        idx_sentences = [[
            self.dictionary.token2id[word] for word in sentence
            if word in self.dictionary
        ] for sentence in self.sentences]

        # Prepare the kernel function
        kernel = self.kernel_str.get_function("train_sg")
        words = np.empty(self.num_sents, dtype=np.int32)
        # sent_reps = np.copy(self.sent_reps)
        for iter in range(num_iter):
            # Sample words for each sentence and transfer to GPU
            for s_idx in range(self.num_sents):
                words[s_idx] = random.choice(idx_sentences[s_idx])
            words_gpu = cuda.to_device(words)
            kernel(self.sent_reps_gpu,
                   np.float32(self.alpha),
                   words_gpu,
                   self.base_word_rep,
                   self.syn1_gpu,
                   block=(self.size, 1, 1),
                   grid=(self.num_sents, 1, 1))
            # autoinit.context.synchronize()
        self.sent_reps = cuda.from_device(self.sent_reps_gpu,
                                          self.sent_reps.shape,
                                          self.sent_reps.dtype)
        pickle_dump(self.sent_reps, model_file_path)
Example #33
def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time

    nthreads = blockwidth * nblocks
    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)

    mod = SourceModule(assemble_code(mwctestlib))

    for trial in range(2):
        print "Trial %d, on CPU: " % trial,
        sums = np.zeros(nthreads, dtype=np.uint64)
        ctime = time.time()
        mults = seeds[:, 0].astype(np.uint64)
        states = seeds[:, 1]
        carries = seeds[:, 2]
        for i in range(rounds):
            step = np.frombuffer((mults * states + carries).data,
                                 dtype=np.uint32).reshape((nthreads, 2))
            states[:] = step[:, 0]
            carries[:] = step[:, 1]
            sums += states

        ctime = time.time() - ctime
        print "Took %g seconds." % ctime

        print "Trial %d, on device: " % trial,
        dsums = cuda.mem_alloc(8 * nthreads)
        fun = mod.get_function("test_mwc")
        dtime = fun(dseeds,
                    dsums,
                    np.float32(rounds),
                    block=(blockwidth, 1, 1),
                    grid=(nblocks, 1),
                    time_kernel=True)
        print "Took %g seconds." % dtime
        dsums = cuda.from_device(dsums, nthreads, np.uint64)
        if not np.all(np.equal(sums, dsums)):
            print "Sum discrepancy!"
            print sums
            print dsums
Example #34
def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time

    nthreads = blockwidth * nblocks
    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)

    mod = SourceModule(assemble_code(mwctestlib))

    for trial in range(2):
        print "Trial %d, on CPU: " % trial,
        sums = np.zeros(nthreads, dtype=np.uint64)
        ctime = time.time()
        mults = seeds[0].astype(np.uint64)
        states = seeds[1]
        carries = seeds[2]
        for i in range(rounds):
            step = np.frombuffer((mults * states + carries).data,
                       dtype=np.uint32).reshape((2, nthreads), order='F')
            states[:] = step[0]
            carries[:] = step[1]
            sums += states

        ctime = time.time() - ctime
        print "Took %g seconds." % ctime

        print "Trial %d, on device: " % trial,
        dsums = cuda.mem_alloc(8*nthreads)
        fun = mod.get_function("test_mwc")
        dtime = fun(dseeds, dsums, np.float32(rounds),
                    block=(blockwidth,1,1), grid=(nblocks,1),
                    time_kernel=True)
        print "Took %g seconds." % dtime
        dsums = cuda.from_device(dsums, nthreads, np.uint64)
        if not np.all(np.equal(sums, dsums)):
            print "Sum discrepancy!"
            print sums
            print dsums
Example #35
        def test_stub(shift, trials=10, rounds=1):
            # Run once so that evt_a doesn't include initialization time
            sorter.multisort(dout_a,
                             dout_b,
                             dkeys,
                             count,
                             shift,
                             rounds,
                             stream=stream)
            evt_a = cuda.Event().record(stream)
            for i in range(trials):
                buf = sorter.multisort(dout_a,
                                       dout_b,
                                       dkeys,
                                       count,
                                       shift,
                                       rounds,
                                       stream=stream)
            evt_b = cuda.Event().record(stream)
            evt_b.synchronize()
            dur = evt_b.time_since(evt_a) / (rounds * trials)
            print '%6.1f,\t%4.0f,\t%4.0f' % (dur, count / (dur * 1000),
                                             count * sorter.radix_bits /
                                             (dur * 32 * 1000))

            if shift == 0 and correctness:
                print '\nTesting correctness'
                out = cuda.from_device(buf, (count, ), np.uint32)
                sort = np.sort(keys)
                if np.all(out == sort):
                    print 'Correct'
                else:
                    nz = np.nonzero(out != sort)[0]
                    print sorted(set(nz >> 13))
                    for i in nz:
                        print i, out[i - 1:i + 2], sort[i - 1:i + 2]
                    assert False, 'Oh no'
Example #36
 def _print_interp_knots(self, rdr, tsidx=5):
     infos = cuda.from_device(self.info_a.d_params,
             (tsidx + 1, len(rdr.packer)), f32)
     for i, n in zip(infos[-1], rdr.packer.packed):
         print '%60s %g' % ('_'.join(n), i)
Example #37
def project_Kt(XKt, LorY, surfSrc, surfTar, Kt_diag, self, param, ind0, timing,
               kernel):
    """
    It computes the adjoint double layer potential.

    Arguments
    ----------
    XKt    : array, input for the adjoint double layer potential.
    LorY   : int, Laplace (1) or Yukawa (2).
    surfSrc: class, source surface, the one that contains the gauss points.
    surfTar: class, target surface, the one that contains the collocation points.
    Kt_diag: array, diagonal elements of the adjoint double layer integral
                    operator.
    self   : int, position in the surface array of the source surface.
    param  : class, parameters related to the surface.
    ind0   : array, it contains the indices related to the treecode computation.
    timing : class, it contains timing information for different parts of the
                    code.
    kernel : pycuda source module.

    Returns
    --------
    Kt_lyr: array, adjoint double layer potential.
    """

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)

    tic.record()
    K = param.K
    w = getWeights(K)
    X_Kt = numpy.zeros(Ns * K)
    X_Ktc = numpy.zeros(Ns * K)

    NsK = numpy.arange(Ns * K)
    X_Kt[:] = XKt[NsK // K] * w[NsK % K] * surfSrc.area[NsK // K]
    X_Ktc[:] = XKt[NsK // K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc) * 1e-3

    tic.record()
    C = 0
    X_aux = numpy.zeros(Ns * K)
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_Kt,
                 X_aux, X_aux, X_aux, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc) * 1e-3

    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc) * 1e-3

    tic.record()
    X_Kt = X_Kt[surfSrc.sortSource]
    X_Ktc = X_Ktc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    param.Nround = len(surfTar.twig) * param.NCRIT
    Ktx_aux = numpy.zeros(param.Nround)
    Kty_aux = numpy.zeros(param.Nround)
    Ktz_aux = numpy.zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_aux, Kty_aux, Ktz_aux = M2PKt_sort(
                surfSrc, surfTar, Ktx_aux, Kty_aux, Ktz_aux, self,
                ind0.index_large, param, LorY, timing)

        Ktx_aux, Kty_aux, Ktz_aux = P2PKt_sort(surfSrc, surfTar, X_Kt, X_Ktc,
                                               Ktx_aux, Kty_aux, Ktz_aux, self,
                                               LorY, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        Ktx_gpu = cuda.to_device(Ktx_aux.astype(REAL))
        Kty_gpu = cuda.to_device(Kty_aux.astype(REAL))
        Ktz_gpu = cuda.to_device(Ktz_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_gpu, Kty_gpu, Ktz_gpu = M2PKt_gpu(surfSrc, surfTar, Ktx_gpu,
                                                  Kty_gpu, Ktz_gpu, self, ind0,
                                                  param, LorY, timing, kernel)

        Ktx_gpu, Kty_gpu, Ktz_gpu = P2PKt_gpu(surfSrc, surfTar, X_Kt, X_Ktc,
                                              Ktx_gpu, Kty_gpu, Ktz_gpu, self,
                                              LorY, w, param, timing, kernel)

        tic.record()
        Ktx_aux = cuda.from_device(Ktx_gpu, len(Ktx_aux), dtype=REAL)
        Kty_aux = cuda.from_device(Kty_gpu, len(Kty_aux), dtype=REAL)
        Ktz_aux = cuda.from_device(Ktz_gpu, len(Ktz_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc) * 1e-3

    tic.record()
    Kt_lyr = (Ktx_aux[surfTar.unsort]*surfTar.normal[:,0] +
              Kty_aux[surfTar.unsort]*surfTar.normal[:,1] +
              Ktz_aux[surfTar.unsort]*surfTar.normal[:,2])

    if abs(Kt_diag) > 1e-12:  # if same surface
        Kt_lyr += Kt_diag * XKt

    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    return Kt_lyr
Example #38
 def send_result(s):
     mpi.world.send(0, 10, cuda.from_device(s.arr_gpu, s.shape, s.dtype))
Example #39
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE,
            self, param, ind0, timing, kernel):

    tic = cuda.Event()
    toc = cuda.Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    Nt = len(surfTar.triangle)
    L = numpy.sqrt(2*surfSrc.Area) # Representative length

    tic.record()
    K = param.K
    w = getWeights(K)
    X_V  = numpy.zeros(Ns*K)
    X_Kx = numpy.zeros(Ns*K)
    X_Ky = numpy.zeros(Ns*K)
    X_Kz = numpy.zeros(Ns*K)
    X_Kc = numpy.zeros(Ns*K)
    X_Vc = numpy.zeros(Ns*K)

    NsK = numpy.arange(Ns*K)
    X_V[:]  = XV[NsK/K]*w[NsK%K]*surfSrc.Area[NsK/K]
    X_Kx[:] = XK[NsK/K]*w[NsK%K]*surfSrc.Area[NsK/K]*surfSrc.normal[NsK/K,0]
    X_Ky[:] = XK[NsK/K]*w[NsK%K]*surfSrc.Area[NsK/K]*surfSrc.normal[NsK/K,1]
    X_Kz[:] = XK[NsK/K]*w[NsK%K]*surfSrc.Area[NsK/K]*surfSrc.normal[NsK/K,2]
    X_Kc[:] = XK[NsK/K]
    X_Vc[:] = XV[NsK/K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc)*1e-3

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, 
                    X_V, X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc)*1e-3


    tic.record()
    for C in reversed(range(1,len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK, ind0.index, ind0.combII, ind0.combJJ, 
                    ind0.combKK, ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small, ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc)*1e-3

    tic.record()
    X_V = X_V[surfSrc.sortSource]
    X_Kx = X_Kx[surfSrc.sortSource]
    X_Ky = X_Ky[surfSrc.sortSource]
    X_Kz = X_Kz[surfSrc.sortSource]
    X_Kc = X_Kc[surfSrc.sortSource]
    X_Vc = X_Vc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc)*1e-3

    param.Nround = len(surfTar.twig)*param.NCRIT
    K_aux  = numpy.zeros(param.Nround)
    V_aux  = numpy.zeros(param.Nround)
    AI_int = 0

    ### CPU code
    if param.GPU==0:
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self, 
                                ind0.index_large, param, LorY, timing)

        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc, X_Vc, 
                                K_aux, V_aux, self, LorY, K_diag, V_diag, IorE, L, w, param, timing)

    ### GPU code
    elif param.GPU==1:
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self,len(surfTar.twig)]>0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self, 
                                    ind0, param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc, X_Vc, 
                                K_gpu, V_gpu, self, LorY, K_diag, IorE, L, w, param, timing, kernel)

        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc)*1e-3

    tic.record()
    K_lyr = K_aux[surfTar.unsort]
    V_lyr = V_aux[surfTar.unsort]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc)*1e-3

    return K_lyr, V_lyr 
Example #40
def get_phir_gpu (XK, XV, surface, field, par_reac, kernel):

    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    MV = numpy.zeros(len(XK))
    L = numpy.sqrt(2*surface.Area) # Representative length
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w    = getWeights(K)
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        X_V[i]   = XV[i/K]*w[i%K]*surface.Area[i/K]
        X_Kx[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,0]
        X_Ky[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,1]
        X_Kz[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,2]
        X_Kc[i]  = XK[i/K]
        X_Vc[i]  = XV[i/K]
    
    toc = time.time()
    time_set = toc - tic
    
    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu   = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu  = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu  = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu  = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))


    get_phir = kernel.get_function("get_phir")
    
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu, 
            surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev, 
            int32(len(surface.xj)), int32(Nq), int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
            AI_int_gpu, int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    AI_aux = numpy.zeros(Nq, dtype=int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
Example #41
def project_Kt(XKt, LorY, surfSrc, surfTar, Kt_diag,
                self, param, ind0, timing, kernel):

    tic = cuda.Event()
    toc = cuda.Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    Nt = len(surfTar.triangle)
    L = numpy.sqrt(2*surfSrc.Area) # Representative length

    tic.record()
    K = param.K
    w    = getWeights(K)
    X_Kt = numpy.zeros(Ns*K)
    X_Ktc = numpy.zeros(Ns*K)

    NsK = numpy.arange(Ns*K)
    X_Kt[:]  = XKt[NsK/K]*w[NsK%K]*surfSrc.Area[NsK/K]
    X_Ktc[:] = XKt[NsK/K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc)*1e-3

    tic.record()
    C = 0
    X_aux = numpy.zeros(Ns*K)
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, 
                    X_Kt, X_aux, X_aux, X_aux, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc)*1e-3


    tic.record()
    for C in reversed(range(1,len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK, ind0.index, ind0.combII, ind0.combJJ, 
                    ind0.combKK, ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small, ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc)*1e-3

    tic.record()
    X_Kt = X_Kt[surfSrc.sortSource]
    X_Ktc = X_Ktc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc)*1e-3

    param.Nround = len(surfTar.twig)*param.NCRIT
    Ktx_aux  = numpy.zeros(param.Nround)
    Kty_aux  = numpy.zeros(param.Nround)
    Ktz_aux  = numpy.zeros(param.Nround)
    AI_int = 0

    ### CPU code
    if param.GPU==0:
        if surfTar.offsetMlt[self,len(surfTar.twig)]>0:
            Ktx_aux, Kty_aux, Ktz_aux = M2PKt_sort(surfSrc, surfTar, Ktx_aux, Kty_aux, Ktz_aux, self, 
                                    ind0.index_large, param, LorY, timing)

        Ktx_aux, Kty_aux, Ktz_aux = P2PKt_sort(surfSrc, surfTar, X_Kt, X_Ktc, 
                            Ktx_aux, Kty_aux, Ktz_aux, self, LorY, w, param, timing)

    ### GPU code
    elif param.GPU==1:
        Ktx_gpu = cuda.to_device(Ktx_aux.astype(REAL))
        Kty_gpu = cuda.to_device(Kty_aux.astype(REAL))
        Ktz_gpu = cuda.to_device(Ktz_aux.astype(REAL))

        if surfTar.offsetMlt[self,len(surfTar.twig)]>0:
            Ktx_gpu, Kty_gpu, Ktz_gpu = M2PKt_gpu(surfSrc, surfTar, 
                                    Ktx_gpu, Kty_gpu, Ktz_gpu, self, 
                                    ind0, param, LorY, timing, kernel)

        Ktx_gpu, Kty_gpu, Ktz_gpu = P2PKt_gpu(surfSrc, surfTar, X_Kt, X_Ktc, Ktx_gpu, Kty_gpu, Ktz_gpu, 
                                self, LorY, w, param, timing, kernel)

        tic.record()
        Ktx_aux = cuda.from_device(Ktx_gpu, len(Ktx_aux), dtype=REAL)
        Kty_aux = cuda.from_device(Kty_gpu, len(Kty_aux), dtype=REAL)
        Ktz_aux = cuda.from_device(Ktz_gpu, len(Ktz_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc)*1e-3

    tic.record()
    Kt_lyr = Ktx_aux[surfTar.unsort]*surfTar.normal[:,0] \
           + Kty_aux[surfTar.unsort]*surfTar.normal[:,1] \
           + Ktz_aux[surfTar.unsort]*surfTar.normal[:,2] 

    if abs(Kt_diag)>1e-12: # if same surface
        Kt_lyr += Kt_diag * XKt

    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc)*1e-3

    return Kt_lyr
Example #42
 def from_array(cls, a):
     foobar_array = cuda.from_device(a[0], 1, dtype=np.uint32)
     data = cuda.from_device(a[1], 10, dtype=np.int32)
     return cls(foobar_array[0], data)
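The from_array snippet above rebuilds a host-side object from two raw device pointers by calling from_device once per field. A sketch of the same round trip with invented field contents (count_dev and data_dev stand in for a[0] and a[1]):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

# Two independent device allocations standing in for the fields of a.
count_dev = cuda.to_device(np.array([7], dtype=np.uint32))
data_dev = cuda.to_device(np.arange(10, dtype=np.int32))

count = cuda.from_device(count_dev, 1, dtype=np.uint32)[0]   # scalar field
data = cuda.from_device(data_dev, 10, dtype=np.int32)        # ten int32 values
print(count, data)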
Example #43
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE, self, param,
            ind0, timing, kernel):
    """
    It computes the single and double layer potentials.

    Arguments
    ----------
    XK     : array, input for the double layer potential.
    XV     : array, input for the single layer potential.
    LorY   : int, Laplace (1) or Yukawa (2).
    surfSrc: class, source surface, the one that contains the gauss points.
    surfTar: class, target surface, the one that contains the collocation
                    points.
    K_diag : array, diagonal elements of the double layer integral operator.
    V_diag : array, diagonal elements of the single layer integral operator.
    IorE   : int, internal (1) or external (2).
    self   : int, position in the surface array of the source surface.
    param  : class, parameters related to the surface.
    ind0   : array, it contains the indices related to the treecode computation.
    timing : class, it contains timing information for different parts of the
                    code.
    kernel : pycuda source module.

    Returns
    --------
    K_lyr  : array, double layer potential.
    V_lyr  : array, single layer potential.
    """

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    L = numpy.sqrt(2 * surfSrc.area)  # Representative length

    tic.record()
    K = param.K
    w = getWeights(K)
    X_V = numpy.zeros(Ns * K)
    X_Kx = numpy.zeros(Ns * K)
    X_Ky = numpy.zeros(Ns * K)
    X_Kz = numpy.zeros(Ns * K)
    X_Kc = numpy.zeros(Ns * K)
    X_Vc = numpy.zeros(Ns * K)

    NsK = numpy.arange(Ns * K)
    X_V[:] = XV[NsK // K] * w[NsK % K] * surfSrc.area[NsK // K]
    X_Kx[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 0]
    X_Ky[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 1]
    X_Kz[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 2]
    X_Kc[:] = XK[NsK // K]
    X_Vc[:] = XV[NsK // K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc) * 1e-3

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_V,
                 X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc) * 1e-3

    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc) * 1e-3

    tic.record()
    X_V = X_V[surfSrc.sortSource]
    X_Kx = X_Kx[surfSrc.sortSource]
    X_Ky = X_Ky[surfSrc.sortSource]
    X_Kz = X_Kz[surfSrc.sortSource]
    X_Kc = X_Kc[surfSrc.sortSource]
    X_Vc = X_Vc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    param.Nround = len(surfTar.twig) * param.NCRIT
    K_aux = numpy.zeros(param.Nround)
    V_aux = numpy.zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self,
                                ind0.index_large, param, LorY, timing)

        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc,
                                X_Vc, K_aux, V_aux, self, LorY, K_diag, V_diag,
                                IorE, L, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self, ind0,
                                   param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc,
                               X_Vc, K_gpu, V_gpu, self, LorY, K_diag, IorE, L,
                               w, param, timing, kernel)

        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc) * 1e-3

    tic.record()
    K_lyr = K_aux[surfTar.unsort]
    V_lyr = V_aux[surfTar.unsort]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    return K_lyr, V_lyr
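The vectorized setup above replaces an explicit per-element loop: entry i of each length Ns*K array pairs panel i // K with quadrature weight i % K. A minimal, self-contained numpy sketch of just that expansion step, with made-up sizes and random stand-ins for XV, the weights and the panel areas:

import numpy

# Illustrative sizes only: Ns panels, K quadrature points per panel.
Ns, K = 4, 3
XV = numpy.random.rand(Ns)      # one coefficient per panel
w = numpy.random.rand(K)        # quadrature weights (stand-in for getWeights(K))
area = numpy.random.rand(Ns)    # panel areas (stand-in for surfSrc.area)

# Vectorized expansion: entry i belongs to panel i // K and weight i % K.
NsK = numpy.arange(Ns * K)
X_V = XV[NsK // K] * w[NsK % K] * area[NsK // K]

# The same expansion written as an explicit loop, as in the loop-based
# variants of this setup elsewhere in these examples.
X_V_loop = numpy.zeros(Ns * K)
for i in range(Ns * K):
    X_V_loop[i] = XV[i // K] * w[i % K] * area[i // K]

assert numpy.allclose(X_V, X_V_loop)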
Exemple #44
0
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE,
            self, param, ind0, timing, kernel):

    tic = cuda.Event()
    toc = cuda.Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    Nt = len(surfTar.triangle)
    L = sqrt(2*surfSrc.Area) # Representative length

    tic.record()
    K = param.K
    w    = getWeights(K)
    X_V  = zeros(Ns*K)
    X_Kx = zeros(Ns*K)
    X_Ky = zeros(Ns*K)
    X_Kz = zeros(Ns*K)
    X_Kc = zeros(Ns*K)
    X_Vc = zeros(Ns*K)

    NsK = arange(Ns*K)
    X_V[:]  = XV[NsK//K]*w[NsK%K]*surfSrc.Area[NsK//K]
    X_Kx[:] = XK[NsK//K]*w[NsK%K]*surfSrc.Area[NsK//K]*surfSrc.normal[NsK//K,0]
    X_Ky[:] = XK[NsK//K]*w[NsK%K]*surfSrc.Area[NsK//K]*surfSrc.normal[NsK//K,1]
    X_Kz[:] = XK[NsK//K]*w[NsK%K]*surfSrc.Area[NsK//K]*surfSrc.normal[NsK//K,2]
    X_Kc[:] = XK[NsK//K]
    X_Vc[:] = XV[NsK//K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc)*1e-3

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, 
                    X_V, X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc)*1e-3


    tic.record()
    for C in reversed(range(1,len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK, ind0.index, ind0.combII, ind0.combJJ, 
                    ind0.combKK, ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small, ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc)*1e-3

    tic.record()
    X_V = X_V[surfSrc.sortSource]
    X_Kx = X_Kx[surfSrc.sortSource]
    X_Ky = X_Ky[surfSrc.sortSource]
    X_Kz = X_Kz[surfSrc.sortSource]
    X_Kc = X_Kc[surfSrc.sortSource]
    X_Vc = X_Vc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc)*1e-3

    param.Nround = len(surfTar.twig)*param.NCRIT
    K_aux  = zeros(param.Nround)
    V_aux  = zeros(param.Nround)
    AI_int = 0

    ### CPU code
    if param.GPU==0:
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self, 
                                ind0.index_large, param, LorY, timing)

        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc, X_Vc, 
                                K_aux, V_aux, self, LorY, K_diag, V_diag, IorE, L, w, param, timing)

    ### GPU code
    elif param.GPU==1:
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self,len(surfTar.twig)]>0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self, 
                                    ind0, param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc, X_Vc, 
                                K_gpu, V_gpu, self, LorY, K_diag, V_diag, IorE, L, w, param, timing, kernel)

        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc)*1e-3

    tic.record()
    K_lyr = K_aux[surfTar.unsort]
    V_lyr = V_aux[surfTar.unsort]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc)*1e-3

    return K_lyr, V_lyr 
 def send(s, rank, tag_mark):
     a = cuda.from_device(s.arr_gpu, (6, 5), np.float32)
     mpi.world.send(rank, tag_mark, a)
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(c.nbytes)

# use the normal kernel
from pycuda.compiler import SourceModule

mod = SourceModule(kernels.replace("NN", str(nx * ny)))
cumul = mod.get_function("cumul")
cumul(a_gpu, b_gpu, c_gpu, block=(256, 1, 1), grid=(2, 1))
cuda.memcpy_dtoh(c, c_gpu)
assert np.linalg.norm(c - a * b) < 1e-6

# use the gpuarray
a_ga = gpuarray.to_gpu(a)
b_ga = gpuarray.to_gpu(b)
assert np.linalg.norm((a_ga * b_ga).get() - a * b) < 1e-6

# arguments
cumul(a_ga, b_ga, c_gpu, block=(256, 1, 1), grid=(2, 1))
cuda.memcpy_dtoh(c, c_gpu)
assert np.linalg.norm(c - a * b) < 1e-6

# sub-area memcpy from gpuarray
a_sub = np.zeros(100, dtype=np.complex64)
a_sub[:] = cuda.from_device(int(a_ga.gpudata) + 900 * np.nbytes["complex64"], (100,), "complex64")
print(a_sub)

d_g = gpuarray.zeros((10, 13), dtype=np.complex64)
print(cuda.from_device(int(d_g.gpudata) + 100 * np.nbytes["complex64"], (30,), "complex64"))
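The same sub-area copy can avoid the hard-coded np.nbytes lookup by taking the element size from the gpuarray itself; a self-contained sketch (the array contents and the 900-element offset are arbitrary here, not part of the original snippet):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

# Illustrative data; any 1-D gpuarray works the same way.
a = np.arange(1000).astype(np.complex64)
a_ga = gpuarray.to_gpu(a)

# Pull 100 elements starting at element 900 straight off the device,
# deriving the byte offset from the array's own itemsize.
offset = 900 * a_ga.dtype.itemsize
a_sub = cuda.from_device(int(a_ga.gpudata) + offset, (100,), a_ga.dtype)
assert np.allclose(a_sub, a[900:1000])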
Exemple #48
0
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """
    It computes the reaction potential on the GPU and brings the data back
    to the CPU.

    Arguments
    ----------
    XK      : array, input for the double layer potential.
    XV      : array, input for the single layer potential.
    surface : class, surface where we are computing the reaction potential.
    field   : class, information about the different regions in the molecule.
    par_reac: class, fine parameters related to the surface.
    kernel  : pycuda source module.

    Returns
    --------
    phir_cpu: array, reaction potential brought back from the GPU to the CPU.
    AI_int  : int, number of near-singular integrals solved.
    """

    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w = getWeights(K)
    X_V = numpy.zeros(N * K)
    X_Kx = numpy.zeros(N * K)
    X_Ky = numpy.zeros(N * K)
    X_Kz = numpy.zeros(N * K)
    X_Kc = numpy.zeros(N * K)
    X_Vc = numpy.zeros(N * K)

    for i in range(N * K):
        X_V[i] = XV[i // K] * w[i % K] * surface.area[i // K]
        X_Kx[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 0]
        X_Ky[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 1]
        X_Kz[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 2]
        X_Kc[i] = XK[i // K]
        X_Vc[i] = XV[i // K]

    toc = time.time()
    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    GSZ = int(numpy.ceil(float(Nq) / par_reac.BSZ))

    get_phir(phir,
             field.xq_gpu,
             field.yq_gpu,
             field.zq_gpu,
             m_gpu,
             mx_gpu,
             my_gpu,
             mz_gpu,
             mKc_gpu,
             mVc_gpu,
             surface.xjDev,
             surface.yjDev,
             surface.zjDev,
             surface.AreaDev,
             surface.kDev,
             surface.vertexDev,
             numpy.int32(len(surface.xj)),
             numpy.int32(Nq),
             numpy.int32(par_reac.K),
             xkDev,
             wkDev,
             REAL(par_reac.threshold),
             AI_int_gpu,
             numpy.int32(len(surface.xk)),
             surface.XskDev,
             surface.WskDev,
             block=(par_reac.BSZ, 1, 1),
             grid=(GSZ, 1))

    AI_aux = numpy.zeros(Nq, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
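For context, the transfer pattern bracketing the kernel call above boils down to cuda.to_device for the output buffers and cuda.from_device to read them back; a stripped-down sketch with an arbitrary Nq and the kernel launch elided:

import numpy
import pycuda.autoinit
import pycuda.driver as cuda

# Illustrative size; in get_phir_gpu, Nq is the number of charges.
Nq = 8
phir = cuda.to_device(numpy.zeros(Nq, dtype=numpy.float64))
AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))

# ... a kernel such as get_phir would fill both device buffers here ...

phir_cpu = cuda.from_device(phir, Nq, dtype=numpy.float64)
AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
AI_int = numpy.sum(AI_aux)
print(phir_cpu, AI_int)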
	def send(s, rank, tag_mark, arr_gpu):
		if mpi.rank > rank: offset_gpu = int(arr_gpu)+s.ny*nbof
		elif mpi.rank < rank: offset_gpu = int(arr_gpu)+(s.nx-2)*s.ny*nbof
		mpi.world.send(rank, tag_mark, cuda.from_device(offset_gpu, (s.ny,), s.f.dtype))
Exemple #52
0
	def print_arr_gpu(s):
	print(cuda.from_device(s.arr_gpu, s.shape, s.dtype))
 def show(s, a):
     print(a)
     print(cuda.from_device(s.arr_gpu, (6, 5), np.float32).T)
Exemple #54
0
 def __repr__(self):
     return "{}\n{}".format(
         cuda.from_device(self.x_ptr, self.shape, self.dtype),
         cuda.from_device(self.y_ptr, self.shape, self.dtype))
                             block=(256, 1, 1),
                             grid=(1, 1))

    diagonal_tr_bl_aggregate(dp_ptr,
                             cost_images_ptr,
                             rows,
                             cols,
                             block=(256, 1, 1),
                             grid=(1, 1))
    diagonal_br_tl_aggregate(dp_ptr,
                             cost_images_ptr,
                             rows,
                             cols,
                             block=(256, 1, 1),
                             grid=(1, 1))

    agg_image = drv.from_device(dp_ptr, cost_images.shape, dtype=np.float32)
    print(f"aggregation time {time() - t1}")
    #agg_image = agg_image.transpose((2,1,0))
    min_cost_im = np.argmin(agg_image, axis=0)
    gt = np.argmin(L, axis=0)
    agg_image = agg_image.transpose((2, 1, 0))
    min_cost_im += 1
    gt += 1
    im = stereo.compute_depth(np.int32(min_cost_im))
    gt_im = stereo.compute_depth(np.int32(gt))
    ##im = stereo.normalize(im, 0.1)
    ##gt_im = stereo.normalize(gt_im, 0.1)
    np.save("../out_images/testim.npy", im)
    np.save("../out_images/gtim.npy", im)
Exemple #56
0
 def __str__(self):
     return str(cuda.from_device(self.data, self.shape, self.dtype))
def send(rank, tag_mark, nx, ny, arr_gpu):
	if mpi.rank > rank: offset_gpu = int(arr_gpu)+(ny+1)*nbof
	elif mpi.rank < rank: offset_gpu = int(arr_gpu)+((nx-2)*ny+1)*nbof
	mpi.world.send(rank, tag_mark, cuda.from_device(offset_gpu, (ny-2,), dtof))
def print_gpu(s):
	#s.show()
	print(cuda.from_device(s.arr_gpu, (6, 5), np.float32))