Python gridの例、numba.cuda.grid Pythonの例

コード例 #1

0

ファイルを表示

ファイル: multigpu_mt.py プロジェクト: ContinuumIO/numbapro-examples

def kernel(dst, src):
    '''A simple kernel that adds 1 to every item
    '''
    i = cuda.grid(1)
    if i >= dst.shape[0]:
        return
    dst[i] = src[i] + 1

コード例 #2

0

ファイルを表示

ファイル: singlelink.py プロジェクト: Chiroptera/masters_code

def removeEdges(edgeList, sortedArgs, n_discarded):
    """
    inputs:
        edgeList         : list of edges
        sortedArgs         : argument list of the sorted weight list
        n_discarded     : number of edges to be discarded specified in sortedArgs

    Remove discarded edges form the edge list.
    Each edge discarded is replaced by -1.

    Discard edges specified by the last n_discarded arguments
    in the sortedArgs list.

    """

    tgid = cuda.grid(1)

    # one thread per edge that must be discarded
    # total number of edges to be discarded is the difference 
    # between the between the total number of edges and the 
    # number of edges to be considered + the number edges 
    # to be discarded

    if tgid >= n_discarded:
        return

    # remove not considered edges
    elif tgid < n_considered_edges:
        maxIdx = edgeList.size - 1 # maximum index of sortedArgs
        index = maxIdx - tgid # index of 
        edgeList[index] = -1

コード例 #3

0

ファイルを表示

ファイル: singlelink.py プロジェクト: Chiroptera/masters_code

def compute_lifetimes_CUDA(nweight, lifetimes):
    edge = cuda.grid(1)
    
    if edge >= lifetimes.size:
        return
    
    lifetimes[edge] = nweight[edge + 1] - nweight[edge]

コード例 #4

0

ファイルを表示

ファイル: singlelink.py プロジェクト: Chiroptera/masters_code

def getWeightsOfEdges_gpu(edges, n_edges, weights, nweights):
    """
    This function will take a list of edges (edges), the number of edges to 
    consider (n_edges, the weights of all the possible edges (weights) and the 
    array for the weights of the list of edges and put the weight of each edge 
    in the list of edges in the nweights, in the same position.

    The kernel will also discard not considered edges, i.e. edges whose 
    argument >= n_edges.
    Discarding an edge is done by replacing the edge by -1.
    """
    # n_edges_sm = cuda.shared.array(1, dtype = int32)
    edge = cuda.grid(1)

    if edge >= edges.size:
        return
    
    # if edge == 0:
    #     n_edges_sm[0] = n_edges[0]
    # cuda.syncthreads()
    
    
    # if edge >= n_edges_sm[0]:
    if edge >= n_edges[0]:
        edges[edge] = -1
    else:
        myEdgeID = edges[edge]
        nweights[edge] = weights[myEdgeID]

コード例 #5

0

ファイルを表示

ファイル: build.py プロジェクト: Chiroptera/masters_code

def addEdges(edges, n_edges, dest, weight, fe, od, top_edge, ndest, nweight):
    n_edges_sm = cuda.shared.array(0, dtype = int32)

    edge = cuda.grid(1)

    # if edge == 0:
    #     n_edges_sm[0] = n_edges[0]

    key = edges[edge]

    # if edge is -1 it was marked for removal
    if key == -1:
        return

    o_v = dest[key]
    i_v = binaryOriginVertexSearch_CUDA(key, dest, fe, od)

    # get and increment pointers for each vertex
    i_ptr = cuda.atomic.add(top_edge, i_v, 1)
    o_ptr =cuda.atomic.add(top_edge, o_v, 1)

    # add edges to destination array
    ndest[i_ptr] = o_v
    ndest[o_ptr] = i_v

    # add weight to edges
    edge_w = weight[key]
    nweight[i_ptr] = edge_w
    nweight[o_ptr] = edge_w

コード例 #6

0

ファイルを表示

ファイル: gpu_flat.py プロジェクト: ZhouHUB/pyIID

def get_grad_omega(grad_omega, omega, r, d, qbin):
    """
    Get the gradient of the Debye sum with respect to atomic positions

    Parameters
    ----------
    grad_omega: kx3xQ array
        The gradient
    omega: kxQ array
        Debye sum
    r: k array
        The pair distance array
    d: kx3 array
        The pair displacements
    qbin: float
        The qbin size
    """
    kmax, _, qmax_bin = grad_omega.shape
    k, qx = cuda.grid(2)
    if k >= kmax or qx >= qmax_bin:
        return
    sv = f4(qx) * qbin
    rk = r[k]
    a = (sv * math.cos(sv * rk)) - omega[k, qx]
    a /= rk * rk
    for w in range(i4(3)):
        grad_omega[k, w, qx] = a * d[k, w]

コード例 #7

0

ファイルを表示

ファイル: test_minmax.py プロジェクト: numba/numba

def builtin_max(A, B, C):
    i = cuda.grid(1)

    if i >= len(C):
        return

    C[i] = float64(max(A[i], B[i]))

コード例 #8

0

ファイルを表示

ファイル: cudautils.py プロジェクト: xennygrimmato/pygdf

def gpu_expand_mask_bits(bits, out):
    """Expand each bits in bitmask *bits* into an element in out.
    This is a flexible kernel that can be launch with any number of blocks
    and threads.
    """
    for i in range(cuda.grid(1), out.size, cuda.gridsize(1)):
        out[i] = mask_get(bits, i)

コード例 #9

0

ファイルを表示

ファイル: gpu_flat.py プロジェクト: ZhouHUB/pyIID

def d2_to_d1_sum(d1, d2):
    qx = cuda.grid(1)

    if qx >= len(d1):
        return
    tmp = d2[:, qx].sum()
    d1[qx] = tmp

コード例 #10

0

ファイルを表示

ファイル: cudautils.py プロジェクト: xennygrimmato/pygdf

def gpu_gather(data, index, out):
    i = cuda.grid(1)
    if i < index.size:
        idx = index[i]
        # Only do it if the index is in range
        if 0 <= idx < data.size:
            out[i] = data[idx]

コード例 #11

0

ファイルを表示

ファイル: accel_math.py プロジェクト: mperrin/poppy

    def cufftShift_2D_kernel(data, N):
        """
        adopted CUDA FFT shift code from:
        https://github.com/marwan-abdellah/cufftShift
        (GNU Lesser Public License)
        """

        # // 2D Slice & 1D Line
        sLine = N
        sSlice = N * N
        # // Transformations Equations
        sEq1 = int((sSlice + sLine) / 2)
        sEq2 = int((sSlice - sLine) / 2)
        x, y = cuda.grid(2)
        # // Thread Index Converted into 1D Index
        index = (y * N) + x

        if x < N / 2:
            if y < N / 2:
                # // First Quad
                temp = data[index]
                data[index] = data[index + sEq1]
                # // Third Quad
                data[index + sEq1] = temp
        else:
            if y < N / 2:
                # // Second Quad
                temp = data[index]
                data[index] = data[index + sEq2]
                data[index + sEq2] = temp

コード例 #12

0

ファイルを表示

ファイル: cuda_utils.py プロジェクト: GoldDragonMoon/SDNN_python

def lateral_inh(S, V, K_inh):

    idx, idy, idz = cuda.grid(3)
    if idx > V.shape[0] - 1:
        return
    if idy > V.shape[1] - 1:
        return
    if idz > V.shape[2] - 1:
        return

    # if neuron has not fired terminate the thread
    if S[idx, idy, idz] != 1:
        return

    # if a neuron in this position has fired before do not fire again
    if K_inh[idx, idy] == 0:
        S[idx, idy, idz] = 0
        return

    # neuron at this position but in other input map
    for k in range(V.shape[2]):
        if S[idx, idy, k] == 1 and V[idx, idy, idz] < V[idx, idy, k]:
            S[idx, idy, idz] = 0
            return
    K_inh[idx, idy] = 0

コード例 #13

0

ファイルを表示

ファイル: testilp.py プロジェクト: ContinuumIO/numbapro-examples

def vec_add_ilp_x4(a, b, c):
    # read
    i = cuda.grid(1)
    ai = a[i]
    bi = b[i]

    bw = cuda.blockDim.x
    gw = cuda.gridDim.x
    stride = gw * bw

    j = i + stride
    aj = a[j]
    bj = b[j]

    k = j + stride
    ak = a[k]
    bk = b[k]

    l = k + stride
    al = a[l]
    bl = b[l]

    # compute
    ci = core(ai, bi)
    cj = core(aj, bj)
    ck = core(ak, bk)
    cl = core(al, bl)

    # write
    c[i] = ci
    c[j] = cj
    c[k] = ck
    c[l] = cl

コード例 #14

0

ファイルを表示

def fast_matmul(A, B, C):
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x    # blocks per grid

    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
        # Preload data into shared memory
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    C[x, y] = tmp

コード例 #15

0

ファイルを表示

ファイル: testilp.py プロジェクト: ContinuumIO/numbapro-examples

def vec_add_ilp_x8(a, b, c):
    # read
    i = cuda.grid(1)
    ai = a[i]
    bi = b[i]

    bw = cuda.blockDim.x
    gw = cuda.gridDim.x
    stride = gw * bw

    j = i + stride
    aj = a[j]
    bj = b[j]

    k = j + stride
    ak = a[k]
    bk = b[k]

    l = k + stride
    al = a[l]
    bl = b[l]

    m = l + stride
    am = a[m]
    bm = b[m]

    n = m + stride
    an = a[n]
    bn = b[n]

    o = n + stride
    ao = a[o]
    bo = b[o]

    p = o + stride
    ap = a[o]
    bp = b[o]

    # compute
    ci = core(ai, bi)
    cj = core(aj, bj)
    ck = core(ak, bk)
    cl = core(al, bl)

    cm = core(am, bm)
    cn = core(an, bn)
    co = core(ao, bo)
    cp = core(ap, bp)

    # write
    c[i] = ci
    c[j] = cj
    c[k] = ck
    c[l] = cl

    c[m] = cm
    c[n] = cn
    c[o] = co
    c[p] = cp

コード例 #16

0

ファイルを表示

ファイル: test_linker.py プロジェクト: esc/numba

def function_with_lots_of_registers(x, a, b, c, d, e, f):
    a1 = 1.0
    a2 = 1.0
    a3 = 1.0
    a4 = 1.0
    a5 = 1.0
    b1 = 1.0
    b2 = 1.0
    b3 = 1.0
    b4 = 1.0
    b5 = 1.0
    c1 = 1.0
    c2 = 1.0
    c3 = 1.0
    c4 = 1.0
    c5 = 1.0
    d1 = 10
    d2 = 10
    d3 = 10
    d4 = 10
    d5 = 10
    for i in range(a):
        a1 += b
        a2 += c
        a3 += d
        a4 += e
        a5 += f
        b1 *= b
        b2 *= c
        b3 *= d
        b4 *= e
        b5 *= f
        c1 /= b
        c2 /= c
        c3 /= d
        c4 /= e
        c5 /= f
        d1 <<= b
        d2 <<= c
        d3 <<= d
        d4 <<= e
        d5 <<= f
    x[cuda.grid(1)] = a1 + a2 + a3 + a4 + a5
    x[cuda.grid(1)] += b1 + b2 + b3 + b4 + b5
    x[cuda.grid(1)] += c1 + c2 + c3 + c4 + c5
    x[cuda.grid(1)] += d1 + d2 + d3 + d4 + d5

コード例 #17

0

ファイルを表示

ファイル: cudautils.py プロジェクト: xennygrimmato/pygdf

def gpu_compact_mask_bytes(bools, bits):
    tid = cuda.grid(1)
    base = tid * mask_bitsize
    for i in range(base, base + mask_bitsize):
        if i >= bools.size:
            break
        if bools[i]:
            mask_set(bits, i)

コード例 #18

0

ファイルを表示

ファイル: cudautils.py プロジェクト: xennygrimmato/pygdf

def gpu_insert_if_masked(arr, mask, out_idx, out_queue):
    i = cuda.grid(1)
    if i < arr.size:
        diff = mask[i]
        if diff:
            wridx = cuda.atomic.add(out_idx, 0, 1)
            if wridx < out_queue.size:
                out_queue[wridx] = arr[i]

コード例 #19

0

ファイルを表示

ファイル: test_powi.py プロジェクト: ASPP/numba

def cu_mat_power_binop(A, power, power_A):
    y, x = cuda.grid(2)

    m, n = power_A.shape
    if x >= n or y >= m:
        return

    power_A[y, x] = A[y, x] ** power

コード例 #20

0

ファイルを表示

ファイル: test_powi.py プロジェクト: ASPP/numba

def cu_mat_power(A, power, power_A):
    y, x = cuda.grid(2)

    m, n = power_A.shape
    if x >= n or y >= m:
        return

    power_A[y, x] = math.pow(A[y, x], int32(power))

コード例 #21

0

ファイルを表示

ファイル: test_constmem.py プロジェクト: esc/numba

def cuconstRecAlign(A, B, C, D, E):
    Z = cuda.const.array_like(CONST_RECORD_ALIGN)
    i = cuda.grid(1)
    A[i] = Z[i]['a']
    B[i] = Z[i]['b']
    C[i] = Z[i]['x']
    D[i] = Z[i]['y']
    E[i] = Z[i]['z']

コード例 #22

0

ファイルを表示

ファイル: test_globals.py プロジェクト: ASPP/numba

def simple_smem(ary):
    sm = cuda.shared.array(N, int32)
    i = cuda.grid(1)
    if i == 0:
        for j in range(N):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]

コード例 #23

0

ファイルを表示

ファイル: test_random.py プロジェクト: cpcloud/numba

def rng_kernel_float64(states, out, count, distribution):
    thread_id = cuda.grid(1)

    for i in range(count):
        if distribution == UNIFORM:
            out[thread_id * count + i] = xoroshiro128p_uniform_float64(states, thread_id)
        elif distribution == NORMAL:
            out[thread_id * count + i] = xoroshiro128p_normal_float64(states, thread_id)

コード例 #24

0

ファイルを表示

ファイル: saxpy.py プロジェクト: XiaoxiaSun/numbapro-examples

def saxpy(a, x, y, out):
    # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    i = cuda.grid(1)
    # Map i to array elements
    if i >= out.size:
        # Out of range?
        return
        # Do actual work
    out[i] = a * x[i] + y[i]

コード例 #25

0

ファイルを表示

ファイル: gpu_flat.py プロジェクト: ZhouHUB/pyIID

def experimental_sum_grad_fq1(new_grad, grad, k_cov):
    k, qx = cuda.grid(2)
    if k >= len(grad) or qx >= grad.shape[2]:
        return
    i, j = cuda_k_to_ij(i4(k + k_cov))
    for tz in range(3):
        a = grad[k, tz, qx]
        cuda.atomic.add(new_grad, (j, tz, qx), a)
        cuda.atomic.add(new_grad, (i, tz, qx), f4(-1.) * a)

コード例 #26

0

ファイルを表示

def matmul(A, B, C):
    """Perform square matrix multiplication of C = A * B
    """
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp

コード例 #27

0

ファイルを表示

ファイル: test_intrinsics.py プロジェクト: ASPP/numba

def intrinsic_forloop_step(c):
    startX, startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x
    gridY = cuda.gridDim.y * cuda.blockDim.y
    height, width = c.shape

    for x in range(startX, width, gridX):
        for y in range(startY, height, gridY):
            c[y, x] = x + y

コード例 #28

0

ファイルを表示

ファイル: test_py2_div_issue.py プロジェクト: ASPP/numba

        def preCalc(y, yA, yB, numDataPoints):
            i = cuda.grid(1)
            k = i % numDataPoints

            ans = float32(1.001 * float32(i))

            y[i] = ans
            yA[i] = ans * 1.0
            yB[i] = ans / 1.0

コード例 #29

0

ファイルを表示

ファイル: all_pairs_euc_dist.py プロジェクト: TheLaueLab/nuc_frames

def cuda_all_euc_dists_inner(coords_arr, out_arr):
    x, y, m = cuda.grid(3)

    num_models, num_beads, num_dims = coords_arr.shape
    if x < num_beads and y < num_beads and m < num_models:
        acc = 0.0
        for d in range(num_dims):
            acc += (coords_arr[m, x, d] - coords_arr[m, y, d]) ** 2
        out_arr[x, y, m] = math.sqrt(acc)

コード例 #30

0

ファイルを表示

ファイル: test_nondet.py プロジェクト: ASPP/numba

        def diagproduct(c, a, b):
            startX, startY = cuda.grid(2)
            gridX = cuda.gridDim.x * cuda.blockDim.x
            gridY = cuda.gridDim.y * cuda.blockDim.y
            height = c.shape[0]
            width = c.shape[1]

            for x in range(startX, width, (gridX)):
                for y in range(startY, height, (gridY)):
                    c[y, x] = a[y, x] * b[x]

コード例 #31

0

ファイルを表示

def math_sqrt(A, B):
    i = cuda.grid(1)
    B[i] = math.sqrt(A[i])

コード例 #32

0

ファイルを表示

def math_acosh(A, B):
    i = cuda.grid(1)
    B[i] = math.acosh(A[i])

コード例 #33

0

ファイルを表示

def useless_syncwarp(ary):
    i = cuda.grid(1)
    cuda.syncwarp()
    ary[i] = i

コード例 #34

0

ファイルを表示

def useless_syncthreads(ary):
    i = cuda.grid(1)
    cuda.syncthreads()
    ary[i] = i

コード例 #35

0

ファイルを表示

def dyn_shared_memory(ary):
    i = cuda.grid(1)
    sm = cuda.shared.array(0, float32)
    sm[i] = i * 2
    cuda.syncthreads()
    ary[i] = sm[i]

コード例 #36

0

ファイルを表示

def math_lgamma(A, B):
    i = cuda.grid(1)
    B[i] = math.lgamma(A[i])

コード例 #37

0

ファイルを表示

def math_atanh(A, B):
    i = cuda.grid(1)
    B[i] = math.atanh(A[i])

コード例 #38

0

ファイルを表示

def math_ceil(A, B):
    i = cuda.grid(1)
    B[i] = math.ceil(A[i])

コード例 #39

0

ファイルを表示

def math_degrees(A, B):
    i = cuda.grid(1)
    B[i] = math.degrees(A[i])

コード例 #40

0

ファイルを表示

def math_radians(A, B):
    i = cuda.grid(1)
    B[i] = math.radians(A[i])

コード例 #41

0

ファイルを表示

def math_pow_binop(A, B, C):
    i = cuda.grid(1)
    C[i] = A[i] ** B[i]

コード例 #42

0

ファイルを表示

ファイル: test_atomics.py プロジェクト: Mahoney0101/AudioCaptureV2

def atomic_compare_and_swap(res, old, ary):
    gid = cuda.grid(1)
    if gid < res.size:
        out = cuda.atomic.compare_and_swap(res[gid:], -99, ary[gid])
        old[gid] = out

コード例 #43

0

ファイルを表示

def math_sin(A, B):
    i = cuda.grid(1)
    B[i] = math.sin(A[i])

コード例 #44

0

ファイルを表示

def math_cos(A, B):
    i = cuda.grid(1)
    B[i] = math.cos(A[i])

コード例 #45

0

ファイルを表示

def math_pow(A, B, C):
    i = cuda.grid(1)
    C[i] = math.pow(A[i], B[i])

コード例 #46

0

ファイルを表示

def math_asinh(A, B):
    i = cuda.grid(1)
    B[i] = math.asinh(A[i])

コード例 #47

0

ファイルを表示

def math_hypot(A, B, C):
    i = cuda.grid(1)
    C[i] = math.hypot(A[i], B[i])

コード例 #48

0

ファイルを表示

def math_isfinite(A, B):
    i = cuda.grid(1)
    B[i] = math.isfinite(A[i])

コード例 #49

0

ファイルを表示

def math_log1p(A, B):
    i = cuda.grid(1)
    B[i] = math.log1p(A[i])

コード例 #50

0

ファイルを表示

def math_isinf(A, B):
    i = cuda.grid(1)
    B[i] = math.isinf(A[i])

コード例 #51

0

ファイルを表示

def math_isnan(A, B):
    i = cuda.grid(1)
    B[i] = math.isnan(A[i])

コード例 #52

0

ファイルを表示

def math_modf(A, B, C):
    i = cuda.grid(1)
    B[i], C[i] = math.modf(A[i])

コード例 #53

0

ファイルを表示

def use_syncthreads_count(ary_in, ary_out):
    i = cuda.grid(1)
    ary_out[i] = cuda.syncthreads_count(ary_in[i])

コード例 #54

0

ファイルを表示

def math_fmod(A, B, C):
    i = cuda.grid(1)
    C[i] = math.fmod(A[i], B[i])

コード例 #55

0

ファイルを表示

def coop_smem2d(ary):
    i, j = cuda.grid(2)
    sm = cuda.shared.array((10, 20), float32)
    sm[i, j] = (i + 1) / (j + 1)
    cuda.syncthreads()
    ary[i, j] = sm[i, j]

コード例 #56

0

ファイルを表示

def math_copysign(A, B, C):
    i = cuda.grid(1)
    C[i] = math.copysign(A[i], B[i])

コード例 #57

0

ファイルを表示

def useless_syncwarp_with_mask(ary):
    i = cuda.grid(1)
    cuda.syncwarp(0xFFFF)
    ary[i] = i

コード例 #58

0

ファイルを表示

def math_floor(A, B):
    i = cuda.grid(1)
    B[i] = math.floor(A[i])

コード例 #59

0

ファイルを表示

ファイル: test_array.py プロジェクト: vsrikarunyan/numba

 def copykernel(x, y):
     i = cuda.grid(1)
     if i < x.shape[0]:
         x[i] = i
         y[i] = i

コード例 #60

0

ファイルを表示

def math_mod_binop(A, B, C):
    i = cuda.grid(1)
    C[i] = A[i] % B[i]