def compute_sample_kernel(factors, longest_wavelet, offsets_per_wavelength, output, num_rows):
    """Accumulate ``get_value_gpu`` over every row into this thread's slot.

    Each thread adds ``get_value_gpu(...)`` for ``row_index`` in
    ``range(num_rows)`` to ``output[cuda.grid(1)]`` and then adds
    ``factors[-1]``.
    """
    idx = cuda.grid(1)
    num_wavelengths = longest_wavelet - 2
    # Reset the slot this thread accumulates into.  The earlier code zeroed
    # output[cuda.gridDim.x], i.e. one shared element, leaving every
    # thread's own accumulator uninitialised.
    output[idx] = 0.0
    for row_index in range(num_rows):
        # NOTE(review): cuda.gridDim.x (the number of blocks) is passed as the
        # third argument, not the thread index -- confirm this is intended.
        output[idx] += get_value_gpu(factors, row_index, cuda.gridDim.x, longest_wavelet,
                                     num_wavelengths, offsets_per_wavelength)
    output[idx] += factors[-1]
Esempio n. 2
0
def vec_add_ilp_x4(a, b, c):
    """Apply ``core`` to four elements per thread, spaced one grid apart.

    All loads are issued before any call to ``core`` and all stores come
    last.  No bounds check is performed on the four indices.
    """
    # Distance between two consecutive elements handled by one thread.
    step = cuda.gridDim.x * cuda.blockDim.x

    i0 = cuda.grid(1)
    i1 = i0 + step
    i2 = i1 + step
    i3 = i2 + step

    # read
    a0, b0 = a[i0], b[i0]
    a1, b1 = a[i1], b[i1]
    a2, b2 = a[i2], b[i2]
    a3, b3 = a[i3], b[i3]

    # compute
    r0 = core(a0, b0)
    r1 = core(a1, b1)
    r2 = core(a2, b2)
    r3 = core(a3, b3)

    # write
    c[i0] = r0
    c[i1] = r1
    c[i2] = r2
    c[i3] = r3
Esempio n. 3
0
def pruneGPU(input_d, num_elements, min_sup):
    """Zero every entry of ``input_d`` that is smaller than ``min_sup``.

    One thread handles one index; threads at or beyond ``num_elements`` do
    nothing.
    """
    index = cuda.grid(1)

    # The bounds test comes first so out-of-range threads never read input_d.
    if index < num_elements and input_d[index] < min_sup:
        input_d[index] = 0
def c_distribute(rands, low, high):
    """Replace each ``rands[i]`` by the blend ``(1 - r) * low + r * high``."""
    i = cuda.grid(1)

    # Only threads that map onto an element of ``rands`` do any work.
    if i < rands.shape[0]:
        r = rands[i]
        rands[i] = (1.0 - r) * low + r * high
Esempio n. 5
0
def abs_m(a, out):
    """Store ``fabs(a[i, j])`` in ``out[i, j]`` for every index of ``out``."""
    row, col = cuda.grid(2)
    rows, cols = out.shape[0], out.shape[1]

    # Threads that fall outside ``out`` have nothing to do.
    if row >= rows or col >= cols:
        return
    out[row, col] = fabs(a[row, col])
Esempio n. 6
0
def to_convert_to_cuda(lx, ly, numdrops, x, obst_x, y, obst_y, obst_r, fIn,
                       gIn, rho2_0, tNS, cuNS2, jx2, jy2, rho1_0, cuNS1, jx1,
                       jy1):
    """Initialise ``fIn``/``gIn`` at grid point ``(j, k)`` from circular regions.

    For every region ``z`` the squared distance from ``(x[j, k], y[j, k])``
    to ``(obst_x[z, 0], obst_y[z, 0])`` is compared with ``obst_r[z, 0]**2``.
    Inside a region ``fIn`` is cleared and ``gIn`` is filled from the
    ``*2`` fields; outside (and only while no region has matched yet, i.e.
    ``t < 1``) ``gIn`` is cleared and ``fIn`` is filled from the ``*1`` fields.

    NOTE(review): ``i`` is not defined anywhere in this function; it has to
    come from module scope.  Confirm which index of ``fIn``/``gIn``/``tNS``
    is meant before using this kernel.
    """
    j, k = cuda.grid(2)

    if j < lx + 1 and k < ly + 1:
        t = 0
        # ``range`` replaces the Python-2-only ``xrange``.
        for z in range(numdrops):
            dx = x[j, k] - obst_x[z, 0]
            dy = y[j, k] - obst_y[z, 0]
            val = dx * dx + dy * dy
            ref = obst_r[z, 0] * obst_r[z, 0]
            if val <= ref:
                fIn[i, j, k] = 0
                gIn[i, j, k] = rho2_0 * tNS[0, i] * (
                    1 + cuNS2[j, k] + (1. / 2) * (cuNS2[j, k] * cuNS2[j, k]) -
                    (3. / 2) * (jx2[j, k] * jx2[j, k] + jy2[j, k] * jy2[j, k]))
                t = t + 1
            elif val > ref and (t < 1):
                fIn[i, j, k] = rho1_0 * tNS[0, i] * (
                    1 + cuNS1[j, k] + 1. / 2 * (cuNS1[j, k] * cuNS1[j, k]) -
                    (3. / 2) * (jx1[j, k] * jx1[j, k] + jy1[j, k] * jy1[j, k]))
                gIn[i, j, k] = 0
                t = t + 1
def saxpy(a, x, y, out):
    """Compute ``out[i] = a * x[i] + y[i]`` for every index of ``out``."""
    # cuda.grid(1) is short for
    # cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    idx = cuda.grid(1)

    if idx >= out.size:
        return
    out[idx] = a * x[idx] + y[idx]
Esempio n. 8
0
def mmultiply_pointwise(a,b,out):
    """Store the element-wise product ``a[i, j] * b[i, j]`` in ``out``."""
    row, col = cuda.grid(2)
    rows, cols = a.shape[0], a.shape[1]

    # Skip threads that fall outside the extent of ``a``.
    if row >= rows or col >= cols:
        return
    out[row, col] = a[row, col] * b[row, col]
Esempio n. 9
0
def c_distribute(rands, low, high):
    """Rescale ``rands`` in place: ``r -> (1 - r) * low + r * high``."""
    idx = cuda.grid(1)
    count = rands.shape[0]

    if idx < count:
        value = rands[idx]
        rands[idx] = (1.0 - value) * low + value * high
Esempio n. 10
0
def cu_matmul_sm(A, B, C, n, tpb, bpg):
    """Tiled matrix product ``C = A @ B`` staged through shared memory.

    ``tpb`` is used as the tile width and ``bpg`` as the number of tiles
    walked along the inner dimension.  Results are written only where
    ``x < n and y < n``.  ``block_dim`` is defined outside this function.

    NOTE(review): the tile loads from ``A`` and ``B`` are not bounds-checked,
    so this presumably expects ``n`` to be a multiple of ``tpb`` -- confirm.
    """
    # declare shared memory
    sA = cuda.shared.array(shape=block_dim, dtype=float32)
    sB = cuda.shared.array(shape=block_dim, dtype=float32)

    # we now need the thread ID within a block as well as the global thread ID
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    x, y = cuda.grid(2)

    # perform partial operations in block-sized tiles
    # saving intermediate values in an accumulator variable
    acc = 0.0
    for i in range(bpg):
        # Stage 1: Prefill shared memory with current block from matrix A and matrix B
        sA[tx, ty] = A[x, ty + i * tpb]
        sB[tx, ty] = B[tx + i * tpb, y]

        # Block calculations until shared memory is filled
        cuda.syncthreads()

        # Stage 2: Compute partial dot product and add to accumulator
        if x < n and y < n:
            for j in range(tpb):
                acc += sA[tx, j] * sB[j, ty]

        # Block until all threads have completed calculation before next loop iteration
        cuda.syncthreads()

    # Put accumulated dot product into output matrix
    if x < n and y < n:
        C[x, y] = acc
Esempio n. 11
0
def _gaussian_cuda32(fac, n_rep, t, n_t, a_facGo, b_facGo, c_facGo):
    """Fill ``fac[i, j]`` with a Gaussian of ``t[j]``.

    Row ``i`` uses ``a_facGo[i]`` as the scale, ``b_facGo[i]`` as the centre
    and ``c_facGo[i]`` as the width; only ``i < n_rep`` and ``j < n_t`` are
    written.
    """
    i, j = cuda.grid(2)
    if i < n_rep and j < n_t:
        # Fill in 2D fac data structure
        offset = t[j] - b_facGo[i]
        fac[i, j] = a_facGo[i] * exp(-offset**2 / (2 * c_facGo[i]**2))
def pruneGPU(input_d, num_elements, min_sup):
    """Set to 0 each of the first ``num_elements`` entries below ``min_sup``."""
    index = cuda.grid(1)

    # Guard first: threads past num_elements must not touch input_d.
    if index < num_elements and input_d[index] < min_sup:
        input_d[index] = 0
Esempio n. 13
0
def produce_chId_lit_gpu(rid, literal, chunk_id, length):
    """Derive a chunk id and a literal bit pattern from each ``rid[i]``.

    ``chunk_id[i]`` receives ``rid[i] // 31``.  ``literal[i]`` gets bit 31
    and bit ``30 - rid[i] % 31`` OR-ed in.
    """
    i = cuda.grid(1)
    if i < length:
        # Floor division: ``/`` would produce a float under true division.
        chunk_id[i] = rid[i] // 31
        literal[i] = (literal[i] | 1 << 31)  # the left bit set to 1
        off_set = 30 - rid[i] % 31
        literal[i] = (literal[i] | 1 << off_set)
Esempio n. 14
0
def get_reduced(literal, start_pos, reduced_length, reduced_literal, input_data, chunk_id, reduced_input_data, reduced_chunk_id):
	"""OR together each segment of ``literal`` and copy the segment's head fields.

	Segment ``i`` spans ``literal[start_pos[i]:start_pos[i+1]]``; its values are
	OR-ed into ``reduced_literal[i]`` (which is not reset here).  ``input_data``
	and ``chunk_id`` are sampled at ``start_pos[i]``.
	"""
	i = cuda.grid(1)
	if i < reduced_length:
		# NOTE(review): reads start_pos[i+1], so start_pos presumably holds
		# reduced_length + 1 entries -- confirm against the caller.
		for lit in literal[start_pos[i]:start_pos[i+1]]:
			reduced_literal[i] |= lit
		reduced_input_data[i] = input_data[start_pos[i]]
		reduced_chunk_id[i] = chunk_id[start_pos[i]]
Esempio n. 15
0
def maxPoly(x0, coef, tol, nParam, argMax):
    """Run Newton's iteration from ``x0`` for each coefficient ``coef[i]``.

    The first derivative used is ``2*coef[i]*x + 2.3`` and the second is
    ``2*coef[i]``.  Iteration stops once successive iterates differ by at
    most ``tol``; the final iterate is stored in ``argMax[i]``.
    """
    # Thread ID
    i = cuda.grid(1)

    # Only the first nParam threads do any work.
    if i >= nParam:
        return

    x = x0
    diff = tol+1
    while diff > tol:
        slope = 2*coef[i]*x + 2.3
        curvature = 2*coef[i]

        # Newton step, then the size of the move for the convergence check.
        candidate = x - slope/curvature
        diff = abs(candidate-x)
        x = candidate

    argMax[i] = x
Esempio n. 16
0
def vec_add_ilp_x4(a, b, c):
    """Combine four strided element pairs per thread with ``core``.

    Loads, ``core`` calls and stores are grouped into three phases.  The
    indices are not bounds-checked.
    """
    first = cuda.grid(1)
    # One full grid of threads separates successive elements of a thread.
    stride = cuda.gridDim.x * cuda.blockDim.x
    second = first + stride
    third = second + stride
    fourth = third + stride

    # read
    a_first, b_first = a[first], b[first]
    a_second, b_second = a[second], b[second]
    a_third, b_third = a[third], b[third]
    a_fourth, b_fourth = a[fourth], b[fourth]

    # compute
    c_first = core(a_first, b_first)
    c_second = core(a_second, b_second)
    c_third = core(a_third, b_third)
    c_fourth = core(a_fourth, b_fourth)

    # write
    c[first] = c_first
    c[second] = c_second
    c[third] = c_third
    c[fourth] = c_fourth
Esempio n. 17
0
def maxPoly(x0, coef, tol, nParam, argMax):
    """Newton iteration per parameter; the result goes to ``argMax[i]``.

    Starts at ``x0`` and repeats ``x <- x - f'(x) / f''(x)`` with
    ``f'(x) = 2 * coef[i] * x + 2.3`` and ``f''(x) = 2 * coef[i]`` until the
    step size no longer exceeds ``tol``.
    """
    i = cuda.grid(1)

    # Threads beyond the parameter count exit immediately.
    if i >= nParam:
        return

    current = x0
    step = tol + 1
    while step > tol:
        first = 2 * coef[i] * current + 2.3
        second = 2 * coef[i]
        updated = current - first / second

        # Distance moved this iteration drives the convergence test.
        step = abs(updated - current)
        current = updated

    argMax[i] = current
def reduce_by_key_gpu(literal, flag, is_finish, hop, length):
    """One step of a segmented OR: combine ``literal[i]`` with ``literal[i+hop]``.

    The OR happens only while ``is_finish[i]`` is unset and ``flag[i+hop]``
    is zero; otherwise ``is_finish[i]`` is set to 1.
    """
    i = cuda.grid(1)
    if i >= length - hop:
        return
    if is_finish[i] or flag[i + hop]:
        is_finish[i] = 1
    else:
        literal[i] |= literal[i + hop]
Esempio n. 19
0
def kernel(spheres, bitmap):
    """Shade pixel ``(x, y)`` of ``bitmap`` from the front-most sphere hit.

    ``hit``, ``DIM``, ``INF`` and ``SPHERES`` are defined outside this
    function.  For every sphere whose ``hit`` value exceeds the best depth so
    far, the pixel colour becomes the sphere colour scaled by
    ``(t - z) / radius``.  Pixels with no hit stay black.

    NOTE(review): ``bitmap[x, y, ...]`` is written without a bounds check.
    """

    x, y = cuda.grid(2)  # alias for threadIdx.x + ( blockIdx.x * blockDim.x ),
    #           threadIdx.y + ( blockIdx.y * blockDim.y )
    # shift the grid to [-DIM/2, DIM/2]
    ox = x - DIM / 2
    oy = y - DIM / 2

    r = 0.
    g = 0.
    b = 0.
    maxz = -INF

    i = 0  # emulate a C-style for-loop, exposing the idx increment logic
    while (i < SPHERES):
        t = hit(ox, oy, spheres[i])
        rad = spheres[i].radius

        if (t > maxz):
            dz = t - spheres[i].z  # t = dz + z; inverting hit() result
            n = dz / sqrt(rad * rad)
            fscale = n  # shades the color to be darker as we recede from
            # the edge of the cube circumscribing the sphere

            r = spheres[i].r * fscale
            g = spheres[i].g * fscale
            b = spheres[i].b * fscale
            maxz = t
        i += 1

    # Save the RGBA value for this particular pixel
    bitmap[x, y, 0] = int(r * 255.)
    bitmap[x, y, 1] = int(g * 255.)
    bitmap[x, y, 2] = int(b * 255.)
    bitmap[x, y, 3] = 255
def produce_fill_gpu(d_head, d_reduced_chunk_id, reduced_chunk_id,
                     reduced_length):
    """Store the gap between consecutive chunk ids for non-head entries.

    Where ``d_head[i]`` is zero, ``d_reduced_chunk_id[i]`` becomes
    ``reduced_chunk_id[i] - reduced_chunk_id[i - 1] - 1``.
    """
    i = cuda.grid(1)
    # Out-of-range threads and head entries are left untouched.
    if i >= reduced_length or d_head[i]:
        return
    gap = reduced_chunk_id[i] - reduced_chunk_id[i - 1]
    d_reduced_chunk_id[i] = gap - 1
def getIdx_gpu(fill_word, reduced_literal, index, compact_flag, length):
    """Interleave ``fill_word`` and ``reduced_literal`` into ``index``.

    Entry ``i`` lands in ``index[2*i]`` and ``index[2*i + 1]``.  When
    ``fill_word[i]`` is zero, ``compact_flag[2*i]`` is cleared.
    """
    i = cuda.grid(1)
    if i >= length:
        return
    even = i * 2
    index[even] = fill_word[i]
    index[even + 1] = reduced_literal[i]
    if not fill_word[i]:
        compact_flag[even] = 0
Esempio n. 22
0
def induced_velocity4(x, xvort, gam, vel):
    """Sum the velocity induced at ``x[i]`` by every vortex, tile by tile.

    Vortex positions ``xvort`` and strengths ``gam`` are staged through a
    ``(blksize, 3)`` shared array; rows past ``nvort`` are zero-filled so
    they contribute nothing.  ``blksize``, ``eps`` and ``f8`` are defined
    outside this function.
    """
    smem = cuda.shared.array((blksize, 3), dtype=f8)
    t = cuda.threadIdx.x
    i = cuda.grid(1)
    # eps = 1.e-2
    nvort = xvort.shape[0]
    nx = x.shape[0]
    # NOTE(review): x0/x1 are assigned only when i < nx, yet the loop below
    # reads them in every thread; results for i >= nx are discarded.
    if i < nx:
        x0 = x[i, 0]
        x1 = x[i, 1]
    xvel = 0
    yvel = 0
    # NOTE(review): repeats the nvort assignment made above.
    nvort = xvort.shape[0]
    for blk in range((nvort - 1) // blksize + 1):
        # load vortex positions and strengths into shared memory
        j = blk * blksize + t
        if j < nvort:
            smem[t, 0] = xvort[j, 0]
            smem[t, 1] = xvort[j, 1]
            smem[t, 2] = gam[j]
        else:
            smem[t, 0] = 0
            smem[t, 1] = 0
            smem[t, 2] = 0
        # wait until the whole tile has been written
        cuda.syncthreads()

        # compute the contributions to the velocity
        for k in range(blksize):
            rsq = (x0 - smem[k, 0])**2 + (x1 - smem[k, 1])**2 + eps**2
            xvel += smem[k, 2] * (x1 - smem[k, 1]) / rsq
            yvel += -smem[k, 2] * (x0 - smem[k, 0]) / rsq
        # wait until every thread is done with the tile before overwriting it
        cuda.syncthreads()
    if i < nx:
        vel[i, 0] = xvel
        vel[i, 1] = yvel
def produce_chId_lit_gpu(rid, literal, chunk_id, length):
    """Split each ``rid[i]`` into a chunk id and a bit set in ``literal[i]``.

    ``chunk_id[i] = rid[i] // 31``; ``literal[i]`` has bit 31 and bit
    ``30 - rid[i] % 31`` OR-ed in.
    """
    i = cuda.grid(1)
    if i < length:
        # Integer (floor) division; plain ``/`` is true division and would
        # yield a float.
        chunk_id[i] = rid[i] // 31
        literal[i] = (literal[i] | 1 << 31)  # the left bit set to 1
        off_set = 30 - rid[i] % 31
        literal[i] = (literal[i] | 1 << off_set)
Esempio n. 24
0
def getIdx_gpu(fill_word, reduced_literal, index, compact_flag, length):
    """Write (fill word, literal) pairs into consecutive slots of ``index``.

    ``compact_flag[2*i]`` is set to 0 whenever ``fill_word[i]`` is zero.
    """
    i = cuda.grid(1)
    if i >= length:
        return
    slot = i * 2
    index[slot] = fill_word[i]
    index[slot + 1] = reduced_literal[i]
    if not fill_word[i]:
        compact_flag[slot] = 0
Esempio n. 25
0
def const_m(out, const):
    """Fill every element of the 2-D array ``out`` with ``const``."""
    row, col = cuda.grid(2)
    rows, cols = out.shape[0], out.shape[1]

    if row >= rows or col >= cols:
        return
    out[row, col] = const
Esempio n. 26
0
def exp_m(a, out):
    """Store ``exp(a[i, j])`` in ``out[i, j]`` for every index of ``out``."""
    row, col = cuda.grid(2)
    rows, cols = out.shape[0], out.shape[1]

    if row >= rows or col >= cols:
        return
    out[row, col] = exp(a[row, col])
Esempio n. 27
0
def d_ifog_activate(i,f,o,g):
    """Apply the gate activations in place.

    ``i``, ``f`` and ``o`` go through ``d_sigmoid``; ``g`` goes through
    ``math.tanh``.  The bounds are taken from ``i``.
    """
    x, y = cuda.grid(2)
    if x >= i.shape[0] or y >= i.shape[1]:
        return
    i[x, y] = d_sigmoid(i[x, y])
    f[x, y] = d_sigmoid(f[x, y])
    o[x, y] = d_sigmoid(o[x, y])
    g[x, y] = math.tanh(g[x, y])
Esempio n. 28
0
def d_ifog_build(ifog,i,f,o,g):
    """Copy ``i``, ``f``, ``o`` and ``g`` side by side into ``ifog``.

    The four blocks are laid out along axis 1, each ``i.shape[1]`` columns
    wide.
    """
    x, y = cuda.grid(2)
    width = i.shape[1]
    if x >= i.shape[0] or y >= width:
        return
    ifog[x, y] = i[x, y]
    ifog[x, y + width] = f[x, y]
    ifog[x, y + (width * 2)] = o[x, y]
    ifog[x, y + (width * 3)] = g[x, y]
def kernel(dst, src):
    '''Store ``src[i] + 1`` in ``dst[i]`` for every index of ``dst``.
    '''
    idx = cuda.grid(1)
    if idx < dst.shape[0]:
        dst[idx] = src[idx] + 1
Esempio n. 30
0
def m_mn_sadd_pointwise(a,b,alpha,beta,out):
    """Compute ``out[i, j] = alpha * a[i, j] + beta * b[i, 0]``.

    Column 0 of ``b`` is applied to every column of ``a``.
    """
    row, col = cuda.grid(2)
    rows, cols = a.shape[0], a.shape[1]

    if row >= rows or col >= cols:
        return
    out[row, col] = alpha*a[row, col]+beta*b[row, 0]
Esempio n. 31
0
def stateUpdate(currentGrid, nextGrid, cluster, clusterSize, nClust, clusterOnes, enterP, activateP, choiceP, diffuseP, xis, eta):
    """Compute ``nextGrid[x, y]`` by calling ``cellUpdate`` for one cell.

    The settings below are fixed in this kernel and forwarded unchanged to
    ``cellUpdate`` together with the caller's arrays.
    """
    # settings
    pe = 0.0001
    pd = 0.05
    ph = 0.0485/1.5

    A = 1.8
    a = 2*A
    h = 0

    x, y = cuda.grid(2)
    i = cuda.grid(1)
    gw, gh = currentGrid.shape

    # Cells outside the grid are skipped.
    if x >= gw or y >= gh:
        return
    nextGrid[x, y] = cellUpdate(currentGrid, cluster, clusterSize, nClust,
                                clusterOnes, x, y, i, pe, pd, ph, A, a, h,
                                enterP, activateP, choiceP, diffuseP, xis, eta)
Esempio n. 32
0
def reduce_by_key_gpu(literal, flag, is_finish, hop, length):
    """OR ``literal[i + hop]`` into ``literal[i]`` unless the segment ended.

    A thread whose entry is already finished, or whose partner at
    ``i + hop`` has a non-zero ``flag``, only sets ``is_finish[i]`` to 1.
    """
    i = cuda.grid(1)
    if i >= length - hop:
        return
    stop = is_finish[i] or flag[i + hop]
    if stop:
        is_finish[i] = 1
    else:
        literal[i] |= literal[i + hop]
Esempio n. 33
0
def m_mn_add_pointwise(a,b,out):
    """Compute ``out[i, j] = a[i, j] + b[i, 0]`` over the extent of ``a``."""
    row, col = cuda.grid(2)
    rows, cols = a.shape[0], a.shape[1]

    if row >= rows or col >= cols:
        return
    out[row, col] = a[row, col]+b[row, 0]
Esempio n. 34
0
def tanh_m(a, out):
    """Store ``tanh(a[i, j])`` in ``out[i, j]`` for every index of ``out``."""
    row, col = cuda.grid(2)
    rows, cols = out.shape[0], out.shape[1]

    if row >= rows or col >= cols:
        return
    out[row, col] = tanh(a[row, col])
Esempio n. 35
0
def d_mexpsum(a,b):
    """Reduce ``exp(a)`` along axis 1 within each block.

    Thread ``yidx == 0`` of each block row writes its shared-memory partial
    result to ``b[x, cuda.blockIdx.y]``.

    NOTE(review): ``cuda.syncthreads()`` is also called inside the ``elif``
    branches, which only some threads take; the odd-element handling should
    be re-checked before relying on this kernel.
    """
    sA = cuda.shared.array(shape=(100,100),dtype=float32)
    xidx,yidx = cuda.threadIdx.x,cuda.threadIdx.y
    x,y = cuda.grid(2)
    total = min(cuda.blockDim.y, a.shape[1] - (cuda.blockIdx.y*cuda.blockDim.y))
    # Floor division keeps ``s`` an integer: it is used as an array offset
    # below, and ``/`` would make it a float.
    s = total // 2
    if yidx<s:
        sA[xidx,yidx] = math.exp(a[x,y]) + math.exp(a[x,y+s])
    elif yidx+s==total-1:
        cuda.syncthreads()
        sA[xidx,0] += math.exp(a[x,y+s])
    cuda.syncthreads()
    last_s = s
    s = s // 2
    while s>0:
        if yidx < s:
            sA[xidx,yidx] += sA[xidx,yidx+s]
        elif yidx+s==last_s-1:
            cuda.syncthreads()
            sA[xidx,0] += sA[xidx,yidx+s]
        cuda.syncthreads()
        last_s = s
        s = s // 2
    cuda.syncthreads()
    if yidx == 0:
        b[x,cuda.blockIdx.y] = sA[xidx,yidx]
def _gaussian_cuda32(fac, fac_Biman, n_rep, t, n_t, a_facGo, b_facGo, c_facGo):
    """Fill ``fac[i, j]`` with ``fac_Biman[i, j]`` plus a Gaussian of ``t[j]``.

    The Gaussian for row ``i`` has scale ``a_facGo[i]``, centre
    ``b_facGo[i]`` and width ``c_facGo[i]``.
    """
    i, j = cuda.grid(2)
    if i < n_rep and j < n_t:
        # Sum of the two curves gives the new fac value.
        offset = t[j] - b_facGo[i]
        bump = a_facGo[i] * exp(-offset**2 / (2 * c_facGo[i]**2))
        fac[i, j] = fac_Biman[i, j] + (bump)
Esempio n. 37
0
def log_m(a, out):
    """Store ``log(a[i, j])`` in ``out[i, j]`` for every index of ``out``."""
    row, col = cuda.grid(2)
    rows, cols = out.shape[0], out.shape[1]

    if row >= rows or col >= cols:
        return
    out[row, col] = log(a[row, col])
Esempio n. 38
0
def d_mclip(a):
    """Clamp every element of ``a`` in place to the range [-0.9, 0.9]."""
    x, y = cuda.grid(2)
    if x >= a.shape[0] or y >= a.shape[1]:
        return
    if a[x, y] > 0.9:
        a[x, y] = 0.9
    if a[x, y] < -0.9:
        a[x, y] = -0.9
Esempio n. 39
0
def d_msum(a,b):
    """Reduce ``a`` along axis 1 within each block.

    Thread ``yidx == 0`` writes its shared-memory partial result to
    ``b[x, cuda.blockIdx.y]``.

    NOTE(review): every barrier sits inside ``if y+s < a.shape[1]``, so not
    all threads of a block necessarily reach it, and ``last_s`` is never
    updated inside the loop.  Re-check both before relying on this kernel.
    """
    sA = cuda.shared.array(shape=(32,32),dtype=float32)
    xidx,yidx = cuda.threadIdx.x,cuda.threadIdx.y
    x,y = cuda.grid(2)
    total = min(cuda.blockDim.y, a.shape[1] - (cuda.blockIdx.y*cuda.blockDim.y))
    # Floor division keeps ``s`` an integer: it is used as an array offset
    # below, and ``/`` would make it a float.
    s = total // 2
    if y+s < a.shape[1]:
        if yidx<s:
            sA[xidx,yidx] = a[x,y] + a[x,y+s]
        cuda.syncthreads()
        if yidx == total-1 and not yidx < s:
            sA[xidx,0] += a[x,y+s]
        cuda.syncthreads()
        last_s = s
        s = s // 2
        while s>0:
            if yidx<s:
                sA[xidx,yidx] += sA[xidx,yidx+s]
            cuda.syncthreads()
            if yidx == last_s-1 and not yidx < s:
                sA[xidx,0] += sA[xidx,yidx+s]
            cuda.syncthreads()
            s = s // 2
        if yidx==0:
            b[x,cuda.blockIdx.y] = sA[xidx,yidx]
def kernel(dst, src):
    '''Add 1 to every item of ``src`` and store the result in ``dst``.
    '''
    i = cuda.grid(1)
    size = dst.shape[0]
    if i < size:
        dst[i] = src[i] + 1
Esempio n. 41
0
def vec_add_ilp_x8(a, b, c):
    """Apply ``core`` to eight elements per thread, spaced one grid apart.

    All loads are issued first, then the eight ``core`` calls, then the
    stores.  No bounds check is performed on the indices.
    """
    # read
    i = cuda.grid(1)
    ai = a[i]
    bi = b[i]

    bw = cuda.blockDim.x
    gw = cuda.gridDim.x
    stride = gw * bw

    j = i + stride
    aj = a[j]
    bj = b[j]

    k = j + stride
    ak = a[k]
    bk = b[k]

    l = k + stride
    al = a[l]
    bl = b[l]

    m = l + stride
    am = a[m]
    bm = b[m]

    n = m + stride
    an = a[n]
    bn = b[n]

    p = n + stride
    ap = a[p]
    bp = b[p]

    # The eighth index follows ``p``.  It used to be ``n + stride``, which
    # equals ``p``, so the eighth element was never processed.
    q = p + stride
    aq = a[q]
    bq = b[q]

    # compute
    ci = core(ai, bi)
    cj = core(aj, bj)
    ck = core(ak, bk)
    cl = core(al, bl)

    cm = core(am, bm)
    cn = core(an, bn)
    cp = core(ap, bp)
    cq = core(aq, bq)

    # write
    c[i] = ci
    c[j] = cj
    c[k] = ck
    c[l] = cl

    c[m] = cm
    c[n] = cn
    c[p] = cp
    c[q] = cq
Esempio n. 42
0
def vec_add_ilp_x8(a, b, c):
    """Combine eight strided element pairs per thread with ``core``.

    Loads, ``core`` calls and stores are grouped into three phases.  The
    indices are not bounds-checked.
    """
    # read
    i = cuda.grid(1)
    ai = a[i]
    bi = b[i]

    bw = cuda.blockDim.x
    gw = cuda.gridDim.x
    stride = gw * bw

    j = i + stride
    aj = a[j]
    bj = b[j]

    k = j + stride
    ak = a[k]
    bk = b[k]

    l = k + stride
    al = a[l]
    bl = b[l]

    m = l + stride
    am = a[m]
    bm = b[m]

    n = m + stride
    an = a[n]
    bn = b[n]

    p = n + stride
    ap = a[p]
    bp = b[p]

    # Step on from ``p``: the earlier ``n + stride`` made ``q`` equal to
    # ``p`` and skipped the eighth element.
    q = p + stride
    aq = a[q]
    bq = b[q]

    # compute
    ci = core(ai, bi)
    cj = core(aj, bj)
    ck = core(ak, bk)
    cl = core(al, bl)

    cm = core(am, bm)
    cn = core(an, bn)
    cp = core(ap, bp)
    cq = core(aq, bq)

    # write
    c[i] = ci
    c[j] = cj
    c[k] = ck
    c[l] = cl

    c[m] = cm
    c[n] = cn
    c[p] = cp
    c[q] = cq
Esempio n. 43
0
def downsweep_phase(zero_list, one_list, hop, base):
    """One down-sweep step of a scan over ``zero_list`` and ``one_list``.

    At the last index of every ``2*hop`` window, the element ``hop`` to the
    left receives the current value and the current slot receives the sum of
    both.  When ``hop`` is 1, ``base`` is added to ``one_list[i]`` afterwards.
    """
    i = cuda.grid(1)
    window = 2 * hop
    if i % window == window - 1:
        left = i - hop

        zero_left = zero_list[left]
        zero_here = zero_list[i]
        zero_list[left] = zero_here
        zero_list[i] = zero_left + zero_here

        one_left = one_list[left]
        one_here = one_list[i]
        one_list[left] = one_here
        one_list[i] = one_left + one_here
    cuda.syncthreads()
    if hop == 1:
        one_list[i] += base
Esempio n. 44
0
def binit(dictionary, stimuli, b):
    """Accumulate ``stimuli[i, :] . dictionary[j, :]`` into ``b[i, j]``.

    ``b`` is added to, not overwritten, so it must hold the desired starting
    values on entry.
    """
    n = stimuli.shape[0]
    m = dictionary.shape[0]
    k = dictionary.shape[1]
    i, j = cuda.grid(2)

    # Threads outside the (n, m) extent would index past ``stimuli`` or
    # ``dictionary``, so they are skipped.
    if i < n and j < m:
        # ``range`` replaces the Python-2-only ``xrange``.
        for r in range(k):
            b[i, j] += stimuli[i, r] * dictionary[j, r]
Esempio n. 45
0
def d_distances(a,b,val):
    """Store in ``val[0, x]`` the square root of the summed ``d_dist`` terms.

    For row ``x`` of ``b`` the terms ``d_dist(a[0, y], b[x, y])`` are summed
    over all columns ``y``.
    """
    # Indentation is spaces only; the former tab/space mix is rejected by
    # Python 3.
    x = cuda.grid(1)
    if x < b.shape[0]:
        dist = 0.
        for y in range(b.shape[1]):
            dist += d_dist(a[0, y], b[x, y])
        dist = math.sqrt(dist)
        val[0, x] = dist
Esempio n. 46
0
def get_list(arr, length, iter_order, zero_list, one_list):
    """Split bit ``iter_order`` of ``arr[i]`` into two 0/1 indicator lists.

    ``one_list[i]`` is that bit and ``zero_list[i]`` its complement.  For
    ``i >= length`` both lists are set to 0.
    """
    i = cuda.grid(1)
    if i >= length:
        one_list[i] = 0
        zero_list[i] = 0
        return
    one_list[i] = (arr[i] >> iter_order) % 2
    zero_list[i] = 1 - one_list[i]
def cuda_match(a, b, c):
    """Set ``c[i]`` to the first element of ``b`` equal to ``a[i]``, else 0.

    ``c[i]`` is left untouched when ``b`` is empty.
    """
    i = cuda.grid(1)
    for j in range(len(b)):
        if a[i] != b[j]:
            # No match so far: record 0 and keep scanning.
            c[i] = 0
            continue
        c[i] = b[j]
        break
Esempio n. 48
0
def cinit(dictionary, c):
    """Accumulate row dot products of ``dictionary`` into off-diagonal ``c``.

    For ``i != j``, ``dictionary[i, :] . dictionary[j, :]`` is added to
    ``c[i, j]``; diagonal entries are not touched.
    """
    n = dictionary.shape[0]
    m = dictionary.shape[1]
    i, j = cuda.grid(2)

    # Both indices select rows of ``dictionary``, so both are bounded by n.
    if i < n and j < n and i != j:
        # ``range`` replaces the Python-2-only ``xrange``.
        for k in range(m):
            c[i, j] += dictionary[i, k] * dictionary[j, k]
def produce_flag(input_data, chunk_id, length, flag):
    """Mark the start of every run of equal ``(input_data, chunk_id)`` pairs.

    ``flag[i]`` is set to 1 at index 0 and wherever either value differs
    from its predecessor; other entries are left as they were.
    """
    i = cuda.grid(1)
    if i >= length:
        return
    if i == 0:
        flag[i] = 1
    elif input_data[i] != input_data[i - 1] or chunk_id[i] != chunk_id[i - 1]:
        flag[i] = 1
def get_reduced(literal, start_pos, reduced_length, reduced_literal,
                input_data, chunk_id, reduced_input_data, reduced_chunk_id):
    """OR each segment of ``literal`` together and copy its head fields.

    Segment ``i`` is ``literal[start_pos[i]:start_pos[i + 1]]``.  Its values
    are OR-ed into ``reduced_literal[i]``, which is not reset here.
    ``input_data`` and ``chunk_id`` are sampled at ``start_pos[i]``.
    """
    i = cuda.grid(1)
    if i < reduced_length:
        # NOTE(review): reads start_pos[i + 1], so start_pos presumably has
        # reduced_length + 1 entries -- confirm against the caller.
        for lit in literal[start_pos[i]:start_pos[i + 1]]:
            reduced_literal[i] |= lit
        reduced_input_data[i] = input_data[start_pos[i]]
        reduced_chunk_id[i] = chunk_id[start_pos[i]]
Esempio n. 51
0
def array_adjust(arr, d_arr,rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length):
    """Scatter ``d_arr``/``d_rid`` into ``arr``/``rid``.

    Element ``i`` goes to position ``d_zero_list[i]`` when ``zero_list[i]``
    equals 1, and to ``d_one_list[i]`` otherwise.
    """
    i = cuda.grid(1)
    if i >= length:
        return
    if zero_list[i] == 1:
        target = d_zero_list[i]
    else:
        target = d_one_list[i]
    arr[target] = d_arr[i]
    rid[target] = d_rid[i]
Esempio n. 52
0
def reduce_phase(zero_list, one_list, hop, thread_num):
    """One up-sweep step of a scan over ``zero_list`` and ``one_list``.

    The last index of each ``2*hop`` window adds in the value ``hop`` to its
    left.  When ``hop`` equals ``thread_num/2``, the final element of both
    lists is cleared.
    """
    i = cuda.grid(1)
    window = 2 * hop
    if i % window == window - 1:
        zero_list[i] += zero_list[i - hop]
        one_list[i] += one_list[i - hop]
    if hop == thread_num/2 and i == thread_num - 1:
        one_list[i] = 0
        zero_list[i] = 0
Esempio n. 53
0
def saxpy(a, x, y, out):
    """Compute ``out[i] = a * x[i] + y[i]`` for every index of ``out``."""
    # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    i = cuda.grid(1)
    # Only in-range threads do the actual work.
    if i < out.size:
        out[i] = a * x[i] + y[i]
Esempio n. 54
0
 def division(a, b, c, size):
     """
     Kernel computing the element-wise ratio a/b of two device arrays.

     :param a: (device array) numerators
     :param b: (device array) denominators
     :param c: (device array) receives the values of a/b
     :param size: (int) number of elements to process
     """
     idx = cuda.grid(1)
     if idx >= size:
         return
     c[idx] = a[idx] / b[idx]
Esempio n. 55
0
def batch_norm(A, aInorm, K):
    """Store in ``aInorm[tid]`` the Euclidean norm of column ``tid`` of ``A``.

    Only the first ``K`` rows of the column contribute.
    """
    tid = cuda.grid(1)

    if tid < A.shape[1]:
        squares = float_type(0.0)
        for row in range(K):
            entry = A[row, tid]
            squares += entry * entry

        aInorm[tid] = math.sqrt(squares)
Esempio n. 56
0
def gabs(x, y):
    """Store the absolute value of ``x[i]`` in ``y[i]``.

    Threads at or beyond the size of either array do nothing.
    """
    i = cuda.grid(1)

    if i < x.size and i < y.size:
        if x[i] >= 0:
            y[i] = x[i]
        else:
            y[i] = -x[i]
Esempio n. 57
0
def norm_random_nums(C, d):
    """Scale column ``i`` of ``C`` in place by its Euclidean norm.

    The norm is taken over the first ``d`` entries, and those same entries
    are divided by it.
    """
    i = cuda.grid(1)
    if i < C.shape[1]:
        column = C[:, i]

        squares = float_type(0.0)
        for row in range(d):
            entry = column[row]
            squares += entry * entry

        length = math.sqrt(squares)
        for row in range(d):
            column[row] /= length
Esempio n. 58
0
def induced_velocity3(x, xvort, gam, vel):
    """Sum into ``vel[i]`` the velocity induced at ``x[i]`` by every vortex.

    ``vel[i, 0]`` and ``vel[i, 1]`` are reset to zero and then accumulated in
    place over all rows of ``xvort`` with strengths ``gam``.  ``eps`` is
    defined outside this function and is added (squared) to the squared
    distance.
    """
    # eps = float32(1.e-2)
    # i, j = cuda.grid(2)
    i = cuda.grid(1)
    if i < x.shape[0]:
        vel[i, 0] = float32(0.)
        vel[i, 1] = float32(0.)
        nvort = xvort.shape[0]
        for j in range(nvort):
            # squared distance to vortex j, offset by eps**2
            rsq = (x[i, 0] - xvort[j, 0])**2 + (x[i, 1] -
                                                xvort[j, 1])**2 + eps**2
            vel[i, 0] += gam[j] * (x[i, 1] - xvort[j, 1]) / rsq
            vel[i, 1] += -gam[j] * (x[i, 0] - xvort[j, 0]) / rsq
def maxCoefsABS(curCoefs, coefs, coefsd, winners, k):
    """Pick the largest-magnitude entry of row ``i`` of ``curCoefs``.

    The winning column is stored in ``winners[k, i]`` and the coefficient at
    that column is copied into ``coefs`` and ``coefsd``.

    NOTE(review): if no entry beats the initial ``maxVal`` of -1 (e.g. an
    empty row), ``maxLoc`` keeps its initial value ``-i``.
    """
    i = cuda.grid(1)
    #This is not a great idea. Does cuda do inf? What is largest negative number?
    maxVal = -1.
    maxLoc = -i
    length = curCoefs.shape[1]
    # ``range`` replaces the Python-2-only ``xrange``.
    for jj in range(length):
        if math.fabs(curCoefs[i, jj]) > maxVal:
            maxVal = math.fabs(curCoefs[i, jj])
            maxLoc = jj
    winners[k, i] = maxLoc
    coefs[i, maxLoc] = curCoefs[i, maxLoc]
    coefsd[i, maxLoc] = curCoefs[i, maxLoc]
Esempio n. 60
0
def cu_monte_carlo_pricer(paths, dt, c0, c1, normdist, seed):
    """Advance path ``i`` one time step at a time using tabulated noise.

    Each step multiplies the previous value by
    ``exp(c0 * dt + c1 * noise)``, where ``noise`` is looked up in
    ``normdist`` with an index derived from a running random number seeded
    by ``seed[i]``.  ``A`` and ``C`` are defined outside this function.
    """
    # short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    i = cuda.grid(1)
    if i >= paths.shape[0]:
        return
    randnum = seed[i]
    for j in range(1, paths.shape[1]):  # foreach time step
        elt = randnum % normdist.shape[0]
        # keep the table index non-negative
        if elt < 0:
            elt = -elt
        noise = normdist[elt]
        paths[i, j] = paths[i, j - 1] * np.exp(c0 * dt + c1 * noise)
        # generate next random number
        randnum = randnum * A + C