Ejemplo n.º 1
0
def or_reduction(literal, tmp_out, length):
    
    bw = cuda.blockDim.x
    bx = cuda.blockIdx.x
    tid = cuda.threadIdx.x
    shared_list = cuda.shared.array(shape = (tpb), dtype = uint32)
	
   
    i = bx*bw + tid
    shared_list[tid] = 0x00000000
    if i<length:
    	shared_list[tid] = literal[i]

    cuda.syncthreads()

    hop = bw/2
    while hop > 0:
        if tid < hop:
			shared_list[tid] = shared_list[tid] | shared_list[tid+hop]
			#if i <length:
			#	print shared_list[tid]
        cuda.syncthreads()
        hop /= 2
    if tid == 0:
        tmp_out[bx] = shared_list[0]
        print tmp_out[0]
Ejemplo n.º 2
0
def histogramGPU(input_d, bins_d, num_elements):
    private_bin = cuda.shared.array(SM_MAX_SIZE, int32)
    tx = cuda.threadIdx.x
    index = tx + cuda.blockDim.x * cuda.blockIdx.x
    stride = cuda.blockDim.x * cuda.gridDim.x

    location_x = 0
    for i in range(0, ceil(SM_MAX_SIZE / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < MAX_UNIQUE_ITEMS and location_x < SM_MAX_SIZE:
            private_bin[location_x] = 0

    cuda.syncthreads()

    element = 0
    while index < num_elements:
        element = input_d[index]
        if element < SM_MAX_SIZE:
            cuda.atomic.add(private_bin, element, 1)
        else:
            cuda.atomic.add(bins_d, element, 1)

        index += stride

    cuda.syncthreads()

    for i in range(0, ceil(SM_MAX_SIZE / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < MAX_UNIQUE_ITEMS and location_x < SM_MAX_SIZE:
            cuda.atomic.add(bins_d, location_x, private_bin[location_x])
Ejemplo n.º 3
0
def makeFlistGPU(d_offsets, d_transactions, d_flist, num_transactions,
                 all_items_in_transactions):
    private_items = cuda.shared.array(MAX_SM_ITEMS, uint32)

    tx = cuda.threadIdx.x
    index = tx + cuda.blockDim.x * cuda.blockIdx.x
    location_x = 0
    for i in range(0, ceil(MAX_SM_ITEMS / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < MAX_SM_ITEMS:
            private_items[location_x] = 0

    cuda.syncthreads()

    item_ends = 0
    if index == (num_transactions - 1):
        item_ends = all_items_in_transactions
    elif index < (num_transactions - 1):
        item_ends = d_offsets[index + 1]
    else:
        item_ends = 0

    for i in range(d_offsets[index], item_ends):
        if d_transactions[i] >= 0 and d_transactions[i] < MAX_SM_ITEMS:
            cuda.atomic.add(private_items, d_transactions[i], 1)
        elif d_transactions[i] >= MAX_SM_ITEMS and d_transactions[
                i] < MAX_UNIQUE_ITEMS:
            cuda.atomic.add(d_flist, d_transactions[i], 1)

    cuda.syncthreads()

    for i in range(0, ceil(MAX_SM_ITEMS / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < MAX_SM_ITEMS:
            cuda.atomic.add(d_flist, location_x, private_items[location_x])
def histogramGPU(input_d, bins_d, num_elements):
    private_bin = cuda.shared.array(SM_MAX_SIZE, int32)
    tx = cuda.threadIdx.x
    index = tx + cuda.blockDim.x * cuda.blockIdx.x
    stride = cuda.blockDim.x * cuda.gridDim.x

    location_x = 0
    for i in range(0, ceil(SM_MAX_SIZE / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < MAX_UNIQUE_ITEMS and location_x < SM_MAX_SIZE:
            private_bin[location_x] = 0

    cuda.syncthreads()

    element = 0
    while index < num_elements:
        element = input_d[index]
        if element < SM_MAX_SIZE:
            cuda.atomic.add(private_bin, element, 1)
        else:
            cuda.atomic.add(bins_d, element, 1)

        index += stride

    cuda.syncthreads()

    for i in range(0, ceil(SM_MAX_SIZE / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < MAX_UNIQUE_ITEMS and location_x < SM_MAX_SIZE:
            cuda.atomic.add(bins_d, location_x, private_bin[location_x])
Ejemplo n.º 5
0
def induced_velocity4(x, xvort, gam, vel):
    smem = cuda.shared.array((blksize, 3), dtype=f8)
    t = cuda.threadIdx.x
    i = cuda.grid(1)
    # eps = 1.e-2
    nvort = xvort.shape[0]
    nx = x.shape[0]
    if i < nx:
        x0 = x[i, 0]
        x1 = x[i, 1]
    xvel = 0
    yvel = 0
    nvort = xvort.shape[0]
    for blk in range((nvort - 1) // blksize + 1):
        # load vortex positions and strengths into shared memory
        j = blk * blksize + t
        if j < nvort:
            smem[t, 0] = xvort[j, 0]
            smem[t, 1] = xvort[j, 1]
            smem[t, 2] = gam[j]
        else:
            smem[t, 0] = 0
            smem[t, 1] = 0
            smem[t, 2] = 0
        cuda.syncthreads()

        # compute the contributions to the velocity
        for k in range(blksize):
            rsq = (x0 - smem[k, 0])**2 + (x1 - smem[k, 1])**2 + eps**2
            xvel += smem[k, 2] * (x1 - smem[k, 1]) / rsq
            yvel += -smem[k, 2] * (x0 - smem[k, 0]) / rsq
        cuda.syncthreads()
    if i < nx:
        vel[i, 0] = xvel
        vel[i, 1] = yvel
Ejemplo n.º 6
0
def cu_square_matrix_mul(A, B, C):
    sA = cuda.shared.array(shape=(tpb, tpb), dtype=f4)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=f4)
    
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    x = tx + bx * bw
    y = ty + by * bh

    acc = 0.
    for i in range(bpg):
        if x < n and y < n:
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        cuda.syncthreads()

        if x < n and y < n:
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc
Ejemplo n.º 7
0
def d_mexpsum(a,b):
	sA = cuda.shared.array(shape=(100,100),dtype=float32)
	xidx,yidx = cuda.threadIdx.x,cuda.threadIdx.y
	x,y = cuda.grid(2)
	total = min(cuda.blockDim.y, a.shape[1] - (cuda.blockIdx.y*cuda.blockDim.y))
	s = total/2
	if yidx<s:
		sA[xidx,yidx] = math.exp(a[x,y]) + math.exp(a[x,y+s])
	elif yidx+s==total-1:
		cuda.syncthreads()
		sA[xidx,0] += math.exp(a[x,y+s])
	cuda.syncthreads()
	last_s = s
	s = s/2
	while s>0:
		if yidx < s:
			sA[xidx,yidx] += sA[xidx,yidx+s]
		elif yidx+s==last_s-1:
			cuda.syncthreads()
			sA[xidx,0] += sA[xidx,yidx+s]
		cuda.syncthreads()
		last_s = s
		s=s/2
	cuda.syncthreads()
	if yidx == 0:
		b[x,cuda.blockIdx.y] = sA[xidx,yidx]
Ejemplo n.º 8
0
def cu_compute_error(X, Y, Ex, Ey, theta_x, theta_y):
    # Compute error for each element and store in the shared-memory
    Exsm = cuda.shared.array((1024, ), dtype=f8)
    Eysm = cuda.shared.array((1024, ), dtype=f8)

    tid = cuda.threadIdx.x
    base = cuda.blockIdx.x * cuda.blockDim.x

    i = base + tid

    x = X[i]
    y = Y[i]

    predict = theta_x + theta_y * x
    Exsm[tid] = predict - y
    Eysm[tid] = (predict - y) * x

    # Sum-reduce errors in the shared-memory
    n = cuda.blockDim.x
    while n > 1:

        cuda.syncthreads()

        half = n // 2
        if tid < half:
            Exsm[tid] += Exsm[tid + half]
            Eysm[tid] += Eysm[tid + half]

        n = half

    if tid == 0:  # First of a block?
        # Store result
        Ex[cuda.blockIdx.x] = Exsm[0]
        Ey[cuda.blockIdx.x] = Eysm[0]
Ejemplo n.º 9
0
def cu_compute_error(X, Y, Ex, Ey, theta_x, theta_y):
    # Compute error for each element and store in the shared-memory
    Exsm = cuda.shared.array((1024,), dtype=f8)
    Eysm = cuda.shared.array((1024,), dtype=f8)

    tid = cuda.threadIdx.x
    base = cuda.blockIdx.x * cuda.blockDim.x

    i = base + tid

    x = X[i]
    y = Y[i]

    predict = theta_x + theta_y * x
    Exsm[tid] = predict - y
    Eysm[tid] = (predict - y) * x

    # Sum-reduce errors in the shared-memory
    n = cuda.blockDim.x
    while n > 1:

        cuda.syncthreads()

        half = n // 2
        if tid < half:
            Exsm[tid] += Exsm[tid + half]
            Eysm[tid] += Eysm[tid + half]

        n = half


    if tid == 0: # First of a block?
        # Store result
        Ex[cuda.blockIdx.x] = Exsm[0]
        Ey[cuda.blockIdx.x] = Eysm[0]
def jocabi_relax_core(A, Anew, error):
    smem = cuda.shared.array(shape=(32 + 2, 32 + 2), dtype=f4)
    n = A.shape[0]
    m = A.shape[1]

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    j = ty + cuda.blockIdx.y * cuda.blockDim.y
    i = tx + cuda.blockIdx.x * cuda.blockDim.x
    
    sy = ty + 1
    sx = tx + 1
    
    smem[sy, sx] = A[j, i]
    if tx == 0 and i >= 1:
        smem[sy, 0] = A[j, i - 1]

    if ty == 0 and j < m - 1:
        smem[0, sx] = A[j - 1, i]

    if tx == 31 and j >= 1:
        smem[sy, 33] = A[j, i + 1]

    if ty == 31 and j < n - 1:
        smem[33, sx] = A[j + 1, i]

    cuda.syncthreads() # ensure smem is visible by all threads in the block
    
    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * ( smem[sy, sx + 1] + smem[sy, sx - 1] \
                            + smem[sy - 1, sx] + smem[sy + 1, sx])
        error[j, i] = Anew[j, i] - A[j, i]
def makeFlistGPU(d_offsets, d_transactions, d_flist, num_transactions, all_items_in_transactions):
    private_items = cuda.shared.array(MAX_SM_ITEMS, uint32)

    tx = cuda.threadIdx.x
    index = tx + cuda.blockDim.x * cuda.blockIdx.x
    location_x = 0
    for i in range(0, ceil(MAX_SM_ITEMS / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < MAX_SM_ITEMS:
            private_items[location_x] = 0

    cuda.syncthreads()

    item_ends = 0
    if index == (num_transactions - 1):
        item_ends = all_items_in_transactions
    elif index < (num_transactions - 1):
        item_ends = d_offsets[index + 1]
    else:
        item_ends = 0

    for i in range(d_offsets[index], item_ends):
        if d_transactions[i] >= 0 and d_transactions[i] < MAX_SM_ITEMS:
            cuda.atomic.add(private_items, d_transactions[i], 1)
        elif d_transactions[i] >= MAX_SM_ITEMS and d_transactions[i] < MAX_UNIQUE_ITEMS:
            cuda.atomic.add(d_flist, d_transactions[i], 1)

    cuda.syncthreads()

    for i in range(0, ceil(MAX_SM_ITEMS / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < MAX_SM_ITEMS:
            cuda.atomic.add(d_flist, location_x, private_items[location_x])
Ejemplo n.º 12
0
def jocabi_relax_core(A, Anew, error):
    smem = cuda.shared.array(shape=(32 + 2, 32 + 2), dtype=f4)
    n = A.shape[0]
    m = A.shape[1]

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    j = ty + cuda.blockIdx.y * cuda.blockDim.y
    i = tx + cuda.blockIdx.x * cuda.blockDim.x

    sy = ty + 1
    sx = tx + 1

    smem[sy, sx] = A[j, i]
    if tx == 0 and i >= 1:
        smem[sy, 0] = A[j, i - 1]

    if ty == 0 and j < m - 1:
        smem[0, sx] = A[j - 1, i]

    if tx == 31 and j >= 1:
        smem[sy, 33] = A[j, i + 1]

    if ty == 31 and j < n - 1:
        smem[33, sx] = A[j + 1, i]

    cuda.syncthreads()  # ensure smem is visible by all threads in the block

    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * ( smem[sy, sx + 1] + smem[sy, sx - 1] \
                            + smem[sy - 1, sx] + smem[sy + 1, sx])
        error[j, i] = Anew[j, i] - A[j, i]
Ejemplo n.º 13
0
def cu_matmul_sm(A, B, C, n, tpb, bpg):
    # decalre shared memory
    sA = cuda.shared.array(shape=block_dim, dtype=float32)
    sB = cuda.shared.array(shape=block_dim, dtype=float32)

    # we now need the thread ID within a block as well as the global thread ID
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    x, y = cuda.grid(2)

    # pefort partial operations in block-szied tiles
    # saving intermediate values in an accumulator variable
    acc = 0.0
    for i in range(bpg):
        # Stage 1: Prefil shared memory with current block from matrix A and matrix B
        sA[tx, ty] = A[x, ty + i * tpb]
        sB[tx, ty] = B[tx + i * tpb, y]

        # Block calculations till shared mmeory is filled
        cuda.syncthreads()

        # Stage 2: Compute partial dot product and add to accumulator
        if x < n and y < n:
            for j in range(tpb):
                acc += sA[tx, j] * sB[j, ty]

        # Blcok until all threads have completed calcuaiton before next loop iteration
        cuda.syncthreads()

    # Put accumulated dot product into output matrix
    if x < n and y < n:
        C[x, y] = acc
Ejemplo n.º 14
0
def or_reduction(literal, tmp_out, length):

    bw = cuda.blockDim.x
    bx = cuda.blockIdx.x
    tid = cuda.threadIdx.x
    shared_list = cuda.shared.array(shape=(tpb), dtype=uint32)

    i = bx * bw + tid
    shared_list[tid] = 0x00000000
    if i < length:
        shared_list[tid] = literal[i]

    cuda.syncthreads()

    hop = bw / 2
    while hop > 0:
        if tid < hop:
            shared_list[tid] = shared_list[tid] | shared_list[tid + hop]
            #if i <length:
            #	print shared_list[tid]
        cuda.syncthreads()
        hop /= 2
    if tid == 0:
        tmp_out[bx] = shared_list[0]
        print tmp_out[0]
Ejemplo n.º 15
0
def downsweep_phase(zero_list, one_list, hop, base):
    i = cuda.grid(1)
    if i%(2*hop) == (2*hop-1):
        zero_list[i-hop], zero_list[i] = zero_list[i], zero_list[i-hop]+zero_list[i]
        one_list[i-hop], one_list[i] = one_list[i], one_list[i-hop]+one_list[i]
    cuda.syncthreads()
    if hop==1:
        one_list[i] += base
def IndexDefineGPU(in_d, rev_bit_d, in_size, last_input):
    tx = cuda.threadIdx.x
    index = tx + cuda.blockIdx.x * cuda.blockDim.x

    total_falses = in_d[in_size - 1] + last_input

    cuda.syncthreads()

    if index < in_size:
        if rev_bit_d[index] == 0:
            in_d[index] = index + 1 - in_d[index] + total_falses
def IndexDefineGPU(in_d, rev_bit_d, in_size, last_input):
    tx = cuda.threadIdx.x
    index = tx + cuda.blockIdx.x * cuda.blockDim.x

    total_falses = in_d[in_size - 1] + last_input

    cuda.syncthreads()

    if index < in_size:
        if rev_bit_d[index] == 0:
            in_d[index] = index + 1 - in_d[index] + total_falses
Ejemplo n.º 18
0
def findFrequencyGPU(d_transactions, d_offsets, num_transactions, num_elements,
                     dkeyIndex, dMask, num_patterns):
    Ts = cuda.shared.array(SM_SHAPE, int32)
    tx = cuda.threadIdx.x

    index = tx + cuda.blockDim.x * cuda.blockIdx.x
    trans_index = cuda.blockIdx.x * MAX_TRANSACTIONS_PER_SM

    for i in range(0, MAX_TRANSACTIONS_PER_SM):
        if tx < MAX_ITEMS_PER_TRANSACTIONS:
            Ts[i, tx] = -1

    cuda.syncthreads()

    for i in range(0, MAX_TRANSACTIONS_PER_SM):
        item_ends = num_elements
        if (trans_index + i + 1) == num_transactions:
            item_ends = num_elements
        elif (trans_index + i + 1) < num_transactions:
            item_ends = d_offsets[trans_index + i + 1]
        else:
            continue
        if (tx + d_offsets[trans_index +
                           i]) < item_ends and tx < MAX_ITEMS_PER_TRANSACTIONS:
            Ts[i, tx] = d_transactions[d_offsets[trans_index + i] + tx]
            #d_transactions[d_offsets[trans_index + i] + tx] += 1

        cuda.syncthreads()

    for mask_id in range(0, int(ceil(num_patterns / 1.0 * cuda.blockDim.x))):
        loop_tx = cuda.threadIdx.x + mask_id * cuda.blockDim.x

        if loop_tx >= num_patterns:
            continue

        for last_seen in range(0, num_patterns):
            if dMask[loop_tx * num_patterns + last_seen] < 0:
                #last_seen += 1
                continue
            item1 = dkeyIndex[loop_tx]
            item2 = dkeyIndex[last_seen]
            for tid in range(0, MAX_TRANSACTIONS_PER_SM):
                flag1 = False
                flag2 = False
                for titem in range(0, MAX_ITEMS_PER_TRANSACTIONS):
                    if Ts[tid, titem] == item1: flag1 = True
                    elif Ts[tid, titem] == item2: flag2 = True

                present_flag = flag1 and flag2
                if present_flag:
                    cuda.atomic.add(dMask, loop_tx * num_patterns + last_seen,
                                    1)
Ejemplo n.º 19
0
def d_msum(a,b):
	sA = cuda.shared.array(shape=(32,32),dtype=float32)
	xidx,yidx = cuda.threadIdx.x,cuda.threadIdx.y
	x,y = cuda.grid(2)
	total = min(cuda.blockDim.y, a.shape[1] - (cuda.blockIdx.y*cuda.blockDim.y))
	s = total/2
	if y+s < a.shape[1]:
		if yidx<s:
			sA[xidx,yidx] = a[x,y] + a[x,y+s]
		cuda.syncthreads()
		if yidx == total-1 and not yidx < s:
			sA[xidx,0] += a[x,y+s]
		cuda.syncthreads()
		last_s = s
		s=s/2
		while s>0:
			if yidx<s:
				sA[xidx,yidx] += sA[xidx,yidx+s]
			cuda.syncthreads()
			if yidx == last_s-1 and not yidx < s:
				sA[xidx,0] += sA[xidx,yidx+s]
			cuda.syncthreads()
			s=s/2
		if yidx==0:
			b[x,cuda.blockIdx.y] = sA[xidx,yidx]
Ejemplo n.º 20
0
def cuda_prefixsum_base2(masks, indices, init, nelem):
    """
    Args
    ----
    nelem:
        Must be power of 2.

    Note
    ----
    Launch 2*nelem threads.  Support 1 block/grid.
    """
    sm = cuda.shared.array((1024,), dtype=numba.int64)
    tid = cuda.threadIdx.x

    # Preload
    if 2 * tid + 1 < nelem:
        sm[2 * tid] = masks[2 * tid]
        sm[2 * tid + 1] = masks[2 * tid + 1]

    # Up phase
    limit = nelem >> 1
    step = 1
    idx = tid * 2
    two_d = 1
    for d in range(3):
        offset = two_d - 1

        if tid < limit:
            sm[offset + idx + step] += sm[offset + idx]

        limit >>= 1
        idx <<= 1
        step <<= 1
        two_d <<= 1

    cuda.syncthreads()

    # Down phase

    if tid == 0:
        sm[nelem - 1] = 0


    cuda.syncthreads()

    # Writeback
    if 2 * tid + 1 < nelem:
        indices[2 * tid] = sm[2 * tid]
        indices[2 * tid + 1] = sm[2 * tid + 1]
def findFrequencyGPU(d_transactions, d_offsets, num_transactions, num_elements, dkeyIndex, dMask, num_patterns):
    Ts = cuda.shared.array(SM_SHAPE, int32)
    tx = cuda.threadIdx.x

    index = tx + cuda.blockDim.x * cuda.blockIdx.x
    trans_index = cuda.blockIdx.x * MAX_TRANSACTIONS_PER_SM

    for i in range(0, MAX_TRANSACTIONS_PER_SM):
        if tx < MAX_ITEMS_PER_TRANSACTIONS:
            Ts[i, tx] = -1

    cuda.syncthreads()

    for i in range(0, MAX_TRANSACTIONS_PER_SM):
        item_ends = num_elements
        if (trans_index + i + 1) == num_transactions:
            item_ends = num_elements
        elif (trans_index + i + 1) < num_transactions:
            item_ends = d_offsets[trans_index + i + 1]
        else:
            continue
        if (tx + d_offsets[trans_index + i]) < item_ends and tx < MAX_ITEMS_PER_TRANSACTIONS:
            Ts[i, tx] = d_transactions[d_offsets[trans_index + i] + tx]
            #d_transactions[d_offsets[trans_index + i] + tx] += 1

        cuda.syncthreads()

    for mask_id in range(0, int(ceil(num_patterns / 1.0 * cuda.blockDim.x))):
        loop_tx = cuda.threadIdx.x + mask_id * cuda.blockDim.x

        if loop_tx >= num_patterns:
            continue

        for last_seen in range(0, num_patterns):
            if dMask[loop_tx * num_patterns + last_seen] < 0:
                #last_seen += 1
                continue
            item1 = dkeyIndex[loop_tx]
            item2 = dkeyIndex[last_seen]
            for tid in range(0, MAX_TRANSACTIONS_PER_SM):
                flag1 = False
                flag2 = False
                for titem in range(0, MAX_ITEMS_PER_TRANSACTIONS):
                    if Ts[tid, titem] == item1: flag1 = True
                    elif Ts[tid, titem] == item2: flag2 = True

                present_flag = flag1 and flag2
                if present_flag:
                    cuda.atomic.add(dMask, loop_tx * num_patterns + last_seen, 1)
Ejemplo n.º 22
0
def cuda_prefixsum_base2(masks, indices, init, nelem):
    """
    Args
    ----
    nelem:
        Must be power of 2.

    Note
    ----
    Launch 2*nelem threads.  Support 1 block/grid.
    """
    sm = cuda.shared.array((1024, ), dtype=numba.int64)
    tid = cuda.threadIdx.x

    # Preload
    if 2 * tid + 1 < nelem:
        sm[2 * tid] = masks[2 * tid]
        sm[2 * tid + 1] = masks[2 * tid + 1]

    # Up phase
    limit = nelem >> 1
    step = 1
    idx = tid * 2
    two_d = 1
    for d in range(3):
        offset = two_d - 1

        if tid < limit:
            sm[offset + idx + step] += sm[offset + idx]

        limit >>= 1
        idx <<= 1
        step <<= 1
        two_d <<= 1

    cuda.syncthreads()

    # Down phase

    if tid == 0:
        sm[nelem - 1] = 0

    cuda.syncthreads()

    # Writeback
    if 2 * tid + 1 < nelem:
        indices[2 * tid] = sm[2 * tid]
        indices[2 * tid + 1] = sm[2 * tid + 1]
Ejemplo n.º 23
0
def sum_reduction(zero_list, tmp_out):
    bw = cuda.blockDim.x
    bx = cuda.blockIdx.x
    tid = cuda.threadIdx.x
    shared_list = cuda.shared.array(shape = (TPB_MAX), dtype = int64)
   
    i = bx*bw + tid
    shared_list[tid] = zero_list[i]
    cuda.syncthreads()

    hop = bw/2
    while hop > 0:
        if tid < hop:
            shared_list[tid] = shared_list[tid] + shared_list[tid+hop]
        cuda.syncthreads()
        hop /= 2
    if tid == 0:
        tmp_out[bx] = shared_list[0]
def Blowfish_encipherG(Text, s, p):
	sS = cuda.shared.array(shape=(1024), dtype=uint32)
	sP = cuda.shared.array(shape=(18), dtype=uint32)

	tid = cuda.threadIdx.x
	gid = cuda.grid(1)
	interval = 1024/tpb

	for i in range (tid*interval, (tid+1)*interval, 1):
		sS[i] = s[i]

	cuda.syncthreads()

	if tid<N+2:
		sP[tid] = p[tid]

	cuda.syncthreads()


	if gid*2+1<len(Text)/4 and tid<cuda.blockDim.x:

		xl = Text[gid*2]
		xr = Text[gid*2+1]

		for j in range (0, N, 1):
			xl = xl ^ sP[j]
			x = xl
			d = x & 0x00FF 
			x >>= 8
			c = x & 0x00FF
			x >>= 8
			b = x & 0x00FF
			x >>= 8  
			a = x & 0x00FF
			y = sS[a] + sS[256+b]
			y = y ^ sS[512+c]
			y = y + sS[768+d]
			xr = y ^ xr
			temp = xl
			xl = xr
			xr = temp

		temp = xl
		xl = xr
		xr = temp

		xr = xr ^ sP[N]
		xl = xl ^ sP[N + 1]

		cuda.syncthreads()
		Text[gid*2] = xl
		Text[gid*2+1] = xr

	cuda.syncthreads()
Ejemplo n.º 25
0
def max_kernel(a, b):
    "Simple implementation of reduction kernel"
    # Allocate static shared memory of 256.
    # This limits the maximum block size to 256.
    sa = cuda.shared.array(shape=(256,), dtype=int32)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    i = tx + bx * bw
    if i < a.shape[0]:
        sa[tx] = a[i]
        if tx == 0:
            # Uses the first thread of each block to perform the actual
            # reduction
            m = sa[tx]
            cuda.syncthreads()
            for j in range(1, bw):
                m = mymax(m, sa[j])
            b[bx] = m
Ejemplo n.º 26
0
def max_kernel(a, b):
    "Simple implementation of reduction kernel"
    # Allocate static shared memory of 256.
    # This limits the maximum block size to 256.
    sa = cuda.shared.array(shape=(256, ), dtype=int32)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    i = tx + bx * bw
    if i < a.shape[0]:
        sa[tx] = a[i]
        if tx == 0:
            # Uses the first thread of each block to perform the actual
            # reduction
            m = sa[tx]
            cuda.syncthreads()
            for j in range(1, bw):
                m = mymax(m, sa[j])
            b[bx] = m
Ejemplo n.º 27
0
def exclusiveScanGPU(aux_d, out_d, in_d, size):
    private_shared_in = cuda.shared.array(SM_SIZE, uint32)
    start = 2 * cuda.blockDim.x * cuda.blockIdx.x
    tx = cuda.threadIdx.x
    index = tx + start
    ############### Put 2 values per each thread into shared memory ##############
    if index < size:
        private_shared_in[tx] = in_d[index]
    else:
        private_shared_in[tx] = 0.0

    if (index + BLOCK_SIZE) < size:
        private_shared_in[tx + BLOCK_SIZE] = in_d[index + BLOCK_SIZE]
    else:
        private_shared_in[tx + BLOCK_SIZE] = 0.0

    cuda.syncthreads()
    ########################### Do the first scan ##############################
    d = 1
    while d <= BLOCK_SIZE:
        tk = 2 * d * (tx + 1) - 1
        if tk < (2 * BLOCK_SIZE):
            private_shared_in[tk] += private_shared_in[tk - d]
        d *= 2
        cuda.syncthreads()

    ############################ Do the second scan #############################

    d = BLOCK_SIZE / 2
    while d > 0:
        tk = 2 * d * (tx + 1) - 1
        if (tk + d) < (2 * BLOCK_SIZE):
            private_shared_in[tk + d] += private_shared_in[tk]
        d /= 2
        cuda.syncthreads()

    #############################################################################

    index += 1

    if index < size:
        out_d[index] = private_shared_in[tx]

    if (index + BLOCK_SIZE) < size and (tx + BLOCK_SIZE) != (2 * BLOCK_SIZE -
                                                             1):
        out_d[index + BLOCK_SIZE] = private_shared_in[tx + BLOCK_SIZE]

    cuda.syncthreads()

    aux_d[cuda.blockIdx.x] = private_shared_in[2 * BLOCK_SIZE - 1]
    out_d[start] = 0.0
def exclusiveScanGPU(aux_d, out_d, in_d, size):
    private_shared_in = cuda.shared.array(SM_SIZE, uint32)
    start = 2 * cuda.blockDim.x * cuda.blockIdx.x
    tx = cuda.threadIdx.x
    index = tx + start
    ############### Put 2 values per each thread into shared memory ##############
    if index < size:
        private_shared_in[tx] = in_d[index]
    else:
        private_shared_in[tx] = 0.0

    if (index + BLOCK_SIZE) < size:
        private_shared_in[tx + BLOCK_SIZE] = in_d[index + BLOCK_SIZE]
    else:
        private_shared_in[tx + BLOCK_SIZE] = 0.0

    cuda.syncthreads()
    ########################### Do the first scan ##############################
    d = 1
    while d <= BLOCK_SIZE:
        tk = 2 * d * (tx + 1) - 1
        if tk < (2 * BLOCK_SIZE):
            private_shared_in[tk] += private_shared_in[tk - d]
        d *= 2
        cuda.syncthreads()

    ############################ Do the second scan #############################

    d = BLOCK_SIZE / 2
    while d > 0:
        tk = 2 * d * (tx + 1) - 1
        if (tk + d) < (2 * BLOCK_SIZE):
            private_shared_in[tk + d] += private_shared_in[tk]
        d /= 2
        cuda.syncthreads()

    #############################################################################

    index += 1

    if index < size:
        out_d[index] = private_shared_in[tx]

    if (index + BLOCK_SIZE) < size and (tx + BLOCK_SIZE) != (2 * BLOCK_SIZE - 1):
        out_d[index + BLOCK_SIZE] = private_shared_in[tx + BLOCK_SIZE]

    cuda.syncthreads()

    aux_d[cuda.blockIdx.x] = private_shared_in[2 * BLOCK_SIZE - 1]
    out_d[start] = 0.0
Ejemplo n.º 29
0
def histogramGPU(input_d, bins_d, num_elements):
    private_bin = cuda.shared.array(BIN_SIZE, uint32)
    tx = cuda.threadIdx.x
    index = cuda.grid(1) #tx + cuda.blockDim.x * cuda.blockIdx.x
    location_x = 0
    for i in range(0, ceil(BIN_SIZE / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < BIN_SIZE:
            private_bin[location_x] = 0

    cuda.syncthreads()

    if index < num_elements and input_d[index] < BIN_SIZE:
        cuda.atomic.add(private_bin, input_d[index], 1)
        #cuda.atomic.add(bins_d, input_d[index], 1)

    cuda.syncthreads()

    for i in range(0, ceil(BIN_SIZE / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < BIN_SIZE:
            cuda.atomic.add(bins_d, location_x, private_bin[location_x])
Ejemplo n.º 30
0
def d_margmax(a,val,idx):
    x = cuda.grid(1)
    tidx = cuda.threadIdx.x
    bidx = cuda.blockIdx.x
    total = min(cuda.blockDim.x, a.shape[1] - (cuda.blockIdx.x*cuda.blockDim.x))
    if (x < a.shape[1]):
		sVal = cuda.shared.array((1024),dtype=float32)
		sIdx = cuda.shared.array((1024),dtype=float32)
		s = total/2
		if x+s < total:
			if a[0,x] > a[0,x+s]:
				sVal[tidx] = a[0,x]
				sIdx[tidx] = x
			else:
				sVal[tidx] = a[0,x+s]
				sIdx[tidx] = x+s
			cuda.syncthreads()
			if total%2 == 1:
				if tidx == total-1:
					if a[0,x] > a[0,x-tidx]:
						sVal[0] = a[0,x]
						sIdx[0] = x
		s = s/2
		cuda.syncthreads()
		while s > 0:
			if tidx+s < total:
				if sVal[tidx] < sVal[tidx+s]:
					sVal[tidx] = sVal[tidx+s]
					sIdx[tidx] = sIdx[tidx+s]
				cuda.syncthreads()
				if s%2 == 1 and s > 1:
					if tidx == s-1:
						if sVal[0] < sVal[tidx]:
							sVal[0] = sVal[tidx]
							sIdx[0] = sIdx[tidx]
			s = s/2
		cuda.syncthreads()
		if tidx == 0:
			val[0,bidx] = sVal[tidx]
			idx[0,bidx] = sIdx[tidx]	
def jocabi_relax_core(A, Anew, error):
    err_sm = cuda.shared.array((tpb, tpb), dtype=f8)

    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y

    n = A.shape[0]
    m = A.shape[1]

    i, j = cuda.grid(2)

    err_sm[ty, tx] = 0
    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * ( A[j, i + 1] + A[j, i - 1] \
                            + A[j - 1, i] + A[j + 1, i])
        err_sm[ty, tx] = Anew[j, i] - A[j, i]

    cuda.syncthreads()

    # max-reduce err_sm vertically
    t = tpb // 2
    while t > 0:
        if ty < t:
            err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty + t, tx])
        t //= 2
        cuda.syncthreads()

    # max-reduce err_sm horizontally
    t = tpb // 2
    while t > 0:
        if tx < t and ty == 0:
            err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty, tx + t])
        t //= 2
        cuda.syncthreads()


    if tx == 0 and ty == 0:
        error[by, bx] = err_sm[0, 0]
def jocabi_relax_core(A, Anew, error):
    err_sm = cuda.shared.array((tpb, tpb), dtype=f8)

    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y

    n = A.shape[0]
    m = A.shape[1]

    i, j = cuda.grid(2)

    err_sm[ty, tx] = 0
    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * ( A[j, i + 1] + A[j, i - 1] \
                            + A[j - 1, i] + A[j + 1, i])
        err_sm[ty, tx] = Anew[j, i] - A[j, i]

    cuda.syncthreads()

    # max-reduce err_sm vertically
    t = tpb // 2
    while t > 0:
        if ty < t:
            err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty + t, tx])
        t //= 2
        cuda.syncthreads()

    # max-reduce err_sm horizontally
    t = tpb // 2
    while t > 0:
        if tx < t and ty == 0:
            err_sm[ty, tx] = get_max(err_sm[ty, tx], err_sm[ty, tx + t])
        t //= 2
        cuda.syncthreads()

    if tx == 0 and ty == 0:
        error[by, bx] = err_sm[0, 0]
def findHigherPatternFrequencyGPU(d_transactions, d_offsets, num_transactions, num_elements, dkeyIndex, dMask, num_patterns, api_d, iil_d, power, size_api_d, size_iil_d):
    Ts = cuda.shared.array(SM_SHAPE, int32)
    tx = cuda.threadIdx.x

    index = tx + cuda.blockDim.x * cuda.blockIdx.x
    trans_index = cuda.blockIdx.x * MAX_TRANSACTIONS_PER_SM

    for i in range(0, MAX_TRANSACTIONS_PER_SM):
        if tx < MAX_ITEMS_PER_TRANSACTIONS:
            Ts[i, tx] = -1

    cuda.syncthreads()

    for i in range(0, MAX_TRANSACTIONS_PER_SM):
        item_ends = num_elements
        if (trans_index + i + 1) == num_transactions:
            item_ends = num_elements
        elif (trans_index + i + 1) < num_transactions:
            item_ends = d_offsets[trans_index + i + 1]
        else:
            continue
        if (tx + d_offsets[trans_index + i]) < item_ends and tx < MAX_ITEMS_PER_TRANSACTIONS:
            Ts[i, tx] = d_transactions[d_offsets[trans_index + i] + tx]
            #d_transactions[d_offsets[trans_index + i] + tx] += 1

        cuda.syncthreads()

    for mask_id in range(0, int(ceil(num_patterns / (1.0 * BLOCK_SIZE)))):
        loop_tx = tx + mask_id * BLOCK_SIZE
        if loop_tx >= num_patterns:
            continue
        for last_seen in range(0, num_patterns):
            if dMask[loop_tx * num_patterns + last_seen] < 0:
                continue
            hp1 = dkeyIndex[loop_tx]
            hp2 = dkeyIndex[last_seen]

            vitem1 = hp1 % (10 ** power)
            vitem2 = hp2 % (10 ** power)

            if ((vitem1 - 1) * 3 + 1) < size_iil_d and ((vitem2 - 1) * 3 + 1) < size_iil_d:
                index_item1 = iil_d[(vitem1 - 1) * 3 + 1]
                index_item2 = iil_d[(vitem2 - 1) * 3 + 1]

                if index_item1 < size_api_d and index_item2 < size_api_d:
                    item1 = api_d[index_item1]
                    item2 = api_d[index_item2]
                else:
                    continue
            else:
                continue

            vcommon_pattern = hp1 / (10 ** power)

            if ((vcommon_pattern - 1) * 3 + 1) < size_iil_d:
                index_vpat1 = iil_d[(vcommon_pattern - 1) * 3 + 1]
                if index_vpat1 < size_api_d:
                    vpat1 = api_d[index_vpat1]#array of max patterns
                else:
                    continue
            else:
                continue

            for tid in range(0, MAX_TRANSACTIONS_PER_SM):
                flag1 = False
                flag2 = False
                fpat1 = False
                for titem in range(0, MAX_ITEMS_PER_TRANSACTIONS):
                    if Ts[tid, titem] == item1: flag1 = True
                    elif Ts[tid, titem] == item2: flag2 = True
                    elif Ts[tid, titem] == vpat1: fpat1 = True

                present_flag = flag1 and flag2 and fpat1
                if present_flag:
                    cuda.atomic.add(dMask, loop_tx * num_patterns + last_seen, 1)
Ejemplo n.º 34
0
def findHigherPatternFrequencyGPU(d_transactions, d_offsets, num_transactions,
                                  num_elements, dkeyIndex, dMask, num_patterns,
                                  api_d, iil_d, power, size_api_d, size_iil_d):
    Ts = cuda.shared.array(SM_SHAPE, int32)
    tx = cuda.threadIdx.x

    index = tx + cuda.blockDim.x * cuda.blockIdx.x
    trans_index = cuda.blockIdx.x * MAX_TRANSACTIONS_PER_SM

    for i in range(0, MAX_TRANSACTIONS_PER_SM):
        if tx < MAX_ITEMS_PER_TRANSACTIONS:
            Ts[i, tx] = -1

    cuda.syncthreads()

    for i in range(0, MAX_TRANSACTIONS_PER_SM):
        item_ends = num_elements
        if (trans_index + i + 1) == num_transactions:
            item_ends = num_elements
        elif (trans_index + i + 1) < num_transactions:
            item_ends = d_offsets[trans_index + i + 1]
        else:
            continue
        if (tx + d_offsets[trans_index +
                           i]) < item_ends and tx < MAX_ITEMS_PER_TRANSACTIONS:
            Ts[i, tx] = d_transactions[d_offsets[trans_index + i] + tx]
            #d_transactions[d_offsets[trans_index + i] + tx] += 1

        cuda.syncthreads()

    for mask_id in range(0, int(ceil(num_patterns / (1.0 * BLOCK_SIZE)))):
        loop_tx = tx + mask_id * BLOCK_SIZE
        if loop_tx >= num_patterns:
            continue
        for last_seen in range(0, num_patterns):
            if dMask[loop_tx * num_patterns + last_seen] < 0:
                continue
            hp1 = dkeyIndex[loop_tx]
            hp2 = dkeyIndex[last_seen]

            vitem1 = hp1 % (10**power)
            vitem2 = hp2 % (10**power)

            if ((vitem1 - 1) * 3 + 1) < size_iil_d and (
                (vitem2 - 1) * 3 + 1) < size_iil_d:
                index_item1 = iil_d[(vitem1 - 1) * 3 + 1]
                index_item2 = iil_d[(vitem2 - 1) * 3 + 1]

                if index_item1 < size_api_d and index_item2 < size_api_d:
                    item1 = api_d[index_item1]
                    item2 = api_d[index_item2]
                else:
                    continue
            else:
                continue

            vcommon_pattern = hp1 / (10**power)

            if ((vcommon_pattern - 1) * 3 + 1) < size_iil_d:
                index_vpat1 = iil_d[(vcommon_pattern - 1) * 3 + 1]
                if index_vpat1 < size_api_d:
                    vpat1 = api_d[index_vpat1]  #array of max patterns
                else:
                    continue
            else:
                continue

            for tid in range(0, MAX_TRANSACTIONS_PER_SM):
                flag1 = False
                flag2 = False
                fpat1 = False
                for titem in range(0, MAX_ITEMS_PER_TRANSACTIONS):
                    if Ts[tid, titem] == item1: flag1 = True
                    elif Ts[tid, titem] == item2: flag2 = True
                    elif Ts[tid, titem] == vpat1: fpat1 = True

                present_flag = flag1 and flag2 and fpat1
                if present_flag:
                    cuda.atomic.add(dMask, loop_tx * num_patterns + last_seen,
                                    1)
Ejemplo n.º 35
0
def batch_k_selection(A, I, k):
    """QuickSelect
    """
    sampleIdx = cuda.blockIdx.x
    tid = cuda.threadIdx.x

    # XXX: hardcoded array size for maximum capability
    values = cuda.shared.array(shape=1000, dtype=float64)
    indices = cuda.shared.array(shape=1000, dtype=int16)
    storeidx = cuda.shared.array(shape=1, dtype=int32)
    rightidx = cuda.shared.array(shape=1, dtype=int32)

    # Prefill cache
    values[tid] = A[tid, sampleIdx]
    indices[tid] = tid
    cuda.syncthreads()

    st = 0
    rst = 0
    n = A.shape[0]
    left = 0
    right = n - 1
    val = 0.0
    ind = 0
    while left < right:
    # for _ in range(1):
        st = -1
        rst = -1

        pivot = right  #(right + left + 1) // 2

        storeidx[0] = left
        rightidx[0] = 0

        pval = values[pivot]

        # Move pivot to the end
        # if tid == 0:
        #     print(7777777)
        #     print(left + 0)
        #     print(right + 0)
        #     print(pivot + 0)
        # swapf(values, right, pivot)
        # swapi(indices, right, pivot)
        cuda.syncthreads()

        # Compare
        if tid >= left and tid < right:
            val = values[tid]
            ind = indices[tid]
            if val < pval:
                st = cuda.atomic.add(storeidx, 0, 1)
            else:
                rst = cuda.atomic.add(rightidx, 0, 1)

        cuda.syncthreads()
        finalpivot = storeidx[0]

        if rst != -1:
            # Assign right partition index
            st = finalpivot + rst

        # Swap
        if st != -1 and st != tid:
            values[st] = val
            indices[st] = ind

        cuda.syncthreads()

        # Move pivot to final destination
        if tid == 0:
            swapf(values, finalpivot, right)
            swapi(indices, finalpivot, right)

        cuda.syncthreads()

        # Adjust range or done
        remain = n - finalpivot
        if remain == k:
            break
        elif remain > k:
            left = finalpivot + 1
        else:
            right = finalpivot - 1

    if tid < k:
        I[tid, sampleIdx] = indices[n - tid - 1]
Ejemplo n.º 36
0
def selfJoinGPU(input_d, output_d, num_elements, power):
    tx = cuda.threadIdx.x
    #index = tx + cuda.blockIdx.x * cuda.blockDim.x
    start = cuda.blockIdx.x * MAX_ITEM_PER_SM

    sm1 = cuda.shared.array(MAX_ITEM_PER_SM, int32)
    sm2 = cuda.shared.array(MAX_ITEM_PER_SM, int32)

    actual_items_per_sm = num_elements - start

    if actual_items_per_sm >= MAX_ITEM_PER_SM:
        actual_items_per_sm = MAX_ITEM_PER_SM

    for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < actual_items_per_sm and (start +
                                                 location_x) < num_elements:
            sm1[location_x] = input_d[start + location_x]
        else:
            sm1[location_x] = 0
        if cuda.blockIdx.x == 2 and sm1[location_x] == 405:
            print -3
            print location_x
            print -3

    cuda.syncthreads()

    for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))):
        loop_tx = tx + i * BLOCK_SIZE
        if loop_tx < actual_items_per_sm:
            for j in range(loop_tx + 1, actual_items_per_sm):
                if (sm1[loop_tx] / (10**power)) == (sm1[j] / (10**power)):
                    output_d[(start + loop_tx) * num_elements +
                             (start + j)] = 0
                # else:
                #      output_d[(start + loop_tx) * num_elements + (start + j)] = -1

    cuda.syncthreads()

    # if (cuda.blockIdx.x + 1) < ceil(num_elements / (1.0 * MAX_ITEM_PER_SM)):
    #     pass
    current_smid = 0
    for smid in range(cuda.blockIdx.x + 1,
                      ceil(num_elements / (1.0 * MAX_ITEM_PER_SM))):
        actual_items_per_secondary_sm = num_elements - current_smid * MAX_ITEM_PER_SM - start - MAX_ITEM_PER_SM
        if actual_items_per_secondary_sm > MAX_ITEM_PER_SM:
            actual_items_per_secondary_sm = MAX_ITEM_PER_SM
        for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))):
            location_x = tx + i * BLOCK_SIZE
            temp = sm1[3]
            if location_x < actual_items_per_secondary_sm and (
                    current_smid * MAX_ITEM_PER_SM + start +
                    location_x) < num_elements:
                if cuda.blockIdx.x == 2 and tx == 0:
                    print 99
                    print sm1[3]
                    print 99
                sm2[location_x] = input_d[(current_smid + 1) * MAX_ITEM_PER_SM
                                          + start + location_x]
            else:
                sm2[location_x] = 0
            if cuda.blockIdx.x == 2 and tx == 0:
                print 100
                print sm1[3]
                print 100
            if cuda.blockIdx.x == 2 and sm2[location_x] == 406:
                print -2
                print location_x
                print sm2[location_x]
                print sm1[0]
                print sm1[1]
                print sm1[2]
                print sm1[3]
                print -2
        cuda.syncthreads()
        if cuda.blockIdx.x == 2:
            sm1[3] = 405
        cuda.syncthreads()

        for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))):
            loop_tx = tx + i * BLOCK_SIZE

            if sm1[loop_tx] == 405:
                print -1
                print sm2[0]
                print -1

            if loop_tx < actual_items_per_sm:
                j = 0
                while j < actual_items_per_secondary_sm:
                    if (sm1[loop_tx] / (10**power)) == (sm2[j] / (10**power)):
                        output_d[(start + loop_tx) * num_elements +
                                 (current_smid + 1) * MAX_ITEM_PER_SM + start +
                                 j] = 0
                    # else:
                    #     output_d[(start + loop_tx) * num_elements + smid * MAX_ITEM_PER_SM + start + j] = -1
                    j += 1
        current_smid += 1
def RadixGPU(in_d, out_d, in_size):
    private_shared_in = cuda.shared.array(SM_SIZE, uint32)
    private_split = cuda.shared.array(SM_SIZE, uint32)
    private_scan = cuda.shared.array(SM_SIZE, uint32)

    start = 2 * cuda.blockDim.x * cuda.blockIdx.x
    tx = cuda.threadIdx.x
    index = tx + start

    ############### Put 2 values per each thread into shared memory ##############
    if index < in_size:
        private_shared_in[tx] = in_d[index]
    else:
        private_shared_in[tx] = 2 ** (DATA_TYPE - 1) #0xffffffff

    if (index + BLOCK_SIZE) < in_size:
        private_shared_in[tx + BLOCK_SIZE] = in_d[index + BLOCK_SIZE]
    else:
        private_shared_in[tx + BLOCK_SIZE] = 2 ** (DATA_TYPE - 1) #0xffffffff

    cuda.syncthreads()

    total_falses = 0.0
    t = 0
    f = 0
    bit = 0
    d = 1
    for bit_shift in range(0, DATA_TYPE):
        bit = private_shared_in[tx] & (1 << bit_shift)
        if bit > 0:
            bit = 1
        private_split[tx] = 1 - bit
        private_scan[tx] = 1 - bit

        bit = private_shared_in[tx + BLOCK_SIZE] & (1 << bit_shift)
        if bit > 0:
            bit = 1
        private_split[tx + BLOCK_SIZE] = 1 - bit
        private_scan[tx + BLOCK_SIZE] = 1 - bit

        cuda.syncthreads()

        ########################### Do the first scan ##############################
        d = 1
        while d <= BLOCK_SIZE:
            tk = 2 * d * (tx + 1) - 1
            if tk < (2 * BLOCK_SIZE):
                private_scan[tk] += private_scan[tk - d]
            d *= 2
            cuda.syncthreads()

        ############################ Do the second scan #############################

        d = BLOCK_SIZE / 2
        while d > 0:
            tk = 2 * d * (tx + 1) - 1
            if (tk + d) < (2 * BLOCK_SIZE):
                private_scan[tk + d] += private_scan[tk]
            d /= 2
            cuda.syncthreads()

        #############################################################################
        # temp_index = tx + 1
        # if index < in_size:
        #     private_split_ex[temp_index] = private_split[tx]
        # if (index + BLOCK_SIZE) < in_size and (tx + BLOCK_SIZE) != (2 * BLOCK_SIZE - 1):
        #     private_split_ex[temp_index + BLOCK_SIZE] = private_split[tx + BLOCK_SIZE]
        # total_falses = private_split[2 * BLOCK_SIZE - 1]
        # private_split_ex[start] = 0.0

        total_falses = private_scan[SM_SIZE - 1]
        t = total_falses
        f = 0
        if tx != 0:
            t = tx - private_scan[tx - 1] + total_falses
            f = private_scan[tx - 1]
        if private_split[tx] == 1:
            private_split[tx] = f
        else:
            private_split[tx] = t

        t = (tx + BLOCK_SIZE) - private_scan[tx + BLOCK_SIZE - 1] + total_falses
        f = private_scan[tx + BLOCK_SIZE - 1]
        if private_split[tx + BLOCK_SIZE] == 1:
            private_split[tx + BLOCK_SIZE] = f
        else:
            private_split[tx + BLOCK_SIZE] = t

        cuda.syncthreads()

        private_scan[private_split[tx]] = private_shared_in[tx]
        private_scan[private_split[tx + BLOCK_SIZE]] = private_shared_in[tx + BLOCK_SIZE]

        cuda.syncthreads()

        private_shared_in[tx] = private_scan[tx]
        private_shared_in[tx + BLOCK_SIZE] = private_scan[tx + BLOCK_SIZE]

        cuda.syncthreads()

    if index < in_size:
        out_d[index] = private_shared_in[tx]

    if (index + BLOCK_SIZE) < in_size:
        out_d[index + BLOCK_SIZE] = private_shared_in[tx + BLOCK_SIZE]
def selfJoinGPU(input_d, output_d, num_elements, power):
    tx = cuda.threadIdx.x
    #index = tx + cuda.blockIdx.x * cuda.blockDim.x
    start = cuda.blockIdx.x * MAX_ITEM_PER_SM

    sm1 = cuda.shared.array(MAX_ITEM_PER_SM, int32)
    sm2 = cuda.shared.array(MAX_ITEM_PER_SM, int32)

    actual_items_per_sm = num_elements - start

    if actual_items_per_sm >= MAX_ITEM_PER_SM:
        actual_items_per_sm = MAX_ITEM_PER_SM

    for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))):
        location_x = tx + i * BLOCK_SIZE
        if location_x < actual_items_per_sm and (start + location_x) < num_elements:
            sm1[location_x] = input_d[start + location_x]
        else:
            sm1[location_x] = 0
        if cuda.blockIdx.x == 2 and sm1[location_x] == 405:
            print -3
            print location_x
            print -3

    cuda.syncthreads()


    for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))):
        loop_tx = tx + i * BLOCK_SIZE
        if loop_tx < actual_items_per_sm:
            for j in range(loop_tx + 1, actual_items_per_sm):
                if (sm1[loop_tx] / (10 ** power)) == (sm1[j] / (10 ** power)):
                    output_d[(start + loop_tx) * num_elements + (start + j)] = 0
                # else:
                #      output_d[(start + loop_tx) * num_elements + (start + j)] = -1

    cuda.syncthreads()

    # if (cuda.blockIdx.x + 1) < ceil(num_elements / (1.0 * MAX_ITEM_PER_SM)):
    #     pass
    current_smid = 0
    for smid in range(cuda.blockIdx.x + 1, ceil(num_elements / (1.0 * MAX_ITEM_PER_SM))):
        actual_items_per_secondary_sm = num_elements - current_smid * MAX_ITEM_PER_SM - start - MAX_ITEM_PER_SM
        if actual_items_per_secondary_sm > MAX_ITEM_PER_SM:
            actual_items_per_secondary_sm = MAX_ITEM_PER_SM
        for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))):
            location_x = tx + i * BLOCK_SIZE
            temp = sm1[3]
            if location_x < actual_items_per_secondary_sm and (current_smid * MAX_ITEM_PER_SM + start + location_x) < num_elements:
                if cuda.blockIdx.x == 2 and tx == 0:
                    print 99
                    print sm1[3]
                    print 99
                sm2[location_x] = input_d[(current_smid + 1) * MAX_ITEM_PER_SM + start + location_x]
            else:
                sm2[location_x] = 0
            if cuda.blockIdx.x == 2 and tx == 0:
                print 100
                print sm1[3]
                print 100
            if cuda.blockIdx.x == 2 and sm2[location_x] == 406:
                print -2
                print location_x
                print sm2[location_x]
                print sm1[0]
                print sm1[1]
                print sm1[2]
                print sm1[3]
                print -2
        cuda.syncthreads()
        if cuda.blockIdx.x == 2:
            sm1[3] = 405
        cuda.syncthreads()

        for i in range(0, ceil(MAX_ITEM_PER_SM / (1.0 * BLOCK_SIZE))):
            loop_tx = tx + i * BLOCK_SIZE

            if sm1[loop_tx] == 405:
                print -1
                print sm2[0]
                print -1

            if loop_tx < actual_items_per_sm:
                j = 0
                while j < actual_items_per_secondary_sm:
                    if (sm1[loop_tx] / (10 ** power)) == (sm2[j] / (10 ** power)):
                        output_d[(start + loop_tx) * num_elements + (current_smid + 1) * MAX_ITEM_PER_SM + start + j] = 0
                    # else:
                    #     output_d[(start + loop_tx) * num_elements + smid * MAX_ITEM_PER_SM + start + j] = -1
                    j += 1
        current_smid += 1
def RadixGPU(in_d, out_d, in_size):
    private_shared_in = cuda.shared.array(SM_SIZE, uint32)
    private_split = cuda.shared.array(SM_SIZE, uint32)
    private_scan = cuda.shared.array(SM_SIZE, uint32)

    start = 2 * cuda.blockDim.x * cuda.blockIdx.x
    tx = cuda.threadIdx.x
    index = tx + start

    ############### Put 2 values per each thread into shared memory ##############
    if index < in_size:
        private_shared_in[tx] = in_d[index]
    else:
        private_shared_in[tx] = 2**(DATA_TYPE - 1)  #0xffffffff

    if (index + BLOCK_SIZE) < in_size:
        private_shared_in[tx + BLOCK_SIZE] = in_d[index + BLOCK_SIZE]
    else:
        private_shared_in[tx + BLOCK_SIZE] = 2**(DATA_TYPE - 1)  #0xffffffff

    cuda.syncthreads()

    total_falses = 0.0
    t = 0
    f = 0
    bit = 0
    d = 1
    for bit_shift in range(0, DATA_TYPE):
        bit = private_shared_in[tx] & (1 << bit_shift)
        if bit > 0:
            bit = 1
        private_split[tx] = 1 - bit
        private_scan[tx] = 1 - bit

        bit = private_shared_in[tx + BLOCK_SIZE] & (1 << bit_shift)
        if bit > 0:
            bit = 1
        private_split[tx + BLOCK_SIZE] = 1 - bit
        private_scan[tx + BLOCK_SIZE] = 1 - bit

        cuda.syncthreads()

        ########################### Do the first scan ##############################
        d = 1
        while d <= BLOCK_SIZE:
            tk = 2 * d * (tx + 1) - 1
            if tk < (2 * BLOCK_SIZE):
                private_scan[tk] += private_scan[tk - d]
            d *= 2
            cuda.syncthreads()

        ############################ Do the second scan #############################

        d = BLOCK_SIZE / 2
        while d > 0:
            tk = 2 * d * (tx + 1) - 1
            if (tk + d) < (2 * BLOCK_SIZE):
                private_scan[tk + d] += private_scan[tk]
            d /= 2
            cuda.syncthreads()

        #############################################################################
        # temp_index = tx + 1
        # if index < in_size:
        #     private_split_ex[temp_index] = private_split[tx]
        # if (index + BLOCK_SIZE) < in_size and (tx + BLOCK_SIZE) != (2 * BLOCK_SIZE - 1):
        #     private_split_ex[temp_index + BLOCK_SIZE] = private_split[tx + BLOCK_SIZE]
        # total_falses = private_split[2 * BLOCK_SIZE - 1]
        # private_split_ex[start] = 0.0

        total_falses = private_scan[SM_SIZE - 1]
        t = total_falses
        f = 0
        if tx != 0:
            t = tx - private_scan[tx - 1] + total_falses
            f = private_scan[tx - 1]
        if private_split[tx] == 1:
            private_split[tx] = f
        else:
            private_split[tx] = t

        t = (tx + BLOCK_SIZE) - private_scan[tx + BLOCK_SIZE -
                                             1] + total_falses
        f = private_scan[tx + BLOCK_SIZE - 1]
        if private_split[tx + BLOCK_SIZE] == 1:
            private_split[tx + BLOCK_SIZE] = f
        else:
            private_split[tx + BLOCK_SIZE] = t

        cuda.syncthreads()

        private_scan[private_split[tx]] = private_shared_in[tx]
        private_scan[private_split[tx +
                                   BLOCK_SIZE]] = private_shared_in[tx +
                                                                    BLOCK_SIZE]

        cuda.syncthreads()

        private_shared_in[tx] = private_scan[tx]
        private_shared_in[tx + BLOCK_SIZE] = private_scan[tx + BLOCK_SIZE]

        cuda.syncthreads()

    if index < in_size:
        out_d[index] = private_shared_in[tx]

    if (index + BLOCK_SIZE) < in_size:
        out_d[index + BLOCK_SIZE] = private_shared_in[tx + BLOCK_SIZE]