def reduce_by_key(input_data, chunk_id, literal, length):  # step 3
    """Run the flag/reduce kernels and keep only the flagged entries.

    ``produce_flag`` fills a per-element flag array, ``reduce_by_key_gpu``
    is then launched with hop sizes 1, 2, 4, 8 and 16, and finally the
    entries whose flag is non-zero are gathered on the host.

    Args:
        input_data: Indexable host sequence; passed to ``produce_flag`` and
            filtered for the result.
        chunk_id: Host array uploaded to the device for ``produce_flag``.
        literal: Host array uploaded to the device and handed to
            ``reduce_by_key_gpu``.
        length: Number of leading elements to process.

    Returns:
        Tuple ``(reduced_input_data, reduced_chunk_id, reduced_literal)``
        with the entries at every index ``i < length`` whose flag is
        non-zero. The first two are numpy arrays, the last one is a list.
    """
    flag = numpy.ones(length, dtype='int32')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)
    produce_flag[1, tpb](input_data, d_chunk_id, length, d_flag)
    d_flag.to_host(stream)
    # The copy back is only queued on the stream: wait for it before the
    # host reads ``flag``, otherwise the debug output may show stale data.
    stream.synchronize()
    print('flag:')
    print(flag)

    is_finish = numpy.zeros(length, dtype='int32')
    hop = 1
    while hop < 32:  # only 32 because the length of a word in binary form is 32
        reduce_by_key_gpu[1, tpb](d_literal, d_flag, is_finish, hop, length)
        hop *= 2
    # NOTE(review): the host arrays are read below, so this relies on
    # to_host() writing back into the arrays given to to_device() - confirm.
    d_literal.to_host(stream)
    d_chunk_id.to_host(stream)
    stream.synchronize()

    kept = [i for i in xrange(length) if flag[i]]
    reduced_input_data = [input_data[i] for i in kept]
    reduced_chunk_id = [chunk_id[i] for i in kept]
    reduced_literal = [literal[i] for i in kept]
    return (numpy.array(reduced_input_data), numpy.array(reduced_chunk_id),
            reduced_literal)
Example #2
0
def tests():
    """Time matrix products and element-wise tanh on the CPU and the GPU.

    Runs, in order: a numpy ``dot``, a cuBLAS ``gemm`` through ``blas``,
    the ``mtanh`` kernel, and the ``cm`` (``CUDAMatrix``) ``tanh`` and
    ``dot`` routines, printing the wall-clock time of each.

    NOTE(review): the timers around the GPU calls stop without waiting for
    the device. If those calls are asynchronous, the printed figures
    measure launch overhead only - confirm before comparing numbers.
    """
    a = np.random.rand(300, 500)
    b = np.random.rand(500, 300)

    start = timer()
    np.dot(a, b)
    nptime = timer() - start
    print('nptime', nptime)

    x = np.array(np.random.rand(600, 1500), dtype='float32', order='F')
    y = np.array(np.random.rand(1500, 300), dtype='float32', order='F')
    # GEMM multiplies an (m x k) matrix by a (k x n) matrix into an (m x n)
    # result. Take the sizes from the operands: hard-coded sizes that do
    # not match the arrays make the library read and write out of bounds.
    m, k = x.shape
    n = y.shape[1]
    z = np.zeros((m, n), order='F', dtype='float32')

    stream = cuda.stream()

    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dz = cuda.to_device(z)

    start = timer()
    blas.gemm('N', 'N', m, n, k, 1.0, dx, dy, 0.0, dz)
    cutime = timer() - start
    print('cutime', cutime)

    print(dz[0])

    c = np.ones((1000, 1000), order='F', dtype='float32')
    print(c.shape)
    dc = cuda.to_device(c)

    blockDim = (30, 30)
    # Ceiling division with // so the grid size stays an integer on both
    # Python 2 and Python 3.
    gridDim = ((c.shape[0] + blockDim[0] - 1) // blockDim[0],
               (c.shape[1] + blockDim[1] - 1) // blockDim[1])

    start = timer()
    mtanh[gridDim, blockDim, stream](dc)
    tantime = timer() - start
    print('tantime', tantime)

    dc.copy_to_host(c, stream=stream)
    stream.synchronize()
    print(c)

    y = cm.CUDAMatrix(np.ones((1000, 1000)))

    start = timer()
    cm.tanh(y)
    cmtan = timer() - start
    print('cmtan', cmtan)

    x = cm.CUDAMatrix(np.random.rand(1000, 1500))
    y = cm.CUDAMatrix(np.random.rand(1500, 1000))

    start = timer()
    cm.dot(x, y)
    cmtime = timer() - start
    print('cmtime', cmtime)
Example #3
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Run ``cu_monte_carlo_pricer`` over ``paths`` with one kernel launch.

    Fills a device buffer with normal draws (MRG32K3A generator) and a
    second one with Sobol values, uploads ``paths``, launches the kernel
    with one thread per row of ``paths`` (512 threads per block) and
    copies the device copy of ``paths`` back. Nothing is returned.

    Args:
        paths: Host array; ``paths.shape[0]`` sets the number of threads.
        dt: Time step, forwarded to the kernel.
        interest: Interest rate used for the ``c0`` coefficient.
        volatility: Volatility used for the ``c0`` and ``c1`` coefficients.
    """
    threads_per_block = 512
    num_paths = paths.shape[0]
    num_blocks = int(math.ceil(float(num_paths) / threads_per_block))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    # Device-side scratch buffers, one entry per path.
    d_normdist = cuda.device_array(num_paths, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(num_paths, dtype=np.uint32, stream=stream)

    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    # Coefficients computed once on the host and passed to the kernel.
    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    launch = cu_monte_carlo_pricer[(num_blocks, 1),
                                   (threads_per_block, 1, 1),
                                   stream]
    launch(d_paths, dt, c0, c1, d_normdist, d_seed)

    # Queue the copy back, then block until the stream has drained.
    d_paths.to_host(stream)
    stream.synchronize()
Example #4
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance ``paths`` one column at a time on the GPU.

    For every column ``j >= 1`` a fresh batch of normal draws is generated,
    ``step`` is applied to the previous column and the result is copied
    into ``paths[:, j]``. Device buffers come from an ``MM`` pool. Nothing
    is returned.

    Args:
        paths: 2-D host array; column 0 is the starting state.
        dt: Time step, forwarded to ``step``.
        interest: Interest rate used for the ``c0`` coefficient.
        volatility: Volatility used for the ``c0`` and ``c1`` coefficients.
    """
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    # Launch geometry is computed but not used below: ``step`` is called
    # directly with a stream rather than with a grid/block configuration.
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Device buffer refilled with normal draws on every step.
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    d_prev = cuda.to_device(paths[:, 0], to=mm.get())
    for col in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        host_col = paths[:, col]
        d_curr = cuda.to_device(host_col, stream=stream, to=mm.get())
        step(d_prev, dt, c0, c1, d_normdist, out=d_curr, stream=stream)
        d_curr.copy_to_host(host_col, stream=stream)
        # The previous column is no longer needed; hand its buffer back.
        mm.free(d_prev)
        d_prev = d_curr

    stream.synchronize()
Example #5
0
def reduce_by_key(input_data, chunk_id, literal, length):  # step 3
    """Run the flag/reduce kernels and keep only the flagged entries.

    ``produce_flag`` fills a per-element flag array, ``reduce_by_key_gpu``
    is then launched with hop sizes 1, 2, 4, 8 and 16, and finally the
    entries whose flag is non-zero are gathered on the host.

    Args:
        input_data: Indexable host sequence; passed to ``produce_flag`` and
            filtered for the result.
        chunk_id: Host array uploaded to the device for ``produce_flag``.
        literal: Host array uploaded to the device and handed to
            ``reduce_by_key_gpu``.
        length: Number of leading elements to process.

    Returns:
        Tuple ``(reduced_input_data, reduced_chunk_id, reduced_literal)``
        with the entries at every index ``i < length`` whose flag is
        non-zero. The first two are numpy arrays, the last one is a list.
    """
    flag = numpy.ones(length, dtype='int32')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)
    produce_flag[1, tpb](input_data, d_chunk_id, length, d_flag)
    d_flag.to_host(stream)
    # The copy back is only queued on the stream: wait for it before the
    # host reads ``flag``, otherwise the debug output may show stale data.
    stream.synchronize()
    print('flag:')
    print(flag)

    is_finish = numpy.zeros(length, dtype='int32')
    hop = 1
    while hop < 32:  # only 32 because the length of a word in binary form is 32
        reduce_by_key_gpu[1, tpb](d_literal, d_flag, is_finish, hop, length)
        hop *= 2
    # NOTE(review): the host arrays are read below, so this relies on
    # to_host() writing back into the arrays given to to_device() - confirm.
    d_literal.to_host(stream)
    d_chunk_id.to_host(stream)
    stream.synchronize()

    kept = [i for i in xrange(length) if flag[i]]
    reduced_input_data = [input_data[i] for i in kept]
    reduced_chunk_id = [chunk_id[i] for i in kept]
    reduced_literal = [literal[i] for i in kept]
    return (numpy.array(reduced_input_data), numpy.array(reduced_chunk_id),
            reduced_literal)
Example #6
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Run ``cu_monte_carlo_pricer`` over ``paths`` with one kernel launch.

    Fills a device buffer with normal draws (MRG32K3A generator) and a
    second one with Sobol values, uploads ``paths``, launches the kernel
    with one thread per row of ``paths`` (512 threads per block) and
    copies the device copy of ``paths`` back. Nothing is returned.

    Args:
        paths: Host array; ``paths.shape[0]`` sets the number of threads.
        dt: Time step, forwarded to the kernel.
        interest: Interest rate used for the ``c0`` coefficient.
        volatility: Volatility used for the ``c0`` and ``c1`` coefficients.
    """
    threads_per_block = 512
    num_paths = paths.shape[0]
    num_blocks = int(math.ceil(float(num_paths) / threads_per_block))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    # Device-side scratch buffers, one entry per path.
    d_normdist = cuda.device_array(num_paths, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(num_paths, dtype=np.uint32, stream=stream)

    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    # Coefficients computed once on the host and passed to the kernel.
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    launch = cu_monte_carlo_pricer[(num_blocks, 1),
                                   (threads_per_block, 1, 1),
                                   stream]
    launch(d_paths, dt, c0, c1, d_normdist, d_seed)

    # Queue the copy back, then block until the stream has drained.
    d_paths.to_host(stream)
    stream.synchronize()
Example #7
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance ``paths`` one column at a time with the ``step`` kernel.

    For every column ``j >= 1`` a fresh batch of normal draws is generated,
    the configured ``step`` kernel is launched on the previous column and
    the result is copied into ``paths[:, j]``. Device buffers come from an
    ``MM`` pool. Nothing is returned.

    Args:
        paths: 2-D host array; column 0 is the starting state.
        dt: Time step, forwarded to the kernel.
        interest: Interest rate used for the ``c0`` coefficient.
        volatility: Volatility used for the ``c0`` and ``c1`` coefficients.
    """
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    threads_per_block = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    num_blocks = int(math.ceil(float(n) / threads_per_block))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Device buffer refilled with normal draws on every step.
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Bind grid size, block size and stream once; reused for every column.
    launch_step = step[num_blocks, threads_per_block, stream]

    d_prev = cuda.to_device(paths[:, 0], to=mm.get())
    for col in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        host_col = paths[:, col]
        d_curr = cuda.to_device(host_col, stream=stream, to=mm.get())
        launch_step(d_prev, d_curr, dt, c0, c1, d_normdist)
        d_curr.copy_to_host(host_col, stream=stream)
        # The previous column is no longer needed; hand its buffer back.
        mm.free(d_prev, stream=stream)
        d_prev = d_curr

    stream.synchronize()
Example #8
0
def radix_sort(arr, rid):
    """Drive the GPU kernels of a radix sort over ``arr`` and ``rid``.

    Each loop iteration uploads the arrays, launches ``get_list`` for
    position ``i``, sums ``zero_list`` with ``sum_reduction``, calls
    ``Blelloch_scan_caller`` and finally launches ``array_adjust``.

    Args:
        arr: Host sequence of keys; ``len(arr)`` sets the problem size.
        rid: Host array passed alongside ``arr`` (presumably row ids that
            are permuted together with the keys - TODO confirm).

    Returns:
        None. There is no return statement; presumably ``array_adjust``
        updates ``arr`` / ``rid`` in place - verify against the kernel.
    """
    length = numpy.int64(len(arr))
    # NOTE(review): bin() returns a string with a '0b' prefix, so these
    # lengths are two larger than the actual number of bits - confirm
    # whether the resulting 4x larger thread_num is intended.
    bin_length = max(len(bin(length-1)),len(bin(TPB_MAX-1)))#the bit number of binary form of array length
    thread_num = numpy.int64(math.pow(2,bin_length))
    block_num = max(thread_num/TPB_MAX,1)

    stream = cuda.stream()
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')

    # Same '0b' prefix remark as above: two more passes than bits.
    iter_num = len(bin(ATTR_CARD_MAX))
    for i in range(iter_num):
        # Fresh uploads of the host arrays on every pass.
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list,stream)
        d_one_list = cuda.to_device(one_list,stream)
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)#get one_list and zero_list
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        stream.synchronize()
        
        # Per-block partial sums of zero_list, added up on the host below.
        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()
        base = 0 #base for the scan of one_list
        for j in xrange(base_reduction_block_num):
            base += tmp_out[j]

        Blelloch_scan_caller(d_zero_list, d_one_list, base)

        # Both the host arrays and their device copies are handed to the kernel.
        array_adjust[block_num,TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
def main():
    """Run a Jacobi relaxation on a 4096 x 4096 mesh on the GPU.

    Iterates ``jocabi_relax_core`` until the largest absolute entry of the
    error grid is at most ``tol`` or ``iter_max`` sweeps have run, printing
    progress every 100 sweeps and the total run time at the end.
    """
    NN = 4096
    NM = 4096
    n, m = NN, NM

    iter_max = 1000
    tol = 1.0e-6
    error = 1.0

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)
    # First column of both meshes is fixed to 1.0.
    A[:, 0] = 1.0
    Anew[:, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    started = time.time()
    sweep = 0

    blockdim = (tpb, tpb)
    griddim = (NN/blockdim[0], NM/blockdim[1])

    # One error entry per block of the launch grid.
    error_grid = np.zeros(griddim)

    stream = cuda.stream()

    # The meshes are uploaded once and never copied back.
    dA = cuda.to_device(A, stream)
    dAnew = cuda.to_device(Anew, stream)
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and sweep < iter_max:
        assert error_grid.dtype == np.float64

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

        derror_grid.to_host(stream)
        # After this the host can safely read error_grid.
        stream.synchronize()

        error = np.abs(error_grid).max()

        # The freshly written mesh becomes the input of the next sweep.
        dA, dAnew = dAnew, dA

        if sweep % 100 == 0:
            print("%5d, %0.6f (elapsed: %f s)" % (sweep, error, time.time()-started))

        sweep += 1

    runtime = time.time() - started
    print(" total: %f s" % runtime)
Example #10
0
def main():
    """Run a Jacobi relaxation on a 4096 x 4096 mesh on the GPU.

    Iterates ``jocabi_relax_core`` with 32 x 32 thread blocks until the
    largest absolute entry of the error grid is at most ``tol`` or
    ``iter_max`` sweeps have run, printing progress every 100 sweeps and
    the total run time at the end.
    """
    NN = 4096
    NM = 4096
    n, m = NN, NM

    iter_max = 1000
    tol = 1.0e-6
    error = 1.0

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)
    # First column of both meshes is fixed to 1.0.
    A[:, 0] = 1.0
    Anew[:, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    started = time.time()
    sweep = 0

    blockdim = (32, 32)
    griddim = (NN / blockdim[0], NM / blockdim[1])

    # Error buffer with the same shape and dtype as the mesh.
    error_grid = np.zeros_like(A)

    stream = cuda.stream()

    # The meshes are uploaded once and never copied back.
    dA = cuda.to_device(A, stream)
    dAnew = cuda.to_device(Anew, stream)
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and sweep < iter_max:
        assert error_grid.dtype == np.float64

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

        derror_grid.to_host(stream)
        # After this the host can safely read error_grid.
        stream.synchronize()

        error = np.abs(error_grid).max()

        # The freshly written mesh becomes the input of the next sweep.
        dA, dAnew = dAnew, dA

        if sweep % 100 == 0:
            elapsed = time.time() - started
            print("%5d, %0.6f (elapsed: %f s)" % (sweep, error, elapsed))

        sweep += 1

    runtime = time.time() - started
    print(" total: %f s" % runtime)
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance ``paths`` column by column, split across two CUDA streams.

    The rows of ``paths`` are divided into ``num_streams`` contiguous
    partitions. Each partition gets its own stream, random generator and
    configured ``cu_step`` kernel. For every column ``j >= 1`` each
    partition draws normals, runs the step kernel on its previous column
    and copies the result into ``paths[s:e, j]``. Nothing is returned.

    Args:
        paths: 2-D host array; column 0 is the starting state.
        dt: Time step, forwarded to the kernel.
        interest: Interest rate used for the ``c0`` coefficient.
        volatility: Volatility used for the ``c0`` and ``c1`` coefficients.
    """
    n = paths.shape[0]
    num_streams = 2

    # Split the rows into equally wide partitions; the last one is clipped
    # to the number of rows.
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, d_last, d_paths, d_norm in zip(steplist, d_lastlist,
                                                 d_pathslist, d_normlist):
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # Free each partition's own previous buffer on its own stream.
        # Reusing the loop variable left over from the kernel loop would
        # free the last partition's buffer once per stream and never
        # return the other buffers to the pool.
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #12
0
def get_indexList(path, attr_selected):
    """Build the index list for the selected attribute values on the GPU.

    Loads the attribute map and the bitmap data from the two pickle files
    resolved by ``bitmap_pickle.get_pic_path``, translates the selected
    attribute values into their mapped numbers and launches ``index_gpu``.

    Args:
        path: Passed to ``bitmap_pickle.get_pic_path`` to locate the files.
        attr_selected: Per-attribute sequences of selected values; the
            value ``'All'`` is recorded as ``-1``.

    Returns:
        Tuple ``(index_list, elapsed_seconds)``. ``index_list`` is an int32
        numpy array; it is empty when the input is rejected.
    """
    path1, path2, attr_num = bitmap_pickle.get_pic_path(path)
    # NOTE: pickle.load can execute arbitrary code from the file - only
    # point this at files produced by this project.
    with open(path1, 'rb') as f1:  # read data_map.pkl
        attr_map = pickle.load(f1)
        pickle.load(f1)  # attr_list: unused, read to reach the next record
        attr_total = pickle.load(f1)

    with open(path2, 'rb') as f2:  # read bitmap_pic.pkl
        lists = pickle.load(f2)
        key = pickle.load(f2)
        offset = pickle.load(f2)

    # attr_input is a list that stores the numbers of input attributes
    # attr_num is the total number of attributes
    # attr_total is the total number of data/31
    attr_input = [[] for _ in xrange(attr_num)]
    for i in xrange(attr_num):
        for attri in attr_selected[i]:
            if attri in attr_map[i]:
                attr_input[i].append(attr_map[i][attri])
            elif attri == 'All':
                attr_input[i].append(-1)
        # 'All' is redundant next to explicit values.
        if len(attr_input[i]) > 1 and (-1 in attr_input[i]):
            attr_input[i].remove(-1)
    print(attr_input)

    search_start_time = time.time()
    # Defined up front so the rejection branch returns an empty result
    # instead of failing on an unbound name.
    index_list = numpy.zeros(0, dtype='int32')
    # NOTE(review): attr_input is built with exactly attr_num entries, so
    # this check can never fail as written - confirm the intended test.
    if len(attr_input) != attr_num:  # there might be a wrong input in input_test.py
        print('No eligible projects')
    else:
        tpb = 1024
        blocknum = 1
        # attr_mul is the number that each thread need to be performed
        attr_mul = (attr_total + (tpb * blocknum - 1)) // (tpb * blocknum)
        index_list = numpy.zeros(attr_total * 31, dtype='int32')
        bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key,
                               offset)
        stream = cuda.stream()
        d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream)
        d_index_list = cuda.to_device(numpy.array(index_list), stream)
        index_gpu[blocknum, tpb, stream](d_bitmap_list, d_index_list, attr_num,
                                         attr_total, attr_mul)
        index_list = d_index_list.copy_to_host()
        stream.synchronize()
    search_end_time = time.time()
    return index_list, search_end_time - search_start_time
Example #13
0
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation 
    of forward-euler integration.
    
    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.
    
    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)` 
      grid_idcs: not used by this kernel
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        the numbapro CUDA libraries cannot be imported
    """

    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " + 
                        "installed.\nCan not use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    # Cast the BLAS scalars with the configured precision. Always casting
    # to float32 would truncate rho_inv and dX in 64-bit mode.
    one = calc_precision(1.0)
    zero = calc_precision(0.0)
    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # delta_phi = op(int_m) . phi
        cubl.gemv(trans='T', m=m, n=n, alpha=one, A=cu_int_m,
            x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # delta_phi += rho_inv[step] * op(dec_m) . phi
        cubl.gemv(trans='T', m=m, n=n, alpha=calc_precision(rho_inv[step]),
            A=cu_dec_m, x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # phi += dX[step] * delta_phi
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi,
            y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
Example #14
0
def main():
    """Compare four ``induced_velocity`` implementations and print timings.

    Runs the numpy version, the Numba version and two CUDA kernels on the
    same random input, reporting for each the wall-clock time, the largest
    absolute difference to the numpy result and the speedup over numpy.
    """
    vort = np.array(np.random.rand(2 * n), dtype=dtype).reshape((n, 2))
    gamma = np.array(np.random.rand(n), dtype=dtype)
    vel = np.zeros_like(vort)
    start = timer()
    induced_velocity(vort, vort, gamma, vel)
    numpy_time = timer() - start
    print("n = %d" % n)
    print("Numpy".center(40, "="))
    print("Time: %f seconds" % numpy_time)

    vel2 = np.zeros_like(vort)
    start = timer()
    induced_velocity2(vort, vort, gamma, vel2)
    numba_time = timer() - start
    print("Numba".center(40, "="))
    print("Time: %f seconds" % numba_time)
    error = np.max(np.abs(vel2 - vel))
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / numba_time))

    stream = cuda.stream()
    d_vort = cuda.to_device(vort, stream)
    d_gamma = cuda.to_device(gamma, stream)
    vel3 = np.zeros_like(vort)
    d_vel = cuda.to_device(vel3, stream)
    # One thread per vortex, rounded up to whole blocks.
    griddim = (n - 1) // blksize + 1
    start = timer()
    induced_velocity3[griddim, blksize, stream](d_vort, d_vort, d_gamma, d_vel)
    d_vel.to_host(stream)
    # Kernel and copy are only queued on the stream. Wait for them so the
    # timer covers the real work and vel3 holds the result before it is read.
    stream.synchronize()
    gpu_time = timer() - start
    error = np.max(np.abs(vel3 - vel))
    print("GPU".center(40, "="))
    print("Time: %f seconds" % gpu_time)
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / gpu_time))

    vel4 = np.zeros_like(vort)
    d_vel2 = cuda.to_device(vel4, stream)
    start = timer()
    induced_velocity4[griddim, blksize, stream](d_vort, d_vort, d_gamma,
                                                d_vel2)
    d_vel2.to_host(stream)
    # Same as above: drain the stream before timing and comparing.
    stream.synchronize()
    gpu2_time = timer() - start
    error = np.max(np.abs(vel4 - vel))
    print("GPU smem".center(40, "="))
    print("Time: %f seconds" % gpu2_time)
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / gpu2_time))
Example #15
0
def get_indexList(path, attr_selected):
    """Build the index list for the selected attribute values on the GPU.

    Loads the attribute map and the bitmap data from the two pickle files
    resolved by ``bitmap_pickle.get_pic_path``, translates the selected
    attribute values into their mapped numbers and launches ``index_gpu``.

    Args:
        path: Passed to ``bitmap_pickle.get_pic_path`` to locate the files.
        attr_selected: Per-attribute sequences of selected values; the
            value ``'All'`` is recorded as ``-1``.

    Returns:
        Tuple ``(index_list, elapsed_seconds)``. ``index_list`` is an int32
        numpy array; it is empty when the input is rejected.
    """
    path1, path2, attr_num = bitmap_pickle.get_pic_path(path)
    # NOTE: pickle.load can execute arbitrary code from the file - only
    # point this at files produced by this project.
    with open(path1, 'rb') as f1:  # read data_map.pkl
        attr_map = pickle.load(f1)
        pickle.load(f1)  # attr_list: unused, read to reach the next record
        attr_total = pickle.load(f1)

    with open(path2, 'rb') as f2:  # read bitmap_pic.pkl
        lists = pickle.load(f2)
        key = pickle.load(f2)
        offset = pickle.load(f2)

    # attr_input is a list that stores the numbers of input attributes
    # attr_num is the total number of attributes
    # attr_total is the total number of data/31
    # (Indented with spaces only: the former tab-indented section was a
    # TabError on Python 3 and silently nested in ``finally`` on Python 2.)
    attr_input = [[] for _ in xrange(attr_num)]
    for i in xrange(attr_num):
        for attri in attr_selected[i]:
            if attri in attr_map[i]:
                attr_input[i].append(attr_map[i][attri])
            elif attri == 'All':
                attr_input[i].append(-1)
        # 'All' is redundant next to explicit values.
        if len(attr_input[i]) > 1 and (-1 in attr_input[i]):
            attr_input[i].remove(-1)
    print(attr_input)

    search_start_time = time.time()
    # Defined up front so the rejection branch returns an empty result
    # instead of failing on an unbound name.
    index_list = numpy.zeros(0, dtype='int32')
    # NOTE(review): attr_input is built with exactly attr_num entries, so
    # this check can never fail as written - confirm the intended test.
    if len(attr_input) != attr_num:  # there might be a wrong input in input_test.py
        print('No eligible projects')
    else:
        tpb = 1024
        blocknum = 1
        # attr_mul is the number that each thread need to be performed
        attr_mul = (attr_total + (tpb * blocknum - 1)) // (tpb * blocknum)
        index_list = numpy.zeros(attr_total * 31, dtype='int32')
        bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key,
                               offset)
        stream = cuda.stream()
        d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream)
        d_index_list = cuda.to_device(numpy.array(index_list), stream)
        index_gpu[blocknum, tpb, stream](d_bitmap_list, d_index_list, attr_num,
                                         attr_total, attr_mul)
        index_list = d_index_list.copy_to_host()
        stream.synchronize()
    search_end_time = time.time()
    return index_list, search_end_time - search_start_time
Example #16
0
def main():
    """Time a 128 x 128 flow simulation that runs ``CudaU`` on the GPU.

    Every time step computes ``P`` with ``ppe`` on the host, launches the
    ``CudaU`` kernel and copies the ``U`` and ``V`` device arrays back.
    The elapsed time is printed at the end.
    """
    flowtime = 0.1
    nx, ny = 128, 128
    dx = 2.0 / (nx - 1)
    dy = 2.0 / (ny - 1)

    # ensures stability for a given mesh fineness
    dt = dx / 50

    rho = 1.0
    nu = .1

    # number of timesteps required to reach the specified total flowtime
    nt = int(flowtime / dt)

    field_shape = (nx, ny)
    U = numpy.zeros(field_shape, dtype=numpy.float32)
    U[-1, :] = 1
    V = numpy.zeros(field_shape, dtype=numpy.float32)
    P = numpy.zeros((ny, nx), dtype=numpy.float32)
    UN = numpy.zeros(field_shape, dtype=numpy.float32)
    VN = numpy.zeros(field_shape, dtype=numpy.float32)

    griddim = (nx, ny)
    # NOTE(review): 768 x 768 threads per block is far above the CUDA
    # limit of 1024 threads per block - confirm this launch is valid.
    blockdim = (768, 768, 1)

    t1 = time.time()
    # Target the GPU to begin calculation
    stream = cuda.stream()
    d_U = cuda.to_device(U, stream)
    d_V = cuda.to_device(V, stream)
    d_UN = cuda.to_device(UN, stream)
    d_VN = cuda.to_device(VN, stream)

    launch = CudaU[griddim, blockdim, stream]
    for i in range(nt):
        P = ppe(rho, dt, dx, dy, U, V, P)
        launch(d_U, d_V, P, d_UN, d_VN, dx, dy, dt, rho, nu)
        d_U.to_host(stream)
        d_V.to_host(stream)
        stream.synchronize()

    t2 = time.time()

    print("Completed grid of %d by %d in %.6f seconds" % (nx, ny, t2 - t1))
    x = numpy.linspace(0, 2, nx)
    y = numpy.linspace(0, 2, ny)
    Y, X = numpy.meshgrid(y, x)
def produce_fill(reduced_input_data, reduced_chunk_id, reduced_length):  # step 4
    """Run the head and fill kernels and return the chunk ids as int32.

    Args:
        reduced_input_data: Host array uploaded for ``produce_head``.
        reduced_chunk_id: Host array uploaded for ``produce_fill_gpu``;
            also passed to that kernel directly.
        reduced_length: Number of elements to process.

    Returns:
        A new int32 numpy array built from ``reduced_chunk_id`` after both
        kernels have run and the device copy has been copied back.
    """
    stream = cuda.stream()

    # Stage 1: produce head
    host_head = numpy.ones(reduced_length, dtype='int32')
    d_head = cuda.to_device(host_head, stream)
    d_input = cuda.to_device(reduced_input_data, stream)
    head_kernel = produce_head[1, tpb]
    head_kernel(d_input, d_head, reduced_length)
    d_head.to_host(stream)
    stream.synchronize()

    # Stage 2: produce fill, using the head flags still on the device
    d_chunk = cuda.to_device(reduced_chunk_id, stream)
    fill_kernel = produce_fill_gpu[1, tpb]
    fill_kernel(d_head, d_chunk, reduced_chunk_id, reduced_length)
    d_chunk.to_host(stream)
    stream.synchronize()

    # convert to int32 because the range a fill_word can describe is 0~(2^31-1)
    return numpy.array(reduced_chunk_id, dtype='int32')
def main():
    """Launch the ``VectorizedMonteCarlo`` kernel with fixed parameters.

    Configures a launch of 1024-thread blocks covering ``OPT_N`` threads,
    passes the hard-coded option parameters to the kernel and waits for
    the stream to finish. Nothing is returned.
    """
    OPT_N = 4000000
    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1
    stream = cuda.stream()

    ###### Initialize Parameters ######
    strike = 80
    t = 1
    expiry = 10
    spot = 105
    sigma = .3
    rate = .03
    dividend = 0
    # Alpha apparently measures performance compared to the projected performance
    alpha = .69

    # Earlier placeholder values (N = 10, M = 100) were overwritten before
    # use; only the effective values are kept.
    N = 100   # steps in time
    M = 2000  # number of simulations

    ## TODO: Figure out what to set dt to
    dt = 1
    Vbar = 0.02
    xi = .025

    sigma2 = sigma**2
    tau = expiry - t

    VectorizedMonteCarlo[griddim, blockdim, stream](
        spot, rate, sigma, expiry, N, M, strike, sigma2, Vbar, dt, xi,
        alpha, dividend, tau)
    stream.synchronize()
Example #19
0
 def __init__(self, gpuID=None, stream=None):
     """Select the GPU, set up the stream and the cuBLAS handle.

     Args:
         gpuID: Index of the device to select, or ``None`` to keep the
             current device.
         stream: Existing CUDA stream to reuse, or ``None`` to create one.

     Raises:
         ValueError: if ``gpuID`` is not a valid index into the device list.
     """
     if gpuID is not None:
         device_count = len(cuda.list_devices())
         if not (gpuID < device_count and gpuID >= 0):
             raise ValueError('GPU ID not found')
         # Close the current context before switching device.
         cuda.close()
         cuda.select_device(gpuID)

     if stream is not None:
         assert isinstance(stream, numba.cuda.cudadrv.driver.Stream)
         self.stream = stream
     else:
         self.stream = cuda.stream()

     self.blas = numbapro.cudalib.cublas.Blas(stream=self.stream)
     # Block sizes stored for later 1-D and 2-D use.
     self.blockdim = 32
     self.blockdim2 = (32, 32)
Example #20
0
def main():
    """Time a 128 x 128 flow simulation that runs ``CudaU`` on the GPU.

    Every time step computes ``P`` with ``ppe`` on the host, launches the
    ``CudaU`` kernel and copies the ``U`` and ``V`` device arrays back.
    The elapsed time is printed at the end.
    """
    flowtime = 0.1
    nx, ny = 128, 128
    dx = 2.0/(nx-1)
    dy = 2.0/(ny-1)

    # ensures stability for a given mesh fineness
    dt = dx/50

    rho = 1.0
    nu = .1

    # number of timesteps required to reach the specified total flowtime
    nt = int(flowtime/dt)

    field_shape = (nx, ny)
    U = numpy.zeros(field_shape, dtype=numpy.float32)
    U[-1, :] = 1
    V = numpy.zeros(field_shape, dtype=numpy.float32)
    P = numpy.zeros((ny, nx), dtype=numpy.float32)
    UN = numpy.zeros(field_shape, dtype=numpy.float32)
    VN = numpy.zeros(field_shape, dtype=numpy.float32)

    griddim = (nx, ny)
    # NOTE(review): 768 x 768 threads per block is far above the CUDA
    # limit of 1024 threads per block - confirm this launch is valid.
    blockdim = (768, 768, 1)

    t1 = time.time()
    # Target the GPU to begin calculation
    stream = cuda.stream()
    d_U = cuda.to_device(U, stream)
    d_V = cuda.to_device(V, stream)
    d_UN = cuda.to_device(UN, stream)
    d_VN = cuda.to_device(VN, stream)

    launch = CudaU[griddim, blockdim, stream]
    for i in range(nt):
        P = ppe(rho, dt, dx, dy, U, V, P)
        launch(d_U, d_V, P, d_UN, d_VN, dx, dy, dt, rho, nu)
        d_U.to_host(stream)
        d_V.to_host(stream)
        stream.synchronize()

    t2 = time.time()

    print("Completed grid of %d by %d in %.6f seconds" % (nx, ny, t2-t1))
    x = numpy.linspace(0, 2, nx)
    y = numpy.linspace(0, 2, ny)
    Y, X = numpy.meshgrid(y, x)
Example #21
0
def spca_simpler(Vd, epsilon=0.1, d=3, k=10):
    """Search random directions for the best ``k``-sparse vector.

    Draws ``ceil((4 / epsilon) ** d)`` random ``d``-dimensional directions,
    projects ``Vd`` onto each normalised direction and keeps the candidate
    whose selected ``k`` entries have the largest norm.

    Args:
        Vd: 2-D array with ``d`` columns; one row per coordinate.
        epsilon: Accuracy parameter controlling the number of samples.
        d: Dimension of the sampled directions.
        k: Number of entries selected per candidate.

    Returns:
        Array of shape ``(Vd.shape[0], 1)`` holding the best candidate
        (all zeros if no candidate was accepted).
    """
    p = Vd.shape[0]
    numSamples = int(math.ceil((4. / epsilon)**d))
    print(numSamples)

    best_x = np.zeros((p, 1))
    best_val = -np.inf

    # Prepare CUDA
    prng = curand.PRNG()
    custr = cuda.stream()

    # All random directions are generated up front, one per column.
    C = np.empty((d, numSamples), dtype=float_dtype)
    prng.normal(C.ravel(), mean=0, sigma=1)

    sorter = RadixSort(maxcount=Vd.shape[0],
                       dtype=Vd.dtype,
                       stream=custr,
                       descending=True)

    for col in range(numSamples):
        direction = C[:, col:col + 1]
        direction = direction / np.linalg.norm(direction)
        a = Vd.dot(direction)

        # NOTE(review): a[:k] is read right after this call, so argselect
        # presumably reorders ``a`` in place so that the selected values
        # come first - confirm against RadixSort.
        I = sorter.argselect(k, a[:, 0])

        val = np.linalg.norm(a[:k])  # index to get k largest

        if not (val > best_val):
            continue
        best_val = val
        best_x = np.zeros((p, 1), dtype=float_dtype)
        best_x[I] = a[:k] / val

    return best_x
Example #22
0
def produce_fill(reduced_input_data, reduced_chunk_id,
                 reduced_length):  # step 4
    """Run the head and fill kernels and return the chunk ids as int32.

    Args:
        reduced_input_data: Host array uploaded for ``produce_head``.
        reduced_chunk_id: Host array uploaded for ``produce_fill_gpu``;
            also passed to that kernel directly.
        reduced_length: Number of elements to process.

    Returns:
        A new int32 numpy array built from ``reduced_chunk_id`` after both
        kernels have run and the device copy has been copied back.
    """
    stream = cuda.stream()

    # Stage 1: produce head
    host_head = numpy.ones(reduced_length, dtype='int32')
    d_head = cuda.to_device(host_head, stream)
    d_input = cuda.to_device(reduced_input_data, stream)
    head_kernel = produce_head[1, tpb]
    head_kernel(d_input, d_head, reduced_length)
    d_head.to_host(stream)
    stream.synchronize()

    # Stage 2: produce fill, using the head flags still on the device
    d_chunk = cuda.to_device(reduced_chunk_id, stream)
    fill_kernel = produce_fill_gpu[1, tpb]
    fill_kernel(d_head, d_chunk, reduced_chunk_id, reduced_length)
    d_chunk.to_host(stream)
    stream.synchronize()

    # convert to int32 because the range a fill_word can describe is 0~(2^31-1)
    return numpy.array(reduced_chunk_id, dtype='int32')
Example #23
0
def spca_simpler(Vd, epsilon=0.1, d=3, k=10):
    """Search random directions for the best ``k``-sparse vector.

    Draws ``ceil((4 / epsilon) ** d)`` random ``d``-dimensional directions,
    projects ``Vd`` onto each normalised direction and keeps the candidate
    whose selected ``k`` entries have the largest norm.

    Args:
        Vd: 2-D array with ``d`` columns; one row per coordinate.
        epsilon: Accuracy parameter controlling the number of samples.
        d: Dimension of the sampled directions.
        k: Number of entries selected per candidate.

    Returns:
        Array of shape ``(Vd.shape[0], 1)`` holding the best candidate
        (all zeros if no candidate was accepted).
    """
    p = Vd.shape[0]
    numSamples = int(math.ceil((4. / epsilon) ** d))
    print(numSamples)

    best_x = np.zeros((p, 1))
    best_val = -np.inf

    # Prepare CUDA
    prng = curand.PRNG()
    custr = cuda.stream()

    # All random directions are generated up front, one per column.
    C = np.empty((d, numSamples), dtype=float_dtype)
    prng.normal(C.ravel(), mean=0, sigma=1)

    sorter = RadixSort(maxcount=Vd.shape[0], dtype=Vd.dtype, stream=custr,
                       descending=True)

    for col in range(numSamples):
        direction = C[:, col:col + 1]
        direction = direction / np.linalg.norm(direction)
        a = Vd.dot(direction)

        # NOTE(review): a[:k] is read right after this call, so argselect
        # presumably reorders ``a`` in place so that the selected values
        # come first - confirm against RadixSort.
        I = sorter.argselect(k, a[:, 0])

        val = np.linalg.norm(a[:k])  # index to get k largest

        if not (val > best_val):
            continue
        best_val = val
        best_x = np.zeros((p, 1), dtype=float_dtype)
        best_x[I] = a[:k] / val

    return best_x
Example #24
0
    def compute_block(self):
        """Run ``cu_one_block`` for this block and return the host results.

        Uploads the proposals and omegas, launches the kernel with device
        uniforms and a device result buffer, then copies the results back.

        Returns:
            Array of shape ``(self.size, self.size)`` filled by the kernel.
        """
        side = self.size

        # Uniform random numbers generated directly on the device.
        device_uniforms = curand.uniform(size=N * N, device=True)
        host_results = zeros((side, side))

        stream = cuda.stream()
        device_proposals = cuda.to_device(self.host_proposals, stream=stream)
        device_omegas = cuda.to_device(self.host_omegas, stream=stream)
        device_results = cuda.device_array_like(host_results, stream=stream)

        kernel = cu_one_block[self.grid_dim, self.threads_per_block, stream]
        kernel(self.start, device_proposals, device_omegas, device_uniforms,
               device_results, side, side)

        # Queue the copy back and wait for the whole stream to finish.
        device_results.copy_to_host(host_results, stream=stream)
        stream.synchronize()

        return host_results
def radix_sort(arr, rid):
    """Host-side driver loop of a GPU radix sort over ``arr``.

    For every pass ``i`` in ``range(len(bin(ATTR_CARD_MAX)))`` the loop
    uploads ``arr``, ``rid`` and two scratch lists, launches ``get_list``,
    reduces ``d_zero_list`` to a scalar ``base`` (per-block partial sums are
    added up on the host), calls ``Blelloch_scan_caller`` and finally launches
    ``array_adjust``.

    Args:
        arr: sequence of keys; this function only takes ``len(arr)`` and
            forwards it to the kernels.
        rid: companion array forwarded to ``array_adjust`` together with
            ``arr`` (presumably record ids that follow the keys -- confirm).

    Returns:
        None. NOTE(review): presumably ``array_adjust`` rewrites ``arr`` and
        ``rid`` in place; confirm against the kernel.
    """
    length = numpy.int64(len(arr))
    # NOTE: len(bin(x)) counts the leading '0b' prefix as well as the digits.
    bin_length = max(len(bin(length - 1)), len(
        bin(TPB_MAX - 1)))  # length of the binary string of the array length / TPB_MAX
    thread_num = numpy.int64(math.pow(2, bin_length))
    block_num = max(thread_num / TPB_MAX, 1)

    stream = cuda.stream()
    # Scratch buffers with one slot per launched thread.
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')

    # One pass per character of bin(ATTR_CARD_MAX) (again including '0b').
    iter_num = len(bin(ATTR_CARD_MAX))
    for i in range(iter_num):
        # Fresh device copies are made on every pass.
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list, stream)
        d_one_list = cuda.to_device(one_list, stream)
        # NOTE(review): the host array `arr` (not `d_arr`) is passed here.
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list,
                                     d_one_list)  # fill one_list and zero_list
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        stream.synchronize()

        # Reduce d_zero_list to one partial sum per block ...
        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num,
                      base_reduction_block_size](d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()
        # ... and finish the reduction on the host.
        base = 0  # base for the scan of one_list
        for j in xrange(base_reduction_block_num):
            base += tmp_out[j]

        Blelloch_scan_caller(d_zero_list, d_one_list, base)

        # Both the host arrays and their device copies are handed to the kernel.
        array_adjust[block_num,
                     TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list,
                              d_zero_list, d_one_list, length)
def block_increment(start, n):
    """Compute one ``n`` x ``n`` float32 block on GPU 0 and return it.

    Two device-side normal samples of length ``n`` are combined by
    ``cuda_div`` into ``a``; ``block_kernel`` then combines ``start``, ``a``
    and ``n * n`` device-side uniform samples into the result, which is
    copied back to the host.

    Returns:
        The ``(n, n)`` float32 host array filled by the device-to-host copy.
    """
    cuda.select_device(0)
    queue = cuda.stream()

    threads = 256
    blocks = n // 256 + 1

    result = np.zeros((n, n), dtype=np.float32)

    # Inputs of the first kernel are generated directly on the device.
    d_numer = curand.normal(0, 1, n, dtype=np.float32, device=True)
    d_denom = curand.normal(0, 1, n, dtype=np.float32, device=True)
    d_a = cuda.device_array_like(np.zeros(n, dtype=np.float32))
    cuda_div[blocks, threads, queue](d_numer, d_denom, d_a, n)

    # d_a stays on the device for the second kernel, so it is never read
    # back here. What happens to d_numer / d_denom afterwards is not visible
    # from this function; the original author's best guess is that Python
    # garbage collection leads to deallocation on the device.
    d_uniform = curand.uniform((n * n), dtype=np.float32, device=True)
    d_result = cuda.device_array_like(result, queue)
    block_kernel[blocks, threads, queue](start, n, d_a, d_uniform, d_result)

    d_result.copy_to_host(result, queue)
    queue.synchronize()
    return result
def mc_cuda(paths, dt, interest, volatility):
    """Advance every path one column at a time using the ``step_cuda`` kernel.

    Args:
        paths: 2-D host array; column 0 is uploaded as the initial prices and
            columns 1.. are overwritten in place with the kernel output.
        dt: time step forwarded to the kernel; also used for ``sqrt(dt)``.
        interest: rate used in the constant ``interest - 0.5 * volatility**2``.
        volatility: volatility used in both kernel constants.

    Returns:
        None; results are written into ``paths``.
    """
    num_paths = paths.shape[0]

    threads = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    blocks = int(math.ceil(float(num_paths) / threads))

    # All asynchronous commands below are queued on this stream.
    queue = cuda.stream()
    rng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Device buffer that receives the gaussian noise of each step.
    d_noise = cuda.device_array(num_paths, dtype=np.double, stream=queue)

    drift = interest - 0.5 * volatility ** 2
    diffusion = volatility * math.sqrt(dt)

    # Equivalent of CUDA-C's step_cuda<<<blocks, threads, 0, queue>>>
    advance = step_cuda[blocks, threads, queue]

    d_previous = cuda.to_device(paths[:, 0], stream=queue)
    for column in range(1, paths.shape[1]):
        target = paths[:, column]
        # Fresh gaussian noise for this step.
        rng.normal(d_noise, mean=0, sigma=1)
        # Uninitialised device buffer shaped like the destination column.
        d_current = cuda.device_array_like(target, stream=queue)
        advance(d_previous, d_current, dt, drift, diffusion, d_noise)
        # Queue the download of the new prices into the host column.
        d_current.copy_to_host(target, stream=queue)
        d_previous = d_current

    # Block until every queued command has finished.
    queue.synchronize()
Example #28
0
def score_sequence(seq, pssm, verbose = False, keep_strands = True, benchmark = False, blocks_per_grid = -1, threads_per_block = -1):
    """
    This function will score a sequence of nucleotides based on a PSSM by using
    a sliding window parallelized on a GPU.
    
    Args:
        seq: This must be an integer representation of the nucleotide sequence,
            where the alphabet is (A = 0, C = 1, G = 2, T = 3). It must be a 
            vector (1D array) of integers that can be cast to int32 (See: 
            numpy.int32).
        pssm: This must be a vectorized PSSM where every four elements
            correspond to one position. Make sure this can be cast to an array
            of float64.
        verbose: Set this to True to print performance information.
        benchmark: If set to True, the function will return information about
            the run in a dictionary as the last output variable.
        keep_strands: Whether memory should be allocated for storing which
            strand the scores come from. Set this to False if you just want the
            scores and the strands array will not be returned.
            NOTE: If this and benchmark are set to False, then the scores will
            not be returned in a tuple, meaning:
                >>> scores = score_sequence(seq, pssm, keep_strands=False)
        blocks_per_grid: This is the blocks per grid that will be assigned to 
            the CUDA kernel. See this SO question for info on choosing this
            value: http://stackoverflow.com/questions/4391162/cuda-determining-threads-per-block-blocks-per-grid
            Set this to zero or a negative number (the default) to use
            ceil(n / threads_per_block), capped at the maximum grid dimension
            reported by the GPU.
        threads_per_block: Threads per block. See above. Set this to zero or a
            negative number (the default) to use 55% of the maximum block
            dimension reported by the GPU, a value determined experimentally.
            Higher values will likely result in failure to allocate resources
            to the kernel (since there will not be enough register space for
            each thread).
            The two launch parameters are defaulted independently: giving only
            one of them keeps that value and derives the other.
        
    Returns:
        scores: 1D float64 array of length (n - w + 1), where n is the length
            of the sequence and w is the window size. The value at index i of
            this array corresponds to the score of the n-mer at position i in 
            the sequence.
        strands: 1D int32 array of length (n - w + 1). The value at position i
            is either 0 or 1 corresponding to the strand of the score at that
            position where 0 means the forward strand and 1 means reverse.
            Only returned when keep_strands is True.
        run_info: This is a dictionary that is returned if the benchmark
            parameter is set to True. It contains the following:
            >>> run_info.keys()
            ['memory_used', 'genome_size', 'runtime', 'threads_per_block', 'blocks_per_grid']
            Note that the memory_used is rather misleading if running the
            function more than once. CUDA is optimized to not transfer the same
            data from the host to the device so it will not always change. It
            may also unload other assets from memory, so the memory changed can
            be negative.
            TODO: Find a better method of calculating memory usage.
            
    Example:
        >>> pssm = np.random.uniform(-7.5, 2.0, 4 * 16) # Window size of 16
        >>> seq = np.random.randint(0, 4, int(30e6)) # Generate random 30 million bp sequence
        >>> scores, strands, run_info = score_sequence(seq, pssm, benchmark=True, verbose=True)
        Threads per block = 563
        Blocks per grid = 53286
        Total threads = 30000018
        Scoring... Done.
        Genome size: 3e+07 bp
        Time: 605.78 ms
        Speed: 4.95229e+07 bp/sec
        >>> scores
        array([-16.97089798, -33.48925866, -21.80381526, ..., -10.27919401,
               -32.64575614, -23.97110103])
        >>> strands
        array([1, 1, 1, ..., 1, 1, 0])
        >>> run_info
        {'memory_used': 426508288L, 'genome_size': 30000000, 'runtime': 0.28268090518054123, 'threads_per_block': 563, 'blocks_per_grid': 53286}
        
    A more interesting interpretation of the run information for performance 
    analysis is the number of bases scored per second:
        >>> print("%g bases/sec" % (run_info["genome_size"] / run_info["runtime"]))
        1.06127e+08 bases/sec
    """
    w = int(pssm.size // 4) # width of PSSM
    n = int(seq.size) # length of the sequence being scored
    
    # Calculate the reverse-complement of the PSSM.
    # Floor division keeps the index an integer on Python 3 as well.
    # NOTE(review): the index is (position + complemented base), not
    # (4 * position + complemented base); confirm this matches the intended
    # PSSM layout before relying on reverse-strand scores.
    pssm_r = np.array([pssm[i // 4 + (3 - (i % 4))] for i in range(pssm.size)][::-1])

    # Calculate the appropriate threads per block and blocks per grid.
    # Each value is defaulted on its own so that a caller-supplied value for
    # one of them is not thrown away when the other is left at its default.
    if threads_per_block <= 0:
        # We don't use the max number of threads to avoid running out of
        # register space by saturating the streaming multiprocessors
        # ~55% was found empirically, but your mileage may vary with different GPUs
        threads_per_block = int(cuda.get_current_device().MAX_BLOCK_DIM_X * 0.55)

    if blocks_per_grid <= 0:
        # We saturate our grid and let the dynamic scheduler assign the blocks
        # to the discrete CUDA cores/streaming multiprocessors
        blocks_per_grid = int(math.ceil(float(n) / threads_per_block))
        max_grid = cuda.get_current_device().MAX_GRID_DIM_X
        if blocks_per_grid > max_grid:
            blocks_per_grid = max_grid
    
    if verbose:
        print("Threads per block = %d" % threads_per_block)
        print("Blocks per grid = %d" % blocks_per_grid)
        print("Total threads = %d" % (threads_per_block * blocks_per_grid))
    
    # Collect benchmarking info
    s = default_timer()
    start_mem = cuda.current_context().get_memory_info()[0]
    
    # Start a stream
    stream = cuda.stream()
    
    # Copy data to device
    d_pssm = cuda.to_device(pssm.astype(np.float64), stream)
    d_pssm_r = cuda.to_device(pssm_r.astype(np.float64), stream)
    d_seq = cuda.to_device(seq.astype(np.int32), stream)
    
    # Allocate memory on device to store results
    d_scores = cuda.device_array(n - w + 1, dtype=np.float64, stream=stream)
    if keep_strands:
        d_strands = cuda.device_array(n - w + 1, dtype=np.int32, stream=stream)
        
    # Run the kernel on the same stream as the transfers so that uploads,
    # kernel and downloads are ordered by the stream itself.
    if keep_strands:
        cuda_score[blocks_per_grid, threads_per_block, stream](d_pssm, d_pssm_r, d_seq, d_scores, d_strands)
    else:
        cuda_score_without_strands[blocks_per_grid, threads_per_block, stream](d_pssm, d_pssm_r, d_seq, d_scores)
    
    # Copy results back to host
    scores = d_scores.copy_to_host(stream=stream)
    if keep_strands:
        strands = d_strands.copy_to_host(stream=stream)
    stream.synchronize()
    
    # Collect benchmarking info
    end_mem = cuda.current_context().get_memory_info()[0]
    t = default_timer() - s
    
    # Output info on the run if verbose parameter is true
    if verbose:
        print("Genome size: %g bp" % n)
        print("Time: %.2f ms (using time.%s())" % (t * 1000, default_timer.__name__))
        print("Speed: %g bp/sec" % (n / t))
        print("Global memory: %d bytes used (%.2f%% of total)" %
              (start_mem - end_mem, float(start_mem - end_mem) * 100 / cuda.get_current_device().get_context().get_memory_info()[1]))
    
    # Return the run information for benchmarking
    run_info = {"genome_size": n, "runtime": t, "memory_used": start_mem - end_mem,
                "blocks_per_grid": blocks_per_grid, "threads_per_block": threads_per_block}
                
    # The shape of the return value depends on keep_strands and benchmark
    # (see the docstring); this is kept for backward compatibility.
    if keep_strands:
        if benchmark:
            return (scores, strands, run_info)
        else:
            return (scores, strands)
    else:
        if benchmark:
            return (scores, run_info)
        else:
            # Careful! This won't return a tuple, so you don't need to do
            # score_sequence(...)[0] to get the scores
            return scores
def get_trials(params, n_rep=100000):
    '''
    Generates n_rep facilitation curves for the Go response on the GPU,
    together with a tonic inhibition level for every simulated trial.
    
    Parameters
    -------------
    params : sequence (2,) of float
        pre_t_mean - average start time before target presentation
        pre_t_sd - standard deviation of start time before target
    n_rep : int
        number of simulated trials

    Returns
    --------
    fac_i_parallel : array (n_rep, 600) of float32
        facilitation curves for all simulated trials, as filled by
        get_fac_cuda
    inhib_tonic_parallel : array (n_rep, 600)
        one normally distributed inhibition value per trial, repeated along
        the time axis
    t : array (600,) of float32
        sequence of time index

    When the module-level flag PAR_TEST is set, timing and comparison
    information against a serial reference array is printed as well.
    '''
    pre_t_mean, pre_t_sd = params 
    k_facGo = 0.004
    tau_facGo = 1.69
    inhib_mean = 1.57
    inhib_sd = 0.31
    t = np.linspace(-.4, .2, 600, endpoint=False, dtype=np.float32)  
    pre_t = np.array(np.random.normal(pre_t_mean, pre_t_sd, size=n_rep), dtype=np.float32)
    fac_i_parallel = np.zeros((n_rep, t.size), dtype=np.float32)
    inhib_tonic_parallel = np.zeros((n_rep, t.size))    
    inhib_parallel = np.random.normal(inhib_mean, inhib_sd, size=n_rep)
    # Broadcast the per-trial value along the time axis.
    inhib_tonic_parallel += inhib_parallel[:,np.newaxis]
   
    if PAR_TEST:
        fac_i = np.zeros((n_rep, t.size), dtype=np.float32) 
        
        t_start = time()
        # The serial reference calls are currently disabled, so this loop only
        # measures loop overhead and fac_i stays all zeros.
        for i in range(n_rep):  # for each simulated trial
            myparams = pre_t[i] #k_facGo, tau_facGo, 
            #fac_i[i] = get_fac(t, myparams) 
            #fac_i[i] = fast.get_fac(t, myparams) 
        t_end = time()  
        s_time = t_end - t_start
        print("Serial time: %.3f s" % s_time)

    # Used for testing get_fac_parallel, it will fill the array fac_i_parallel
    #get_fac_parallel(fac_i_parallel, n_rep, t, len(t), k_facGo, tau_facGo, pre_t)

    # Setup CUDA variables
    tpb_x = 8 # threads per block in x dimension
    tpb_y = 8 # threads per block in y dimension
    block_dim = tpb_x, tpb_y
    bpg_x = int(n_rep / tpb_x) + 1 # block grid x dimension
    bpg_y = int(t.size / tpb_y) + 1 # block grid y dimension
    grid_dim = bpg_x, bpg_y

    t_start = time()
    stream = cuda.stream()
    with stream.auto_synchronize():
        d_fac = cuda.to_device(fac_i_parallel, stream)
        d_t = cuda.to_device(t, stream)
        d_pre_t = cuda.to_device(pre_t, stream)
        #d_inhib_tonic = cuda.to_device(inhib_tonic_parallel, stream)        
        print("CUDA kernel: Block dim: ({tx}, {ty}), Grid dim: ({gx}, {gy})".format(tx=tpb_x, ty=tpb_y, gx=bpg_x, gy=bpg_y))
        # Hand the kernel the device copies uploaded above (instead of the
        # host arrays, which would be transferred a second time) and launch it
        # on the same stream as the transfers.
        get_fac_cuda[grid_dim, block_dim, stream](d_fac, n_rep, d_t, len(t), k_facGo, tau_facGo, d_pre_t)
        #get_inhib_tonic_cuda[]
        d_fac.to_host(stream)
    t_end = time()  
    c_time = t_end - t_start
    print("CUDA time: %.3f s" % c_time)

    if PAR_TEST:
        print("Difference between fac_i and fac_i_parallel")
        print(fac_i - fac_i_parallel)
        print("Close enough?  %s" % np.allclose(fac_i, fac_i_parallel, rtol=0, atol=1e-05))
        print("Speed up: %.3f x" % (s_time / c_time))

    return fac_i_parallel, inhib_tonic_parallel, t 
Example #30
0
# Regular (non-pinned) transfer: upload src, run the copy kernel, download
# into dst. The start time `ts` is taken before this point.
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()

print 'regular', te - ts

# Drop the device arrays before the second measurement.
del d_src, d_dst

# The kernel is expected to reproduce src in dst.
assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer

# src and dst are pinned for the duration of the with-block.
with cuda.pinned(src, dst):
    ts = timer()
    stream = cuda.stream()  # use stream to trigger async memory transfer
    d_src = cuda.to_device(src, stream=stream)
    d_dst = cuda.device_array_like(dst, stream=stream)

    copy_kernel(d_src, out=d_dst, stream=stream)

    d_dst.copy_to_host(dst, stream=stream)
    # Wait for the queued work before stopping the timer.
    stream.synchronize()
    te = timer()
    print 'pinned', te - ts

assert np.allclose(dst, src)
Example #31
0
def main(*args):
    """Time the CUDA Black-Scholes kernel on inputs generated with cuRAND.

    Args:
        *args: optional command-line style arguments. When at least two are
            given, the first one is used as the iteration count.
            NOTE(review): a single argument is ignored by the ``>= 2`` test;
            confirm how the caller slices ``sys.argv`` before changing it.

    Prints the average time per iteration in milliseconds, with the one-off
    setup time charged to every iteration.
    """
    OPT_N = 4000000
    iterations = 10

    if len(args) >= 2:
        iterations = int(args[0])

    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    # Use cuRand to generate random numbers directly on the gpu
    # to avoid memory transfers.
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)

    time0 = time.time()

    # malloc
    d_stockPrice = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionStrike = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionYears = cuda.device_array(shape=(OPT_N), dtype=np.float32)

    # Base distribution
    prng.uniform(d_stockPrice)
    prng.uniform(d_optionStrike)
    prng.uniform(d_optionYears)

    stream = cuda.stream()

    cfg_distribute = c_distribute[griddim, blockdim, stream]

    # Each call is given the target range for one of the input arrays.
    cfg_distribute(d_stockPrice, 5.0, 30.0)
    cfg_distribute(d_optionStrike, 1.0, 100.0)
    cfg_distribute(d_optionYears, 0.25, 10.)

    stream.synchronize()

    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)

    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)

    time1 = time.time()

    # Preconfigure the kernel as it's called multiple times in a loop.
    cfg_black_scholes_cuda = black_scholes_cuda[griddim, blockdim, stream]

    for i in range(iterations):
        cfg_black_scholes_cuda(d_callResult, d_putResult, d_stockPrice,
                               d_optionStrike, d_optionYears, RISKFREE,
                               VOLATILITY)

        d_callResult.to_host(stream)
        d_putResult.to_host(stream)

        stream.synchronize()

    time2 = time.time()
    # Charge the setup time once per iteration. This used to be a hard-coded
    # 10 (the default iteration count), which skewed the average for any
    # other value of `iterations`.
    dt = (time1 - time0) * iterations + (time2 - time1)

    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))
def main (*args):
    """Benchmark Black-Scholes with NumPy, Numba and the CUDA kernel.

    Args:
        *args: optional command-line style arguments. When at least two are
            given, the first one is used as the iteration count.
            NOTE(review): a single argument is ignored by the ``>= 2`` test;
            confirm how the caller slices ``sys.argv`` before changing it.

    Prints the average time per iteration of each implementation and the
    relative L1 / maximum absolute error of the Numba and CUDA call prices
    against the NumPy result.
    """
    OPT_N = 4000000
    iterations = 10
    if len(args) >= 2:
        iterations = int(args[0])

    callResultNumpy = np.zeros(OPT_N)
    putResultNumpy = -np.ones(OPT_N)
    stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
    optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
    optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)
    callResultNumba = np.zeros(OPT_N)
    putResultNumba = -np.ones(OPT_N)
    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)

    # NumPy reference
    time0 = time.time()
    for i in range(iterations):
        black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                      optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numpy Time: %f msec" %
          ((1000 * (time1 - time0)) / iterations))

    # Numba
    time0 = time.time()
    for i in range(iterations):
        black_scholes_numba(callResultNumba, putResultNumba, stockPrice,
                            optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numba Time: %f msec" %
          ((1000 * (time1 - time0)) / iterations))

    # CUDA: time0..time1 covers the one-off uploads, time1..time2 the loop.
    time0 = time.time()
    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N)/blockdim[0])), 1
    stream = cuda.stream()
    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)
    d_stockPrice = cuda.to_device(stockPrice, stream)
    d_optionStrike = cuda.to_device(optionStrike, stream)
    d_optionYears = cuda.to_device(optionYears, stream)
    time1 = time.time()
    for i in range(iterations):
        black_scholes_cuda[griddim, blockdim, stream](
            d_callResult, d_putResult, d_stockPrice, d_optionStrike,
            d_optionYears, RISKFREE, VOLATILITY)
        d_callResult.to_host(stream)
        d_putResult.to_host(stream)
        stream.synchronize()
    time2 = time.time()
    # Charge the upload time once per iteration. This used to be a hard-coded
    # 10 (the default iteration count), which skewed the average for any
    # other value of `iterations`.
    dt = (time1 - time0) * iterations + (time2 - time1)
    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))

    delta = np.abs(callResultNumpy - callResultNumba)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm: %E" % L1norm)
    print("Max absolute error: %E" % delta.max())

    delta = np.abs(callResultNumpy - callResultNumbapro)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm (Numbapro): %E" % L1norm)
    print("Max absolute error (Numbapro): %E" % delta.max())
Example #33
0
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Randomised search for a k-sparse vector, with the sampling done on the GPU.

    ``ceil((4 / epsilon) ** d)`` random directions are processed in batches of
    at most ``maxSize``. For every batch the kernels below normalise the random
    directions, multiply them with ``Vd``, sort each resulting column together
    with its index column and compute one norm per sample from the last ``k``
    rows. The host keeps the sample with the largest norm seen so far.

    Args:
        Vd: 2-D host array with ``p = Vd.shape[0]`` rows
            (presumably ``p x d`` -- confirm against ``batch_matmul``).
        epsilon: controls the number of samples, ``(4 / epsilon) ** d``.
        d: number of rows of the random sample matrix.
        k: number of trailing rows kept after sorting, i.e. the number of
            entries of ``opt_x`` that are written.

    Returns:
        ``opt_x``, a ``(p, 1)`` array that is zero except at the ``k`` indices
        of the best sample, where it holds that sample's values divided by
        their norm.
    """
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon) ** d))
    print(initNumSamples)
    # Upper bound on the number of samples processed per batch.
    maxSize = 6400

    ##actual algorithm
    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()

    # sorter = RadixSort(maxcount=Vd.shape[0], dtype=Vd.dtype, stream=custr,
    #                    descending=True)

    prng = curand.PRNG(stream=custr)
    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Prepare storage for vector A
        # print(Vd.dtype)
        # print('dA', (Vd.shape[0], numSamples))
        # print('dI', (k, numSamples))

        # One column per sample; dI has the same shape as dA.
        dA = cuda.device_array(shape=(Vd.shape[0], numSamples), order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples), order='F',
                               dtype=Vd.dtype)

        # Generate all random samples of the batch up front.
        # Also do normalization on the device
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)

        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)
        #C = dC.copy_to_host()

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel can be easily scaled due to the use of num of samples
        #       as the ncta
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominating the time
        nn = Vd.shape[0]
        # Segment boundaries nn, 2*nn, ..., (numSamples - 1) * nn.
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * nn
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz), divup(dI.shape[1], blksz)),
                     (blksz, blksz), custr](dI)
        # Presumably sorts every column of dA and permutes dI alongside -- confirm.
        segmented_sort(dA, dI, segments, stream=custr)

        # async_dA = dA.bind(custr)
        # async_dI = dI.bind(custr)

        # selnext = sorter.batch_argselect(dtype=dA.dtype,
        #                                  count=dA.shape[0],
        #                                  k=k,
        #                                  reverse=True)
        # for i in range(numSamples):
        #     dIi = selnext(async_dA[:, i])
        #     async_dI[:, i].copy_to_device(dIi, stream=custr)

        # for i in range(numSamples):
        #     # radix_argselect(async_dA[:, i], k=k, stream=custr,
        #     #                 storeidx=async_dI[:, i])
        #     dIi = sorter.argselect(k, async_dA[:, i])
        #     async_dI[:, i].copy_to_device(dIi, stream=custr)



        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # batch_scatter_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, dI,
        #                                                              daInorm)

        # Keep only the last k rows of the values and of their indices.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)

        # Wait for the queued work before reading aInorm on the host.
        custr.synchronize()

        for i in xrange(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)

                # Only copy what we need
                Ik = dI[:, i].copy_to_host()
                aIk = dA[:, i].copy_to_host().reshape(k, 1)
                opt_x[Ik] = (aIk / val)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
Example #34
0
def spca(Vd, epsilon=0.1, d=3, k=10):
    """Randomised search for a k-sparse vector, with the sampling done on the GPU.

    ``int((4 / epsilon) ** d)`` random directions are processed in batches of
    at most ``maxSize``. For every batch the kernels below normalise the random
    directions, multiply them with ``Vd``, select ``k`` indices per sample and
    compute one norm per sample. The host keeps the sample with the largest
    norm seen so far.

    Args:
        Vd: 2-D host array with ``p = Vd.shape[0]`` rows
            (presumably ``p x d`` -- confirm against ``batch_matmul``).
        epsilon: controls the number of samples, ``(4 / epsilon) ** d``.
        d: number of rows of the random sample matrix.
        k: number of indices selected per sample.

    Returns:
        ``opt_x``, a ``(p, 1)`` array that is zero except at the ``k`` selected
        indices of the best sample, where it holds that sample's values divided
        by their norm.
    """
    p = Vd.shape[0]
    initNumSamples = int((4. / epsilon) ** d)

    # Upper bound on the number of samples processed per batch.
    maxSize = 32000

    ##actual algorithm
    opt_x = np.zeros((p, 1))
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Prepare storage for vector A
        dA = cuda.device_array(shape=(Vd.shape[0], numSamples), order='F')
        # NOTE(review): int16 can only hold values up to 32767; confirm that
        # the indices stored here never exceed that.
        dI = cuda.device_array(shape=(k, numSamples), dtype=np.int16, order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=np.float64)
        dC = cuda.device_array(shape=(d, numSamples), order='F')

        # Generate all random samples of the batch up front.
        # Also do normalization on the device
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)

        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)
        #C = dC.copy_to_host()

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel can be easily scaled due to the use of num of samples
        #       as the ncta
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominating the time
        # Launched with one block per sample and Vd.shape[0] threads per block.
        batch_k_selection[numSamples, Vd.shape[0], custr](dA, dI, k)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        batch_scatter_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, dI,
                                                                     daInorm)

        aInorm = daInorm.copy_to_host(stream=custr)

        # Wait for the queued work before reading aInorm on the host.
        custr.synchronize()

        for i in xrange(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)

                # Only copy what we need
                a = gpu_slice(dA, i).reshape(p, 1)
                Ik = gpu_slice(dI, i).reshape(k, 1)
                aIk = a[Ik]
                opt_x[Ik] = (aIk / val)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
def gaussian(method, params, n_rep=DEFAULT_TRIALS, compare=None, dtype=np.float32):
	"""
	Simulate n_rep Gaussian facilitation curves and time the chosen back end(s).

	method, compare can be : "original", "parallel_base", "cuda"
	dtype float32 is faster for GPU but has lower precision
	dtype float64 is faster for CPU

	params holds, in order, the mean and sd of a_facGo, b_facGo, c_facGo and
	of the tonic inhibition. When compare is given, both back ends are run on
	the same random draws and the process exits with status 1 if their
	results differ by more than 1e-05.

	Returns the curves produced for `method`, the tonic inhibition array and
	the time axis.
	"""
	a_facGo_mean, a_facGo_sd, b_facGo_mean, b_facGo_sd, c_facGo_mean, c_facGo_sd, inhib_mean, inhib_sd = params

	# time axis
	t = np.linspace(-.4, .2, 600, endpoint=False).astype(dtype)
	curve_shape = (n_rep, t.size)

	# one random draw per simulated trial for each curve parameter
	draw = np.random.normal
	a_facGo = draw(a_facGo_mean, a_facGo_sd, size=n_rep).astype(dtype)
	b_facGo = draw(b_facGo_mean, b_facGo_sd, size=n_rep).astype(dtype)
	c_facGo = draw(c_facGo_mean, c_facGo_sd, size=n_rep).astype(dtype)

	# per-trial inhibition level, repeated along the time axis
	inhib = draw(inhib_mean, inhib_sd, size=n_rep)
	inhib_tonic = np.zeros(curve_shape)
	inhib_tonic += inhib[:, np.newaxis]

	# one output buffer per back end that will run
	fac1 = np.zeros(curve_shape).astype(dtype)
	facs = [fac1]
	if compare:
		facs.append(np.zeros(curve_shape).astype(dtype))

	# run every requested back end and record its trials per second
	tps = [0, 0]
	for slot, backend in enumerate((method, compare)):
		if not backend:
			continue

		fac = facs[slot]
		t_start = time()

		if backend == "original":
			# one curve per simulated trial
			for trial in range(n_rep):
				fac[trial] = _gaussian_original(t, (a_facGo[trial], b_facGo[trial], c_facGo[trial]))
		elif backend == "parallel_base":
			_gaussian_parallel_base(fac, n_rep, t, len(t), a_facGo, b_facGo, c_facGo)
		elif backend == "cuda":
			# 8 x 8 threads per block, enough blocks to cover trials x time
			tpb_x = 8
			tpb_y = 8
			block_dim = tpb_x, tpb_y
			grid_dim = int(n_rep / tpb_x) + 1, int(t.size / tpb_y) + 1

			stream = cuda.stream()
			with stream.auto_synchronize():
				d_fac = cuda.to_device(fac, stream)
				d_t = cuda.to_device(t, stream)
				d_a_facGo = cuda.to_device(a_facGo, stream)
				d_b_facGo = cuda.to_device(b_facGo, stream)
				d_c_facGo = cuda.to_device(c_facGo, stream)
				# pick the kernel that matches the requested precision
				if dtype == np.float32:
					kernel = _gaussian_cuda32
				elif dtype == np.float64:
					kernel = _gaussian_cuda64
				else:
					print "Error: CUDA dtype must be np.float32 or np.float64"
					sys.exit(1)
				kernel[grid_dim, block_dim](d_fac, n_rep, d_t, len(t), d_a_facGo, d_b_facGo, d_c_facGo)
				d_fac.to_host(stream)

		tps[slot] = n_rep / (time() - t_start)

	# both back ends must agree
	if compare:
		close = np.allclose(facs[0], facs[1], rtol=0, atol=1e-05)
		if not close:
			print "ERROR: results from method '%s' are not the same as method '%s'" % (method, compare)
			sys.exit(1)

	# summary
	print "%s trials per second: %.0f" % (method, tps[0])
	if compare:
		print "%s trials per second: %.0f" % (compare, tps[1])
		print "Speed up: %.3f x" %(tps[0]/tps[1]) # method / compare
		print "Results close enough? ", close

	return fac1, inhib_tonic, t
def main():
    """Convolve an image with a 5x5 filter on the CPU and on the GPU and plot both.

    The image is read from the path in ``sys.argv[1]``; without an argument
    the bundled ``misc.lena()`` image is used. The CPU path uses
    ``fftconvolve``; the GPU path runs forward FFTs of the image and of the
    zero-padded filter on two streams, multiplies them in place and applies
    the inverse FFT. Both timings are printed.
    """
    # Build Filter
    laplacian = np.array([
        [-4, -1, 0, -1, -4],
        [-1, 2, 3, 2, -1],
        [0, 3, 4, 3, 0],
        [-1, 2, 3, 2, -1],
        [-4, -1, 0, -1, -4],
    ], dtype=np.float32)

    # Build Image
    try:
        image = ndimage.imread(sys.argv[1], flatten=True).astype(np.float32)
    except IndexError:
        image = misc.lena().astype(np.float32)

    print("Image size: %s" % (image.shape,))

    # The filter is placed in the top-left corner of an image-sized array.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    cpu_start = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode="same")
    cpu_stop = timer()
    print("CPU: %.2fs" % (cpu_stop - cpu_start))

    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print("kernel config: %s x %s" % (blockpergrid, threadperblock))

    # Create one throw-away plan first: this triggers the initialization of
    # the cuFFT system, which takes significant time for small datasets and
    # should not be part of the measurement.
    cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    gpu_start = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    image_stream = cuda.stream()
    response_stream = cuda.stream()

    image_plan = cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64, stream=image_stream)
    response_plan = cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64, stream=response_stream)

    # pagelock memory
    with cuda.pinned(image_complex, response_complex):

        # The transfer of response_complex can overlap with the forward FFT
        # on image_complex because they use different streams.
        d_image = cuda.to_device(image_complex, stream=image_stream)
        d_response = cuda.to_device(response_complex, stream=response_stream)

        image_plan.forward(d_image, out=d_image)
        response_plan.forward(d_response, out=d_response)

        # The product below needs the transformed response.
        response_stream.synchronize()

        mult_inplace[blockpergrid, threadperblock, image_stream](d_image, d_response)
        image_plan.inverse(d_image, out=d_image)

        # implicitly synchronizes the streams
        cvimage_gpu = d_image.copy_to_host().real / np.prod(image.shape)

    gpu_stop = timer()
    print("GPU: %.2fs" % (gpu_stop - gpu_start))

    # Plot the results side by side
    panels = (("CPU", cvimage_cpu), ("GPU", cvimage_gpu))
    for position, (title, picture) in enumerate(panels, 1):
        plt.subplot(1, 2, position)
        plt.title(title)
        plt.imshow(picture, cmap=plt.cm.gray)
        plt.axis("off")

    plt.show()
Example #37
0
def reduce_by_key(input_data, chunk_id, literal, length):
    """Collapse each run of equal keys into one row, OR-ing the run's literals.

    The GPU kernels ``produce_flag``, ``radix_sort.Blelloch_scan_caller`` and
    ``get_startPos`` are used to find where every run begins; the literals of
    each run are then OR-reduced with the ``or_reduction`` kernel.

    Parameters:
        input_data: host array of attribute values (presumably already
            sorted by the caller -- confirm).
        chunk_id: host array of chunk ids, parallel to ``input_data``.
        literal: host array of literal words, parallel to ``input_data``.
        length: kept for interface compatibility; the value is ignored and
            recomputed from ``len(input_data)``.

    Returns:
        ``(reduced_input_data, reduced_chunk_id, reduced_literal)`` where the
        first two are numpy arrays and the third is a Python list, one entry
        per run.

    The debug output of the original implementation is preserved.
    """
    length = numpy.int64(len(input_data))
    # bin() output includes the '0b' prefix, so this is the bit count plus 2.
    bin_length = max(len(bin(length - 1)), len(bin(tpb - 1)))
    thread_num = numpy.int64(math.pow(2, bin_length))
    # Floor division keeps the launch configuration integral on Python 2 and 3.
    block_num = max(thread_num // tpb, 1)

    flag = numpy.zeros(thread_num, dtype='int64')
    arg_useless = numpy.zeros(thread_num, dtype='int64')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    # NOTE: the original also uploaded ``literal`` here but never read the
    # device copy; that dead transfer has been dropped.

    produce_flag[block_num, tpb](input_data, d_chunk_id, length, d_flag)
    # Copy the raw flags back to ``flag`` before the scan rewrites d_flag.
    d_flag.to_host(stream)
    stream.synchronize()

    # -1 marks "no run starts here"; entries >= 0 are kept further down.
    start_pos = numpy.ones(length, dtype='int64') * (-1)

    radix_sort.Blelloch_scan_caller(d_flag, arg_useless, 0)

    d_start_pos = cuda.to_device(start_pos, stream)
    dd_flag = cuda.to_device(flag, stream)

    print('flag')
    print(flag[:length])

    get_startPos[(length - 1) // tpb + 1, tpb](dd_flag, d_flag, d_start_pos,
                                               length)
    d_start_pos.to_host(stream)
    stream.synchronize()

    # A list comprehension (rather than filter) so len() works on Python 3.
    group_starts = [pos for pos in start_pos if pos >= 0]
    reduced_length = len(group_starts)
    # Sentinel so that group_starts[i + 1] is the end of the last run.
    group_starts.append(length)

    reduced_input_data = []
    reduced_chunk_id = []
    reduced_literal = []

    for i in range(reduced_length):
        begin = group_starts[i]
        end = group_starts[i + 1]
        print('%s %s' % (begin, end))
        data_to_reduce = literal[begin:end]

        print(data_to_reduce)
        reduce_block_num = (len(data_to_reduce) - 1) // tpb + 1

        tmp_out = numpy.zeros(reduce_block_num, dtype='uint32')
        d_tmp_out = cuda.to_device(tmp_out, stream)

        or_reduction[reduce_block_num, tpb](numpy.array(data_to_reduce),
                                            d_tmp_out, len(data_to_reduce))

        d_tmp_out.to_host(stream)
        stream.synchronize()
        # Combine the per-block partial results on the host.
        result = 0x00000000
        for j in range(reduce_block_num):
            result |= tmp_out[j]

        reduced_input_data.append(input_data[begin])
        reduced_chunk_id.append(chunk_id[begin])
        reduced_literal.append(result)
    print('************!!!!!!!!!!!!!!!****************')

    return (numpy.array(reduced_input_data), numpy.array(reduced_chunk_id),
            reduced_literal)
Example #38
0
def main(*args):
    """Benchmark Black-Scholes pricing with NumPy, Numba and the CUDA kernel.

    Parameters:
        *args: optional positional arguments; when at least one is given,
            ``args[0]`` is converted with ``int()`` and used as the number of
            benchmark iterations (default 10).

    Prints the average time per iteration for each implementation, followed
    by the L1 norm and maximum absolute error of the Numba and CUDA call
    prices relative to the NumPy result.
    """
    OPT_N = 4000000
    iterations = 10
    # The original tested ``len(args) >= 2`` while reading args[0], so a
    # single iteration-count argument was silently ignored.
    if len(args) >= 1:
        iterations = int(args[0])

    callResultNumpy = np.zeros(OPT_N)
    putResultNumpy = -np.ones(OPT_N)
    stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
    optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
    optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)
    callResultNumba = np.zeros(OPT_N)
    putResultNumba = -np.ones(OPT_N)
    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)

    time0 = time.time()
    for i in range(iterations):
        black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                      optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numpy Time: %f msec" %
          ((1000 * (time1 - time0)) / iterations))

    time0 = time.time()
    for i in range(iterations):
        black_scholes_numba(callResultNumba, putResultNumba, stockPrice,
                            optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numba Time: %f msec" %
          ((1000 * (time1 - time0)) / iterations))

    # time0..time1 covers the one-off setup (stream creation and uploads);
    # time1..time2 covers the kernel launches and downloads.
    time0 = time.time()
    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1
    stream = cuda.stream()
    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)
    d_stockPrice = cuda.to_device(stockPrice, stream)
    d_optionStrike = cuda.to_device(optionStrike, stream)
    d_optionYears = cuda.to_device(optionYears, stream)
    time1 = time.time()
    for i in range(iterations):
        black_scholes_cuda[griddim, blockdim, stream](
            d_callResult, d_putResult, d_stockPrice, d_optionStrike,
            d_optionYears, RISKFREE, VOLATILITY)
        d_callResult.to_host(stream)
        d_putResult.to_host(stream)
        stream.synchronize()
    time2 = time.time()
    # Charge the setup cost once per iteration. The original multiplied by a
    # literal 10, which is only right for the default iteration count.
    dt = (time1 - time0) * iterations + (time2 - time1)
    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))

    delta = np.abs(callResultNumpy - callResultNumba)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm: %E" % L1norm)
    print("Max absolute error: %E" % delta.max())

    delta = np.abs(callResultNumpy - callResultNumbapro)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm (Numbapro): %E" % L1norm)
    print("Max absolute error (Numbapro): %E" % delta.max())
Example #39
0
def get_pic_path(path):
    """Build a bitmap index for every attribute in *path* and pickle it.

    The source file is opened through ``data_pickle.openfile``. For each
    attribute the values are sorted (step 1), chunk ids and literals are
    produced on the GPU (step 2), equal keys are reduced (step 3), fill words
    are produced (step 4) and the final index, offsets and key lengths are
    obtained from ``getIdx`` (steps 5 and 6). The three per-attribute lists
    are then dumped, in the order index / key length / offset, into
    ``bitmap_pic.pkl`` in the current working directory.

    Parameters:
        path: passed unchanged to ``data_pickle.openfile``.

    Returns:
        ``(data_pic_path, bitmap_pic_path, attr_num)``: the data pickle path
        reported by ``data_pickle.openfile``, the path of the written bitmap
        pickle, and the number of attributes indexed.
    """
    (attr_dict, attr_values, attr_value_NO, attr_list,
     data_pic_path) = data_pickle.openfile(path)

    attr_num = len(attr_list)
    lists = [[] for _ in range(attr_num)]
    key = [[] for _ in range(attr_num)]
    offset = [[] for _ in range(attr_num)]

    for idx in range(attr_num):
        input_data = numpy.array(attr_values[idx])
        length = input_data.shape[0]
        rid = numpy.arange(0, length)

        # step 1: sort the values, permuting the row ids alongside
        radix_sort.radix_sort(input_data, rid)

        cardinality = len(attr_value_NO[idx].items())
        literal = numpy.zeros(length, dtype='uint32')
        chunk_id = numpy.zeros(length, dtype='int64')

        # step 2: produce chunk_id and literal on the GPU
        stream = cuda.stream()
        d_chunk_id = cuda.to_device(chunk_id, stream)
        d_literal = cuda.to_device(literal, stream)
        # Floor division keeps the grid size integral on Python 2 and 3.
        produce_chId_lit_gpu[length // tpb + 1, tpb](rid, d_literal,
                                                     d_chunk_id, length)
        d_chunk_id.to_host(stream)
        d_literal.to_host(stream)
        stream.synchronize()

        # step 3: reduce by key (value, chunk_id)
        reduced_input_data, reduced_chunk_id, reduced_literal = reduce_by_key(
            input_data, chunk_id, literal, length)
        reduced_length = reduced_input_data.shape[0]  # number of rows

        # step 4: produce 0-fill words
        fill_word, head = produce_fill(reduced_input_data, reduced_chunk_id,
                                       reduced_length)

        # steps 5 & 6: interleave 0-fill words and literals (also removes
        # all-zero words) to get the index, offsets and key lengths
        out_index, offsets, key_length = getIdx(fill_word, reduced_literal,
                                                reduced_length, head,
                                                cardinality)

        lists[idx] = out_index
        key[idx] = key_length
        offset[idx] = offsets

    print('put index result into file: '.strip())
    start = time.time()
    bitmap_pic_path = 'bitmap_pic.pkl'
    # ``with`` closes the file even if one of the dumps raises.
    with open(bitmap_pic_path, 'wb') as f1:
        # The third argument is the pickle protocol; True is the integer 1.
        pickle.dump(lists, f1, True)
        pickle.dump(key, f1, True)
        pickle.dump(offset, f1, True)
    end = time.time()
    print(str(end - start))
    return data_pic_path, bitmap_pic_path, attr_num
Example #40
0
 def _update(d):
     """Update the velocity and pressure fields stored in ``d`` and return ``d``.

     The x-direction work is issued on ``stream1`` and the y-direction work
     on ``stream2``; the final sum of the two pressure components is issued
     on ``stream3``.

     Reads ``d['step']`` to select the source terms, reads the entries
     ``k_x``, ``k_y``, ``kappa``, ``spacing``, ``timestep``, ``density``,
     ``soundspeed``, ``abs_exp`` and ``source``, and overwrites
     ``d['field']['v_x']``, ``d['field']['v_y']``, ``d['field']['p']``,
     ``d['temp']['p_x']`` and ``d['temp']['p_y']``. The ``fft``/``ifft``
     calls receive ``d['temp'][...]`` entries as their second argument
     (presumably output buffers -- confirm against the helpers).

     NOTE(review): ``d['step']`` is read but not incremented here.
     """
     
     stream1 = cuda.stream()
     stream2 = cuda.stream()
     stream3 = cuda.stream()
     # NOTE(review): stream4 is created but never used below.
     stream4 = cuda.stream()
     
     step = d['step']
     
     #print "Step: {}".format(step)
     
     """Calculate the pressure gradient. Two steps are needed for this."""
     # Calculate FFT of pressure.
     fft(d['field']['p'], d['temp']['fft_p'], stream=stream1)    
     
     stream1.synchronize()
     #print "FFT pressure: {}".format(d['temp']['fft_p'].copy_to_host())
     
     #pressure_exponent_x = exp(pressure_gradient_exponent(d['k_x'], d['spacing'], stream=stream1), stream=stream1) # This is a constant!!
     #pressure_exponent_y = exp(pressure_gradient_exponent(d['k_y'], d['spacing'], stream=stream2), stream=stream2) # This is a constant!!
     
             
     #print(d['spacing'].shape)
     #print(d['k_x'].shape)
     
     # NOTE(review): this allocation is discarded; ``ex`` is reassigned a few
     # lines below.
     ex = cuda.device_array(shape=d['field']['p'].shape)
     
     # Debug output. The last print evaluates pressure_gradient_exponent one
     # extra time just to display the result.
     print(d['k_x'].shape)
     print(d['spacing'].shape)
     print(d['k_x'].dtype)
     print(d['spacing'].dtype)
     print(pressure_gradient_exponent(d['k_x'], d['spacing']))
     
     # These two calls are made without a stream argument (it is commented out).
     ex = pressure_gradient_exponent(d['k_x'], d['spacing'])#, stream=stream1)
     ey = pressure_gradient_exponent(d['k_y'], d['spacing'])#, stream=stream2)
     
     pressure_exponent_x = exp(ex, stream=stream1) # This is a constant!!
     pressure_exponent_y = exp(ey, stream=stream2) # This is a constant!!
     
     
     stream1.synchronize()
     stream2.synchronize()
     
     #print ( to_gradient(d['temp']['fft_p'], d['k_x'], d['kappa'], pressure_exponent_x) ).copy_to_host()
     
     """Calculate the velocity gradient."""
     ifft(to_gradient(d['temp']['fft_p'], d['k_x'], d['kappa'], pressure_exponent_x, stream=stream1), d['temp']['d_p_d_x'], stream=stream1)
     ifft(to_gradient(d['temp']['fft_p'], d['k_y'], d['kappa'], pressure_exponent_y, stream=stream2), d['temp']['d_p_d_y'], stream=stream2) 
     
     #print "Pressure gradient x: {}".format( d['temp']['d_p_d_x'].copy_to_host() )
     #print "Pressure gradient y: {}".format( d['temp']['d_p_d_y'].copy_to_host() )
     
     """Calculate the velocity."""
     d['field']['v_x'] = velocity_with_pml(d['field']['v_x'], d['temp']['d_p_d_x'], d['timestep'], d['density'], d['abs_exp']['x'], d['source']['v']['x'][step], stream=stream1)
     d['field']['v_y'] = velocity_with_pml(d['field']['v_y'], d['temp']['d_p_d_y'], d['timestep'], d['density'], d['abs_exp']['y'], d['source']['v']['y'][step], stream=stream2)
 
 
     stream1.synchronize()
     stream2.synchronize()
     
     """Fourier transform of the velocity."""
     fft(d['field']['v_x'], d['temp']['fft_v_x'], stream=stream1)
     fft(d['field']['v_y'], d['temp']['fft_v_y'], stream=stream2)
     
     stream1.synchronize()
     stream2.synchronize()
     
     
     #print d['temp']['fft_v_y'].copy_to_host()
     #print "Velocity x: {}".format(d['field']['v_x'].copy_to_host())
     #print "Velocity y: {}".format(d['field']['v_y'].copy_to_host())
     
     #print "Source: {}".format(d['source']['p'][step].copy_to_host())
     
     #print "Source: {}".format(d['source']['p'])
     
     
     #print "Velocity exponent y: {}".format(velocity_exponent_y.copy_to_host())
     
     # NOTE(review): no work is issued between the previous pair of
     # synchronize calls and this one.
     stream1.synchronize()
     stream2.synchronize()
     
     #stream3.synchronize()
     #stream4.synchronize()
     
     velocity_exponent_x = exp(velocity_gradient_exponent(d['k_x'], d['spacing'], stream=stream1), stream=stream1) # This is a constant!!
     velocity_exponent_y = exp(velocity_gradient_exponent(d['k_y'], d['spacing'], stream=stream2), stream=stream2) # This is a constant!!
     
     
     ifft(to_gradient(d['temp']['fft_v_x'], d['k_x'], d['kappa'], velocity_exponent_x, stream=stream1), d['temp']['d_v_d_x'], stream=stream1)
     ifft(to_gradient(d['temp']['fft_v_y'], d['k_y'], d['kappa'], velocity_exponent_y, stream=stream2), d['temp']['d_v_d_y'], stream=stream2)
 
     """And finally the pressure."""
     
     #print len([ d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step] ])
     #pressure_with_pml(  d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step]  )
     #for i in [ d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step] ]:
         #print i , i.shape
         #print i.copy_to_host()
         #try:
             #print i.dtype
         #except AttributeError:
             #print 'None'
     
     stream1.synchronize()
     stream2.synchronize()
     
     #print "Velocity gradient x: {}".format(d['temp']['d_v_d_x'].copy_to_host())
     #print "Velocity gradient y: {}".format(d['temp']['d_v_d_y'].copy_to_host())
     
     #print "Pressure x previous: {}".format(d['temp']['p_x'].copy_to_host())
     #print "Pressure y previous: {}".format(d['temp']['p_y'].copy_to_host())
 
     #print "Abs exp x: {}".format( d['abs_exp']['x'].copy_to_host())
     #print "Abs exp y: {}".format( d['abs_exp']['y'].copy_to_host())
     
     # The same pressure source term d['source']['p'][step] is passed to both
     # the x and the y component update.
     d['temp']['p_x'] = pressure_with_pml(d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step], stream=stream1)
     d['temp']['p_y'] = pressure_with_pml(d['temp']['p_y'], d['temp']['d_v_d_y'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['y'], d['source']['p'][step], stream=stream2)
 
     stream1.synchronize()
     stream2.synchronize()
     
     #try:
         #print "Source p: {}".format(d['source']['p'][step].copy_to_host())
     #except AttributeError:
         #print "Source p: {}".format(d['source']['p'][step])
         
     #print "Pressure x: {}".format(d['temp']['p_x'].copy_to_host())
     #print "Pressure y: {}".format(d['temp']['p_y'].copy_to_host())
 
     # Total pressure is the sum of the two components, issued on stream3.
     d['field']['p'] = add(d['temp']['p_x'], d['temp']['p_y'], stream=stream3)
     
     #stream3.synchronize()
     #print "Pressure total: {}".format(d['field']['p'].copy_to_host())
     
     
     # Wait for all three streams before handing ``d`` back to the caller.
     stream1.synchronize()
     stream2.synchronize()
     stream3.synchronize()
     
     return d
Example #41
0
# Regular transfer (no pinning, no stream): upload ``src``, run the copy
# kernel into ``d_dst`` and download the result into ``dst``.
# NOTE(review): ``src``, ``dst``, ``copy_kernel``, ``timer`` and the start
# time ``ts`` are defined before this fragment begins.
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()

print 'regular', te - ts

# Drop the device arrays before the second measurement.
del d_src, d_dst

assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer

with cuda.pinned(src, dst):
    ts = timer()
    stream = cuda.stream()  # use stream to trigger async memory transfer
    d_src = cuda.to_device(src, stream=stream)
    d_dst = cuda.device_array_like(dst, stream=stream)

    copy_kernel(d_src, out=d_dst, stream=stream)

    d_dst.copy_to_host(dst, stream=stream)
    # Wait for the queued upload, kernel and download before stopping the timer.
    stream.synchronize()
    te = timer()
    print 'pinned', te - ts

assert np.allclose(dst, src)
Example #42
0
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Search random sample vectors on the GPU and return the best sparse one.

    ``ceil((4 / epsilon) ** d)`` samples are processed in batches of at most
    ``maxSize`` (6400). Per batch, the device code fills ``dC`` with normal
    random numbers, normalises them, multiplies by ``Vd`` into ``dA``, sorts
    ``dA`` together with an index array ``dI``, keeps the last ``k`` rows and
    computes one norm per sample. The sample with the largest norm seen so
    far determines the result.

    Parameters:
        Vd: 2-D host array; ``Vd.shape[0]`` is the length ``p`` of the
            returned vector (presumably shape ``(p, d)`` -- TODO confirm).
        epsilon: controls the number of samples, ``ceil((4 / epsilon) ** d)``.
        d: number of rows of the random matrix ``dC`` and exponent in the
            sample count.
        k: number of trailing rows of the sorted ``dA``/``dI`` that are kept.

    Returns:
        ``opt_x``: a ``(p, 1)`` array of ``float_dtype`` that is zero except
        at the ``k`` selected indices of the best sample, where it holds the
        selected values divided by that sample's norm.
    """
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon)**d))
    print(initNumSamples)
    # Upper bound on the number of samples processed per batch.
    maxSize = 6400

    ##actual algorithm
    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()

    # sorter = RadixSort(maxcount=Vd.shape[0], dtype=Vd.dtype, stream=custr,
    #                    descending=True)

    prng = curand.PRNG(stream=custr)
    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Prepare storage for vector A
        # print(Vd.dtype)
        # print('dA', (Vd.shape[0], numSamples))
        # print('dI', (k, numSamples))

        dA = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples),
                               order='F',
                               dtype=Vd.dtype)

        #GENERATE ALL RANDOM SAMPLES BEFORE
        # Also do normalization on the device
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)

        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)
        #C = dC.copy_to_host()

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel can be easily scaled due to the use of num of samples
        #       as the ncta
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominanting the time
        nn = Vd.shape[0]
        # Segment boundaries at multiples of nn, i.e. one segment per column
        # of the column-major (order='F') dA.
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * nn
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz), divup(dI.shape[1], blksz)),
                     (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # async_dA = dA.bind(custr)
        # async_dI = dI.bind(custr)

        # selnext = sorter.batch_argselect(dtype=dA.dtype,
        #                                  count=dA.shape[0],
        #                                  k=k,
        #                                  reverse=True)
        # for i in range(numSamples):
        #     dIi = selnext(async_dA[:, i])
        #     async_dI[:, i].copy_to_device(dIi, stream=custr)

        # for i in range(numSamples):
        #     # radix_argselect(async_dA[:, i], k=k, stream=custr,
        #     #                 storeidx=async_dI[:, i])
        #     dIi = sorter.argselect(k, async_dA[:, i])
        #     async_dI[:, i].copy_to_device(dIi, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # batch_scatter_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, dI,
        #                                                              daInorm)

        # Keep only the last k rows of the sorted values and their indices.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)

        # Everything above was queued on custr; wait before reading aInorm.
        custr.synchronize()

        for i in xrange(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)

                # Only copy what we need
                Ik = dI[:, i].copy_to_host()
                aIk = dA[:, i].copy_to_host().reshape(k, 1)
                opt_x[Ik] = (aIk / val)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
def reduce_by_key(input_data, chunk_id, literal, length):
    """Collapse each run of equal keys into one row using GPU kernels.

    ``produce_flag``, ``radix_sort.Blelloch_scan_caller`` and
    ``get_startPos`` locate the start of every run; the ``get_reduced``
    kernel then fills the reduced value, chunk id and literal arrays.

    Parameters:
        input_data: host array of attribute values (presumably already
            sorted by the caller -- confirm).
        chunk_id: host array of chunk ids, parallel to ``input_data``.
        literal: host array of literal words, parallel to ``input_data``.
        length: kept for interface compatibility; the value is ignored and
            recomputed from ``len(input_data)``.

    Returns:
        ``(reduced_input_data, reduced_chunk_id, reduced_literal)``: the
        first two are fresh numpy copies (int32 and int64), the third is the
        uint32 host array filled from the device.
    """
    length = numpy.int64(len(input_data))
    # bin() output includes the '0b' prefix, so this is the bit count plus 2.
    bin_length = max(len(bin(length - 1)), len(bin(tpb - 1)))
    thread_num = numpy.int64(math.pow(2, bin_length))
    # Floor division keeps the launch configuration integral on Python 2 and 3.
    block_num = max(thread_num // tpb, 1)

    flag = numpy.zeros(thread_num, dtype='int64')
    arg_useless = numpy.zeros(thread_num, dtype='int64')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)

    produce_flag[block_num, tpb](input_data, d_chunk_id, length, d_flag)
    # Copy the raw flags back to ``flag`` before the scan rewrites d_flag.
    d_flag.to_host(stream)
    stream.synchronize()

    # -1 marks "no run starts here"; entries >= 0 are kept further down.
    start_pos = numpy.ones(length, dtype='int64') * (-1)

    radix_sort.Blelloch_scan_caller(d_flag, arg_useless, 0)

    d_start_pos = cuda.to_device(start_pos, stream)
    dd_flag = cuda.to_device(flag, stream)

    get_startPos[(length - 1) // tpb + 1, tpb](dd_flag, d_flag, d_start_pos,
                                               length)
    d_start_pos.to_host(stream)
    stream.synchronize()

    # A list comprehension (rather than filter) so len() works on Python 3.
    group_starts = [pos for pos in start_pos if pos >= 0]
    reduced_length = len(group_starts)
    # Sentinel so the last run has an end position.
    group_starts.append(length)

    reduced_input_data = numpy.zeros(reduced_length, dtype='int32')
    reduced_chunk_id = numpy.zeros(reduced_length, dtype='int64')
    reduced_literal = numpy.zeros(reduced_length, dtype='uint32')

    dd_start_pos = cuda.to_device(numpy.array(group_starts), stream)
    d_reduced_chunk_id = cuda.to_device(reduced_chunk_id, stream)
    d_reduced_literal = cuda.to_device(reduced_literal, stream)
    d_reduced_input_data = cuda.to_device(reduced_input_data, stream)

    block_num = (reduced_length - 1) // tpb + 1
    get_reduced[block_num, tpb](d_literal, dd_start_pos, reduced_length,
                                d_reduced_literal, input_data, d_chunk_id,
                                d_reduced_input_data,
                                d_reduced_chunk_id)  # kernel function

    # Bring the three reduced arrays back into their host buffers.
    d_reduced_literal.to_host(stream)
    d_reduced_chunk_id.to_host(stream)
    d_reduced_input_data.to_host(stream)
    stream.synchronize()

    return numpy.array(reduced_input_data), numpy.array(
        reduced_chunk_id), reduced_literal
Example #44
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Fill ``paths`` column by column on the GPU using two CUDA streams.

    The rows of ``paths`` are split into ``num_streams`` contiguous
    partitions, each with its own stream, random number generator and
    device buffers. For every column ``j >= 1`` the ``cu_step`` kernel is
    launched per partition with the previous column, the destination
    buffer, ``dt``, the two constants ``c0``/``c1`` and a buffer of fresh
    normal random numbers; the result is copied back into ``paths[:, j]``.

    Parameters:
        paths: 2-D host array, modified in place; column 0 is read as the
            starting values.
        dt: time step passed to the kernel and used in ``c1``.
        interest: used in ``c0 = interest - 0.5 * volatility ** 2``.
        volatility: used in ``c0`` and ``c1 = volatility * sqrt(dt)``.

    Returns:
        None. All streams are synchronized before returning.
    """
    n = paths.shape[0]
    num_streams = 2

    # Split the rows into num_streams contiguous (begin, end) ranges.
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [
        int(math.ceil(float(partlen) / blksz)) for partlen in partlens
    ]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [
        curand.PRNG(curand.PRNG.MRG32K3A, stream=strm) for strm in strmlist
    ]

    # Allocate device side array
    d_normlist = [
        cuda.device_array(partlen, dtype=np.double, stream=strm)
        for partlen, strm in zip(partlens, strmlist)
    ]

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [
        cu_step[gridsz, blksz, strm]
        for gridsz, strm in zip(gridszlist, strmlist)
    ]

    d_lastlist = [
        cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
        for (s, e), strm in zip(partitions, strmlist)
    ]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [
            cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm))
            for (s, e), strm in zip(partitions, strmlist)
        ]

        for step, d_last, d_paths, d_norm in zip(steplist, d_lastlist,
                                                 d_pathslist, d_normlist):
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # Release each partition's own previous-column buffer. The original
        # reused the ``d_last`` left over from the loop above, so it released
        # the final partition's buffer once per stream and never released
        # the others.
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #45
0
    
    if price < 0:
        price = 0 # lower bound

    return x, price
   
#upload memory to gpu
# presumably blocks-per-grid and threads-per-block for kernel launches that
# are not visible in this fragment -- confirm
bpg = 50
tpb = 32

nView = 5
steps = 2000

# First entry of every generated price list (see ``prices`` below).
initialPrice = 100

stream = cuda.stream() #initialize memory stream

# instantiate a cuRAND PRNG
prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

# Number of price paths to generate.
paths = 1

pricePath = []

for j in range(paths):
    print "Generating path: %s" % j
    # plotting lists
    LogReturns, nLogReturns = [0], [0] # log returns, normalized log returns
    xchange, xcorrelation = [], [] # change in price P, and autocorrelation
    activeTraders = [] # number of active traders
    prices = [initialPrice]
Example #46
0
	print input_data
	
	f1 = open('input_data.txt', 'w')
	f1.write(str(list(input_data)))
	f2 = open("rid.txt", 'w')
	f2.write(str(list(rid)))
	f1.close()
	f2.close()

	#cardinality = input_data[-1]+1 
	cardinality = len(attr_dict['worker_class'])
	print 'rid:\n',rid
	literal = numpy.zeros(length, dtype = 'uint32')
	chunk_id = numpy.zeros(length, dtype = 'int64')
	
	stream = cuda.stream()
	d_rid = cuda.to_device(rid, stream)
	d_chunk_id = cuda.to_device(chunk_id, stream)
	d_literal = cuda.to_device(literal, stream)
	#step2 produce chunk_id and literal
	produce_chId_lit_gpu[length/tpb+1, tpb](d_rid, d_literal, d_chunk_id)
	d_rid.to_host(stream)
	d_chunk_id.to_host(stream)
	d_literal.to_host(stream)
	stream.synchronize()
	print chunk_id
	for i in literal:
		print i
	#step3 reduce by key(value, chunk_id)
	reduced_input_data,	reduced_chunk_id, reduced_literal = reduce_by_key(input_data, chunk_id, literal, length)
	reduced_length = reduced_input_data.shape[0]#row
def main():
    """Time the GPU discriminant ufunc in one shot and in chunks.

    ``poly.discriminant`` is vectorized for the GPU, applied once to the
    whole input and once chunk by chunk on the same stream; execution time
    and throughput are printed for both runs. With ``-verify`` among the
    command-line arguments the result is checked via ``poly.check_answer``.
    """
    gpu_vectorize = vectorize([f4(f4, f4, f4), f8(f8, f8, f8)], target='gpu')
    cu_discriminant = gpu_vectorize(poly.discriminant)

    N = 1e+8 // 2

    print('Data size %s' % N)

    A, B, C = poly.generate_input(N, dtype=np.float32)
    D = np.empty(A.shape, dtype=A.dtype)

    stream = cuda.stream()

    print('== One')

    t_begin = time()

    # Leaving the context synchronizes the stream.
    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        # D is only an output buffer, so its contents are not uploaded.
        dD = cuda.to_device(D, stream, copy=False)
        cu_discriminant(dA, dB, dC, out=dD, stream=stream)
        dD.to_host(stream)

    t_end = time()

    total_time = t_end - t_begin

    print('Execution time %.4f' % total_time)
    print('Throughput %.2f' % (N / total_time))

    print('== Chunked')

    chunksize = 1e+7
    chunkcount = N // chunksize

    print('Chunk size %s' % chunksize)

    chunks_a, chunks_b, chunks_c, chunks_d = [
        np.split(whole, chunkcount) for whole in (A, B, C, D)
    ]

    # Holds every device array of the chunked run until the function returns.
    device_ptrs = []

    t_begin = time()

    with stream.auto_synchronize():
        for a, b, c, d in zip(chunks_a, chunks_b, chunks_c, chunks_d):
            dA = cuda.to_device(a, stream)
            dB = cuda.to_device(b, stream)
            dC = cuda.to_device(c, stream)
            dD = cuda.to_device(d, stream, copy=False)
            cu_discriminant(dA, dB, dC, out=dD, stream=stream)
            dD.to_host(stream)
            device_ptrs += [dA, dB, dC, dD]

    t_end = time()

    total_time = t_end - t_begin

    print('Execution time %.4f' % total_time)
    print('Throughput %.2f' % (N / total_time))

    if '-verify' in sys.argv[1:]:
        poly.check_answer(D, A, B, C)
    # attr_num = 1
    total_row = len(attr_values[0])
    for idx in range(attr_num):
        input_data = numpy.array(attr_values[idx])
        length = input_data.shape[0]
        rid = numpy.arange(0, length, dtype='int64')

        #step1 sort
        radix_sort.radix_sort(input_data, rid)
        print rid
        print rid.dtype
        cardinality = len(attr_value_NO[idx].items())
        literal = numpy.zeros(length, dtype='uint32')
        chunk_id = numpy.zeros(length, dtype='int64')

        stream = cuda.stream()
        #d_rid = cuda.to_device(rid, stream)
        d_chunk_id = cuda.to_device(chunk_id, stream)
        d_literal = cuda.to_device(literal, stream)
        #step2 produce chunk_id and literal
        produce_chId_lit_gpu[length / tpb + 1, tpb](rid, d_literal, d_chunk_id,
                                                    length)
        #d_rid.to_host(stream)
        d_chunk_id.to_host(stream)
        d_literal.to_host(stream)
        stream.synchronize()
        print '!!!!!!!!!!!!!!!!!!!!!!!!!!chunk_id:!!!!!!!!!!!!!!!!!!!'
        print chunk_id
        #step3 reduce by key(value, chunk_id)
        reduced_input_data, reduced_chunk_id, reduced_literal = reduce_by_key(
            input_data, chunk_id, literal, length)
Example #49
0
def radix_sort(arr, rid):
    """Run GPU radix-sort passes over ``arr``, carrying ``rid`` along.

    One pass is executed per bit position ``i`` in
    ``range(len(bin(ATTR_CARD_MAX)))``. Each pass builds the zero/one lists
    with ``get_list``, sums the zero list with ``sum_reduction`` to obtain
    the base offset for the ones, scans both lists with
    ``Blelloch_scan_caller`` and finally calls ``array_adjust`` to move the
    elements. Progress is printed at every stage.

    Parameters:
        arr: host array of keys (presumably reordered in place by
            ``array_adjust`` -- confirm against that kernel).
        rid: host array of row ids, passed to ``array_adjust`` alongside
            ``arr``.

    Returns:
        None; no value is returned in the code visible here.
    """
    length = numpy.int64(len(arr))
    # bin() output includes the '0b' prefix, so this is the bit count plus 2.
    bin_length = max(len(bin(length-1)),len(bin(TPB_MAX-1)))#the bit number of binary form of array length
    thread_num = numpy.int64(math.pow(2,bin_length))
    block_num = max(thread_num/TPB_MAX,1)

    print 'length: %d'%length
    print 'bin_length: %d'%bin_length
    print 'thread_num: %d'%thread_num
    print 'block_num: %d'%block_num

    stream = cuda.stream()
    # Host buffers of thread_num entries each, re-uploaded at the start of every pass.
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')

    # Number of passes; again includes the two characters of the '0b' prefix.
    iter_num = len(bin(ATTR_CARD_MAX))
    print 'iter_num: %d'%iter_num
    for i in range(iter_num):
        print '***************************'
        print 'iteration_%d:'%i
        print arr
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list,stream)
        d_one_list = cuda.to_device(one_list,stream)
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)#get one_list and zero_list
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        stream.synchronize()
        print 'zero_list:'
        print zero_list
        print 'one_list'
        print one_list
        
        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        
        print 'base_reduction_block_num: %d'%base_reduction_block_num
        # One partial sum per block; the partial sums are added up on the host below.
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()
        base = 0 #base for the scan of one_list
        for j in xrange(base_reduction_block_num):
            base += tmp_out[j]
        print 'base: %d'%base

        #then do scanning(one_list and zero_list at the same time)
        print 'begin scan'
        Blelloch_scan_caller(d_zero_list, d_one_list, base)
        
        print 'scan finished'
        print
        #adjust array elements' position
        print 'begin adjust'
        print 'zero_list:'
        print zero_list
        # Both the host arrays and their device copies are passed to the kernel.
        array_adjust[block_num,TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
        print arr
        print
def get_pic_path(path):
    #print 'open source file in bitmap_pickle: '.strip()
    start = time.time()
    attr_dict, attr_values, attr_value_NO, attr_list, data_pic_path = data_pickle.openfile(
        path)
    end = time.time()
    #print str(end-start)

    #print 'index part(get bitmap, keylength and offset): '.strip()
    start = time.time()
    attr_num = len(attr_list)
    lists = [[] for i in xrange(attr_num)]
    key = [[] for i in xrange(attr_num)]
    offset = [[] for i in xrange(attr_num)]

    # attr_num = 1
    total_row = len(attr_values[0])
    for idx in range(attr_num):
        input_data = numpy.array(attr_values[idx])
        length = input_data.shape[0]
        rid = numpy.arange(0, length)
        #step1 sort
        #print 'time in step1--sort:'
        start = time.time()
        radix_sort.radix_sort(input_data, rid)
        end = time.time()
        #print str(end-start)

        cardinality = len(attr_value_NO[idx].items())
        literal = numpy.zeros(length, dtype='uint32')
        chunk_id = numpy.zeros(length, dtype='int64')

        #print 'time in step2--produce chId_lit:'
        start = time.time()
        stream = cuda.stream()
        #d_rid = cuda.to_device(rid, stream)
        d_chunk_id = cuda.to_device(chunk_id, stream)
        d_literal = cuda.to_device(literal, stream)
        #step2 produce chunk_id and literal
        produce_chId_lit_gpu[length / tpb + 1, tpb](rid, d_literal, d_chunk_id,
                                                    length)

        #d_rid.to_host(stream)
        d_chunk_id.to_host(stream)
        d_literal.to_host(stream)
        stream.synchronize()
        end = time.time()
        #print str(end-start)

        #step3 reduce by key(value, chunk_id)
        #print 'time in step3--reduce by key:'
        start = time.time()
        reduced_input_data, reduced_chunk_id, reduced_literal = reduce_by_key(
            input_data, chunk_id, literal, length)
        reduced_length = reduced_input_data.shape[0]  #row
        end = time.time()
        #print str(end-start)
        #print '##############################reduced############################'
        #for i in xrange(reduced_length):
        #	print reduced_input_data[i], reduced_chunk_id[i], bin(reduced_literal[i])

        #step4 produce 0-Fill word
        #print 'time in step4--produce 0-fill word:'
        start = time.time()
        fill_word, head = produce_fill(reduced_input_data, reduced_chunk_id,
                                       reduced_length)
        end = time.time()
        #print str(end-start)

        #step 5 & 6: get index by interleaving 0-Fill word and literal(also remove all-zeros word)
        #print 'time in step5--get out_index & length & offset:'
        start = time.time()
        out_index, offsets, key_length = getIdx(fill_word, reduced_literal,
                                                reduced_length, head,
                                                cardinality)
        end = time.time()
        #print str(end-start)

        lists[idx] = out_index
        key[idx] = key_length
        offset[idx] = offsets
    end = time.time()
    #print str(end-start)
    '''
	print '*****************index:'
	print lists
	print '*****************length:'
	print key
	print '*****************offset:'
	print offset
	'''

    print 'put index result into file: '.strip()
    start = time.time()
    bitmap_pic_path = 'bitmap_pic.pkl'
    f1 = open(bitmap_pic_path, 'wb')
    pickle.dump(lists, f1, True)
    pickle.dump(key, f1, True)
    pickle.dump(offset, f1, True)
    f1.close()
    end = time.time()
    print str(end - start)
    return data_pic_path, bitmap_pic_path, attr_num
def main():
    cu_discriminant = vectorize(
        [f4(f4, f4, f4), f8(f8, f8, f8)], target='gpu')(poly.discriminant)

    N = 1e+8 // 2

    print 'Data size', N

    A, B, C = poly.generate_input(N, dtype=np.float32)
    D = np.empty(A.shape, dtype=A.dtype)

    stream = cuda.stream()

    print '== One'

    ts = time()

    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        dD = cuda.to_device(D, stream, copy=False)
        cu_discriminant(dA, dB, dC, out=dD, stream=stream)
        dD.to_host(stream)

    te = time()

    total_time = (te - ts)

    print 'Execution time %.4f' % total_time
    print 'Throughput %.2f' % (N / total_time)

    print '== Chunked'

    chunksize = 1e+7
    chunkcount = N // chunksize

    print 'Chunk size', chunksize

    sA = np.split(A, chunkcount)
    sB = np.split(B, chunkcount)
    sC = np.split(C, chunkcount)
    sD = np.split(D, chunkcount)

    device_ptrs = []

    ts = time()

    with stream.auto_synchronize():
        for a, b, c, d in zip(sA, sB, sC, sD):
            dA = cuda.to_device(a, stream)
            dB = cuda.to_device(b, stream)
            dC = cuda.to_device(c, stream)
            dD = cuda.to_device(d, stream, copy=False)
            cu_discriminant(dA, dB, dC, out=dD, stream=stream)
            dD.to_host(stream)
            device_ptrs.extend([dA, dB, dC, dD])

    te = time()

    total_time = (te - ts)

    print 'Execution time %.4f' % total_time
    print 'Throughput %.2f' % (N / total_time)

    if '-verify' in sys.argv[1:]:
        poly.check_answer(D, A, B, C)