def main():
    """Launch the ``saxpy`` kernel on 2048 * 2048 elements and print a check.

    Three host arrays are built (``a`` filled with 2, ``b`` with 1, ``c``
    with 0), uploaded to the device and handed to ``saxpy``.  ``c`` is then
    copied back and its first and last five values are printed.
    """
    N = 2048 * 2048

    # Host buffers: allocate, then broadcast the initial value into each.
    a = np.empty(N)
    a[:] = 2
    b = np.empty(N)
    b[:] = 1
    c = np.empty(N)
    c[:] = 0

    # Device copies of the three host buffers.
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c)

    # Fixed launch configuration: 33000 * 128 threads, slightly more than N.
    threads_per_block = 128
    number_of_blocks = 33000
    launch = saxpy[number_of_blocks, threads_per_block]
    launch(d_a, d_b, d_c)

    # Bring the result back into the existing host buffer.
    d_c.copy_to_host(c)

    # Quality check: first and last five values of c.
    head, tail = c[0:5], c[-5:]
    print(str(head))
    print(str(tail))
Beispiel #2
0
 def fit(self,X,Budget=None,W=None):
     """Upload the data, prepare budget and weights, then train in mini-batches.

     X is converted to Fortran-ordered float64 and copied to the device.
     When ``Budget`` is None the budget buffer is filled by ``initBudget``
     from a random permutation of the sample indices; otherwise the given
     array is uploaded.  When ``W`` is None ``self.initW()`` is called;
     otherwise the given array is uploaded.  Training runs ``self.epochs``
     epochs and ends with ``self.predictH()``.
     """
     f64 = np.float64
     self.X = cuda.to_device(X.astype(f64, order='F'))

     # Device work buffers, all Fortran-ordered float64.
     n_rows = self.X.shape[0]
     n_cols = self.X.shape[1]
     self.Budget = cuda.device_array((self.budgetSize, n_cols), dtype=f64, order='F')
     self.kx = cuda.device_array((self.budgetSize, n_rows), dtype=f64, order='F')
     self.Wkx = cuda.device_array((self.latentTopics, n_rows), dtype=f64, order='F')
     self.H = cuda.device_array((self.latentTopics, n_rows), dtype=f64, order='F')

     if Budget is not None:
         # Caller-supplied budget replaces the buffer allocated above.
         self.Budget = cuda.to_device(Budget.astype(f64, order='F'))
     else:
         shuffled = np.random.permutation(self.X.shape[0])
         self.permutation = cuda.to_device(shuffled)
         initBudget(self.X, self.permutation, self.Budget)

     self.calculateKB()
     self.calculateKX()

     if W is not None:
         self.W = cuda.to_device(W.astype(f64, order='F'))
     else:
         self.initW()

     self.t = 0
     for epoch in xrange(self.epochs):
         print("Epoch " + str(epoch))
         samples, _features = self.X.shape
         batch_order = getPermutation(samples, self.miniBatchSize)
         self.permutation = cuda.to_device(batch_order)
         # NOTE(review): this yields one extra batch when samples is an exact
         # multiple of miniBatchSize; presumably getPermutation pads -- confirm.
         n_batches = (samples + self.miniBatchSize) / self.miniBatchSize
         for batch in xrange(n_batches):
             loadBatch(self.kx, self.permutation, batch, self.kxi)
             self.nextW()
             self.t += 1

     self.predictH()
def getIdx(fill_word, reduced_literal, reduced_length, head, cardinality):
    """Step 5: build the index by interleaving fill words and literals.

    All-zero words are dropped: ``getIdx_gpu`` produces the interleaved
    words plus a keep-flag per slot, the flags are scanned by
    ``radix_sort.Blelloch_scan_caller`` and ``scatter_index`` writes the
    kept words to their scanned positions.

    Args:
        fill_word: host array of fill words (uploaded to the device).
        reduced_literal: sequence of literal words (uploaded to the device).
        reduced_length: number of reduced entries; 2 * reduced_length slots
            are interleaved.
        head: per-entry truth values; an offset is recorded for every entry
            whose ``head[i]`` is true.
        cardinality: number of keys; ``head`` must mark at least this many
            entries, otherwise indexing ``offsets`` raises IndexError.

    Returns:
        tuple: ``(out_index, offsets, key_length)`` where ``offsets`` holds
        the scanned position of each head entry and ``key_length`` the
        distance between consecutive offsets (the last one runs to the end
        of ``out_index``).
    """
    slots = 2 * reduced_length

    # NOTE(review): len(bin(x)) also counts the '0b' prefix, so bin_length is
    # two larger than the bit width of x -- confirm the extra padding is
    # intended before changing it.
    bin_length = max(len(bin(slots - 1)), len(bin(tpb - 1)))
    # The scan wants an array length that is a multiple of threads-per-block.
    thread_num = numpy.int64(math.pow(2, bin_length))
    compact_flag = numpy.ones(thread_num, dtype='int64')
    index = numpy.ones(slots, dtype='uint32')

    d_index = cuda.to_device(index)
    d_fill_word = cuda.to_device(fill_word)
    d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
    d_compact_flag = cuda.to_device(compact_flag)

    block_num = reduced_length / tpb + 1
    getIdx_gpu[block_num, tpb](d_fill_word, d_reduced_literal, d_index,
                               d_compact_flag, reduced_length)
    # Keep the un-scanned flags on the host; scatter_index needs them.
    compact_flag = d_compact_flag.copy_to_host()

    useless_array = numpy.zeros(thread_num, dtype='int64')
    radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
    out_index_length = d_compact_flag.copy_to_host()[slots - 1] + 1
    out_index = numpy.zeros(out_index_length, dtype='uint32')

    new_block_num = slots / tpb + 1
    scatter_index[new_block_num, tpb](d_index, d_compact_flag, compact_flag,
                                      out_index, reduced_length)

    # Copy the scanned flags back ONCE after the scatter kernel instead of
    # transferring the whole array again for every head element.
    scanned = d_compact_flag.copy_to_host()
    offsets = [scanned[2 * i] for i in xrange(reduced_length) if head[i]]

    key_length = numpy.zeros(cardinality, dtype='int64')
    for i in xrange(cardinality - 1):
        key_length[i] = offsets[i + 1] - offsets[i]
    key_length[cardinality - 1] = out_index_length - offsets[cardinality - 1]

    return out_index, numpy.array(offsets), numpy.array(key_length)
Beispiel #4
0
def gpumulti(X,mu):
    """Run ``cu_worker`` over the first two columns of X and return its output.

    Args:
        X: sequence of rows; only columns 0 and 1 are used.
        mu: values converted to an array and uploaded for the kernel.

    Returns:
        The host copy of the index buffer after the kernel ran.  The buffer
        starts as ``arange(len(X column))``; what the kernel stores in it is
        not visible here.
    """
    device = cuda.get_current_device()

    n = len(X)
    columns = np.array(X).T
    x1 = np.array(columns[0])
    x2 = np.array(columns[1])
    bmk = np.arange(len(x1))
    mu = np.array(mu)

    # Upload inputs and the output buffer.
    dx1 = cuda.to_device(x1)
    dx2 = cuda.to_device(x2)
    dmu = cuda.to_device(mu)
    dbmk = cuda.to_device(bmk)

    # One warp per block, enough blocks to cover all n rows (ceil division).
    tpb = device.WARP_SIZE
    bpg = (n + tpb - 1) // tpb

    kernel = cu_worker[bpg, tpb]
    kernel(dx1, dx2, dmu, dbmk)

    return dbmk.copy_to_host()
Beispiel #5
0
def radix_sort(arr, rid):
    """Run the multi-pass GPU radix sort driver over ``arr`` and ``rid``.

    One pass is made per character of ``bin(ATTR_CARD_MAX)``.  Each pass
    uploads ``arr``/``rid`` and two scratch lists, calls ``get_list``,
    reduces ``d_zero_list`` with ``sum_reduction`` to obtain a scan base,
    scans with ``Blelloch_scan_caller`` and finally calls ``array_adjust``.

    The function has no return statement.
    NOTE(review): presumably the sorted data ends up in the host arrays
    ``arr`` and ``rid``, which are passed directly to ``array_adjust`` --
    confirm against the kernel.
    """
    length = numpy.int64(len(arr))
    # NOTE(review): len(bin(x)) includes the '0b' prefix, so this is two more
    # than the bit width of the larger of (length - 1) and (TPB_MAX - 1).
    bin_length = max(len(bin(length-1)),len(bin(TPB_MAX-1)))#the bit number of binary form of array length
    # Scratch arrays are sized to a power of two derived from bin_length.
    thread_num = numpy.int64(math.pow(2,bin_length))
    block_num = max(thread_num/TPB_MAX,1)

    stream = cuda.stream()
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')

    # Number of passes; note this also counts the '0b' prefix characters.
    iter_num = len(bin(ATTR_CARD_MAX))
    for i in range(iter_num):
        # Fresh device copies of the current host state for this pass.
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list,stream)
        d_one_list = cuda.to_device(one_list,stream)
        # NOTE(review): the host array `arr` (not d_arr) is passed here --
        # confirm this is intended.
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)#get one_list and zero_list
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        # Wait for the asynchronous copies before the host reads the lists.
        stream.synchronize()
        
        # Reduce d_zero_list into one partial result per block.
        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()
        # Sum the per-block partial results on the host.
        base = 0 #base for the scan of one_list
        for j in xrange(base_reduction_block_num):
            base += tmp_out[j]

        Blelloch_scan_caller(d_zero_list, d_one_list, base)

        # Both host and device versions of the arrays are handed to the kernel.
        array_adjust[block_num,TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
Beispiel #6
0
def run_GPU(grid, adjGrid, steps, delay, initDelay, printInd, indSteps):
    """Run the command-line interface on the GPU for a number of steps.

    ``grid`` and ``adjGrid`` are passed explicitly (rather than as a Game)
    so that everything can run on the GPU.

    Args:
        grid: host array holding the current state.
        adjGrid: host array uploaded once and passed to every kernel call.
        steps: number of steps to run, or -1 to run forever.
        delay: seconds to sleep after every step.
        initDelay: extra seconds to sleep after step 0 so the initial
            position is easier to see.
        printInd: print the grid every ``printInd`` steps; -1 disables it.
        indSteps: print the step number every ``indSteps`` steps; -1
            disables it.

    Returns:
        The final grid state as a host array.
    """
    step = 0
    dim = grid.shape
    # Move arrays to the GPU.
    d_grid = cuda.to_device(grid)
    d_adjGrid = cuda.to_device(adjGrid)
    blockDim = (32, 16)
    gridDim = (32, 8)
    while step < steps or steps == -1:
        # Compare by value: identity (`is`) on ints only works by accident of
        # CPython's small-integer cache and fails for e.g. NumPy integers.
        if printInd != -1 and step % printInd == 0:
            # To print the grid, its memory must first be back on the CPU.
            d_grid.to_host()
            printGrid(grid, step, dim)
        if indSteps != -1 and step % indSteps == 0:
            print("Step = " + str(step))
        # Fresh zeroed output buffer for this generation.
        newGrid = np.zeros_like(grid)
        d_newGrid = cuda.to_device(newGrid)
        evolve2D_kernel[gridDim, blockDim](d_grid, d_adjGrid, d_newGrid)
        # The output becomes the input of the next step (host and device).
        d_grid = d_newGrid
        grid = newGrid
        sleep(delay)
        if step == 0:
            # Allow the initial position to be more easily visible.
            sleep(initDelay)
        step += 1
    d_grid.to_host()
    return grid
def evaluation_function(factors, opts):
    """Score ``factors`` by generating samples on the GPU and comparing them.

    Args:
        factors: array uploaded to the device and passed to the kernel.
        opts: dict with keys ``'longest_wavelet'`` (integer) and
            ``'target_samples'`` (array the generated samples are compared
            against).

    Returns:
        float: ``|sum(target_samples - result)|`` plus 10 for every non-zero
        entry of the generated result.
    """
    start = timer()

    longest_wavelet, target_samples = opts['longest_wavelet'], opts['target_samples']
    window_width = len(target_samples)
    full_width = window_width + longest_wavelet
    num_wavelengths = longest_wavelet - 2
    # Floor division: this is a count and must stay an integer on Python 3.
    offsets_per_wavelet = full_width // num_wavelengths
    num_rows = offsets_per_wavelet * num_wavelengths

    result = np.zeros(window_width, dtype=np.float32)

    d_factors = cuda.to_device(factors)
    d_result = cuda.to_device(result)

    griddim = full_width, 1
    blockdim = 4, 1, 1

    compute_samples_configured = compute_sample_kernel.configure(griddim, blockdim)
    compute_samples_configured(d_factors, longest_wavelet, offsets_per_wavelet, d_result, num_rows)

    # Copy the kernel output back into `result`.
    d_result.to_host()
    generated_samples_sum = sum(result)
    factors_sum = sum(factors)
    difference_from_target = math.fabs(sum(target_samples - result))
    # np.count_nonzero replaces len(filter(...)), which raises TypeError
    # wherever filter() returns an iterator.
    # NOTE(review): the penalty counts non-zero entries of `result`, although
    # the original variable was called non_zero_factors -- confirm intent.
    non_zero_count = np.count_nonzero(result)

    fun_value = difference_from_target + 10.0 * non_zero_count

    print("Value "+str(fun_value)+" generated in " + str((timer() - start)) + " seconds. Sample sum: " +
          str(generated_samples_sum)+". Factors sum: "+str(factors_sum))

    return fun_value
Beispiel #8
0
def main():
    """Run ``saxpy`` over 2048 * 2048 elements with a computed grid size.

    Host arrays ``a``, ``b`` and ``c`` are filled with 2, 1 and 0, uploaded,
    passed to the kernel, and ``c`` is copied back and spot-checked by
    printing its first and last five values.
    """
    N = 2048 * 2048

    # Allocate the host arrays, then set each to its initial value.
    a, b, c = (np.empty(N) for _ in range(3))
    for host_array, value in ((a, 2), (b, 1), (c, 0)):
        host_array.fill(value)

    # Upload in the order a, b, c.
    d_a, d_b, d_c = [cuda.to_device(host_array) for host_array in (a, b, c)]

    # One block more than N / 128, so every element is covered.
    threads_per_block = 128
    number_of_blocks = N / 128 + 1

    configured = saxpy[number_of_blocks, threads_per_block]
    configured(d_a, d_b, d_c)

    d_c.copy_to_host(c)

    # Quality check on both ends of the result.
    print(c[:5])
    print(c[-5:])
def getIdx(fill_word, reduced_literal, reduced_length):
    """Step 5: build the index by interleaving fill words and literals.

    All-zero words are removed: ``getIdx_gpu`` fills the interleaved words
    and a keep-flag per slot, the flags are scanned, and ``scatter_index``
    writes the kept words to their scanned positions.  Several intermediate
    values are printed for debugging.

    Args:
        fill_word: host array of fill words (uploaded to the device).
        reduced_literal: sequence of literal words (uploaded to the device).
        reduced_length: number of reduced entries; 2 * reduced_length slots
            are produced.

    Returns:
        numpy uint32 array ``out_index`` of length ``last scanned flag + 1``.
    """
    slots = 2 * reduced_length

    # NOTE(review): len(bin(x)) also counts the '0b' prefix, so bin_length is
    # two larger than the bit width -- confirm before tightening.
    bin_length = max(len(bin(reduced_length - 1)), len(bin(tpb - 1)))
    # The scan wants an array length that is a multiple of threads-per-block.
    thread_num = numpy.int64(math.pow(2, bin_length))
    compact_flag = numpy.ones(thread_num, dtype='int64')
    print(thread_num)
    index = numpy.ones(slots, dtype='uint32')

    d_index = cuda.to_device(index)
    d_fill_word = cuda.to_device(fill_word)
    d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
    d_compact_flag = cuda.to_device(compact_flag)

    getIdx_gpu[1, tpb](d_fill_word, d_reduced_literal, d_index,
                       d_compact_flag, reduced_length)
    # Keep the un-scanned flags on the host; scatter_index needs them.
    compact_flag = d_compact_flag.copy_to_host()

    useless_array = numpy.zeros(thread_num, dtype='int64')
    radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)

    # A single device->host copy serves both the length lookup and the debug
    # print; nothing touches the device array in between.
    scanned = d_compact_flag.copy_to_host()
    out_index_length = scanned[slots - 1] + 1
    print(scanned[0:slots])
    print(out_index_length)

    out_index = numpy.zeros(out_index_length, dtype='uint32')
    scatter_index[1, tpb](d_index, d_compact_flag, compact_flag, out_index,
                          reduced_length)
    return out_index
Beispiel #10
0
	def train(self,ds,epochs,batch_size=10):
		"""Train on ``ds`` in batches for up to ``epochs`` epochs.

		Each batch stacks ``batch_size`` encoded (input, target) pairs, uploads
		them, runs ``self.forward`` and ``self.backward``, and counts the batch
		as correct when the decoded output equals the decoded target.  Training
		stops early once an epoch's accuracy exceeds 0.99.

		Args:
			ds: indexable dataset; ``ds[k][0]`` is the input and ``ds[k][1]``
				the target of sample ``k``.
			epochs: maximum number of passes over the dataset.
			batch_size: samples per batch; leftover samples are not used.
		"""
		for epoch in range(epochs):
			start = timer()
			count = 0.
			correct = 0.
			for i in range(len(ds)/batch_size):
				count += 1.
				first = i*batch_size
				# Seed the batch with its first sample, then append the rest.
				inputs = encode(ds[first][0],gpu=False)
				targets = encode(ds[first][1],gpu=False)
				for k in range(first + 1, first + batch_size):
					inputs = np.concatenate((inputs,encode(ds[k][0],gpu=False)))
					targets = np.concatenate((targets,encode(ds[k][1],gpu=False)))
				x = cuda.to_device(inputs)
				t = cuda.to_device(targets)
				# Batch width must match the input and output layer sizes.
				assert x.shape[1] == self.layers[0]
				assert t.shape[1] == self.layers[2]
				print(x.shape)
				self.forward(x)
				print('output',decode(self.output))
				prediction_matches = decode(self.output) == decode(t)
				if prediction_matches:
					correct += 1.
				self.backward(t)
			print("Epoch",epoch,"Time:",timer()-start,'output',decode(self.output), 'Accuracy:',correct/count)
			accuracy = correct/count
			if accuracy > 0.99:
				break
Beispiel #11
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Fill ``paths`` column by column using the ``step`` CUDA kernel.

    Column 0 of ``paths`` is the starting state.  For every later column
    ``j`` a batch of normal variates is drawn, the kernel is launched with
    the previous and the current column, and the result is copied back into
    ``paths[:, j]``.  All asynchronous work is issued on a single stream
    that is synchronised once at the end.

    Args:
        paths: 2-D host array, updated in place.
        dt: time increment handed to the kernel.
        interest: used in ``c0 = interest - 0.5 * volatility ** 2``.
        volatility: used in ``c0`` and in ``c1 = volatility * sqrt(dt)``.
    """
    n = paths.shape[0]

    # Pool of preallocated device buffers, one path-column each.
    pool = MM(shape=n, dtype=np.double, prealloc=5)

    # One thread per path, using the largest block the device supports.
    threads_per_block = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    blocks_per_grid = int(math.ceil(float(n) / threads_per_block))

    stream = cuda.stream()
    rng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Device-side buffer for the random draws.
    d_noise = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Kernel bound to its launch configuration and stream,
    # like CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    launch_step = step[blocks_per_grid, threads_per_block, stream]

    d_previous = cuda.to_device(paths[:, 0], to=pool.get())
    for column in range(1, paths.shape[1]):
        rng.normal(d_noise, mean=0, sigma=1)
        d_current = cuda.to_device(paths[:, column], stream=stream,
                                   to=pool.get())
        launch_step(d_previous, d_current, dt, c0, c1, d_noise)
        d_current.copy_to_host(paths[:, column], stream=stream)
        # The previous column is no longer needed; return it to the pool.
        pool.free(d_previous, stream=stream)
        d_previous = d_current

    stream.synchronize()
def reduce_by_key(input_data, chunk_id, literal, length):  # step 3
    """Step 3: flag entries with ``produce_flag`` and keep the flagged ones.

    ``produce_flag`` fills a per-entry flag, ``reduce_by_key_gpu`` is then
    run with hop sizes 1, 2, 4, 8, 16 on the literals, and finally only the
    entries whose flag is non-zero are returned.

    Args:
        input_data: array passed to ``produce_flag`` and filtered on return.
        chunk_id: host array of chunk ids (uploaded, copied back, filtered).
        literal: host array of literal words (uploaded, reduced on the
            device, copied back, filtered).
        length: number of entries to process.

    Returns:
        tuple: ``(reduced_input_data, reduced_chunk_id, reduced_literal)``;
        the first two are numpy arrays, the third is a plain list.
    """
    flag = numpy.ones(length, dtype='int32')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)

    produce_flag[1, tpb](input_data, d_chunk_id, length, d_flag)
    d_flag.to_host(stream)
    # The copy above is asynchronous on `stream`: wait for it BEFORE the host
    # reads `flag`, otherwise the debug print may show stale data.
    stream.synchronize()
    print('flag:')
    print(flag)

    is_finish = numpy.zeros(length, dtype='int32')
    hop = 1
    while hop < 32:  # only 32 because a word is 32 bits long in binary form
        reduce_by_key_gpu[1, tpb](d_literal, d_flag, is_finish, hop, length)
        hop *= 2
    d_literal.to_host(stream)
    d_chunk_id.to_host(stream)
    stream.synchronize()

    # Keep only the entries whose flag is set.
    kept = [i for i in xrange(length) if flag[i]]
    reduced_input_data = [input_data[i] for i in kept]
    reduced_chunk_id = [chunk_id[i] for i in kept]
    reduced_literal = [literal[i] for i in kept]
    return numpy.array(reduced_input_data), numpy.array(reduced_chunk_id), reduced_literal
Beispiel #13
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Fill ``paths`` column by column by calling ``step`` with ``out=``.

    Column 0 is the starting state.  Each later column ``j`` is computed
    from the previous column and freshly drawn normal variates, written to a
    pooled device buffer via ``out=`` and copied back into ``paths[:, j]``.

    Args:
        paths: 2-D host array, updated in place.
        dt: time increment handed to ``step``.
        interest: used in ``c0 = interest - 0.5 * volatility**2``.
        volatility: used in ``c0`` and in ``c1 = volatility * sqrt(dt)``.
    """
    n = paths.shape[0]

    # Pool of preallocated device buffers, one path-column each.
    buffers = MM(shape=n, dtype=np.double, prealloc=5)

    # NOTE(review): block and grid sizes are computed but not used by the
    # step(...) call below -- kept to preserve the original call sequence.
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    generator = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Device-side buffer for the random draws.
    d_draws = cuda.device_array(n, dtype=np.double, stream=stream)

    vol_squared = volatility**2
    c0 = interest - 0.5 * vol_squared
    c1 = volatility * math.sqrt(dt)

    d_prev = cuda.to_device(paths[:, 0], to=buffers.get())
    for j in range(1, paths.shape[1]):
        host_column = paths[:, j]
        generator.normal(d_draws, mean=0, sigma=1)
        d_next = cuda.to_device(host_column, stream=stream, to=buffers.get())
        step(d_prev, dt, c0, c1, d_draws, out=d_next, stream=stream)
        d_next.copy_to_host(paths[:, j], stream=stream)
        # Hand the consumed column back to the pool (no stream argument here).
        buffers.free(d_prev)
        d_prev = d_next

    stream.synchronize()
Beispiel #14
0
def tests():
    """Time a few matrix operations on the CPU and on the GPU.

    Prints the wall-clock time of, in order: a NumPy dot product, a
    ``blas.gemm`` call on device arrays, the ``mtanh`` kernel, ``cm.tanh``
    and ``cm.dot``.  Nothing is returned.

    NOTE(review): none of the GPU timings synchronise before reading the
    timer, so they may cover only the launch -- confirm before comparing.
    """
    # --- CPU baseline: (300 x 500) . (500 x 300) with NumPy ---
    a = np.random.rand(300,500)
    b = np.random.rand(500,300)

    start = timer()
    c = np.dot(a,b)
    nptime = timer()-start
    print('nptime',nptime)

    # --- GPU gemm on Fortran-ordered float32 device arrays ---
    x = np.array(np.random.rand(600,1500),dtype='float32',order='F')
    y = np.array(np.random.rand(1500,300),dtype='float32',order='F')
    z = np.zeros((1000,1000),order='F',dtype='float32')

    stream = cuda.stream()

    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dz = cuda.to_device(z)

    start = timer()
    # NOTE(review): the size arguments (1000, 1500, 1000) do not match the
    # shapes of x (600 x 1500) and y (1500 x 300) created above -- confirm.
    blas.gemm('N','N',1000,1500,1000,1.0,dx,dy,0.0,dz)
    cutime = timer()-start
    print('cutime',cutime)

    # Disabled: copy the full result back to the host.
    #dz.copy_to_host(z)
    print(dz[0])

    # --- Custom tanh kernel on a 1000 x 1000 matrix of ones ---
    c = np.ones((1000,1000),order='F',dtype='float32')
    print(c.shape)
    dc = cuda.to_device(c)

    # Disabled alternative launch configuration:
   # blockDim = (256,256)
    #gridDim = (((1000 + blockDim[0]-1)/blockDim[0]),((1000 + blockDim[1]-1)/blockDim[1]))

    # Ceil-divide each dimension of c by the block size to cover the matrix.
    blockDim = (30,30)
    gridDim = ((((c.shape[0] + blockDim[0]) - 1) / blockDim[0]), (((c.shape[1] + blockDim[1]) - 1) / blockDim[1]))

    start = timer()
    mtanh[gridDim,blockDim,stream](dc)
    tantime = timer() - start
    print('tantime',tantime)

    # Asynchronous copy back on the stream; wait for it before printing.
    dc.copy_to_host(c,stream=stream)
    stream.synchronize()
    print(c)

    # --- Same operations through the `cm` matrix library ---
    y = cm.CUDAMatrix(np.ones((1000,1000)))

    start = timer()
    cm.tanh(y)
    cmtan = timer()-start
    print('cmtan',cmtan)

    x = cm.CUDAMatrix(np.random.rand(1000,1500))
    y = cm.CUDAMatrix(np.random.rand(1500,1000))

    start = timer()
    cm.dot(x,y)
    cmtime = timer()-start
    print('cmtime',cmtime)
Beispiel #15
0
def getIdx(fill_word, reduced_literal, reduced_length):
    """Step 5: interleave fill words and literals into one index array.

    All-zero words are removed.  ``getIdx_gpu`` produces the interleaved
    words and a keep-flag per slot, ``radix_sort.Blelloch_scan_caller``
    scans the flags, and ``scatter_index`` places the kept words.  The
    thread count, the scanned flags and the output length are printed.

    Returns:
        numpy uint32 array whose length is the last scanned flag plus one.
    """
    doubled = 2 * reduced_length

    # Power-of-two scratch size derived from the longer of the two bin()
    # strings (the scan needs a multiple of threads-per-block).
    width = max(len(bin(reduced_length - 1)), len(bin(tpb - 1)))
    thread_num = numpy.int64(math.pow(2, width))
    compact_flag = numpy.ones(thread_num, dtype='int64')
    print(thread_num)

    # Upload the working buffers.
    d_index = cuda.to_device(numpy.ones(doubled, dtype='uint32'))
    d_fill_word = cuda.to_device(fill_word)
    d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
    d_compact_flag = cuda.to_device(compact_flag)

    build_index = getIdx_gpu[1, tpb]
    build_index(d_fill_word, d_reduced_literal, d_index, d_compact_flag,
                reduced_length)
    # Host copy of the flags as they were before the scan.
    compact_flag = d_compact_flag.copy_to_host()

    useless_array = numpy.zeros(thread_num, dtype='int64')
    radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)

    last_slot = doubled - 1
    out_index_length = d_compact_flag.copy_to_host()[last_slot] + 1
    print(d_compact_flag.copy_to_host()[0:doubled])
    print(out_index_length)

    out_index = numpy.zeros(out_index_length, dtype='uint32')
    scatter = scatter_index[1, tpb]
    scatter(d_index, d_compact_flag, compact_flag, out_index, reduced_length)
    return out_index
Beispiel #16
0
def reduce_by_key(input_data, chunk_id, literal, length):  #step 3
    """Step 3: compute per-entry flags on the GPU and keep flagged entries.

    Args:
        input_data: array passed to ``produce_flag`` and filtered on return.
        chunk_id: host array of chunk ids (uploaded, copied back, filtered).
        literal: host array of literal words (uploaded, processed by
            ``reduce_by_key_gpu`` with hops 1, 2, 4, 8, 16, copied back and
            filtered).
        length: number of entries to process.

    Returns:
        tuple: ``(reduced_input_data, reduced_chunk_id, reduced_literal)``
        restricted to entries whose flag is non-zero; the first two are
        numpy arrays, the last is a plain list.
    """
    flag = numpy.ones(length, dtype='int32')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)
    produce_flag[1, tpb](input_data, d_chunk_id, length, d_flag)

    # Copy the flags back and wait for the asynchronous transfer to finish
    # before the host looks at them (previously printed before the sync).
    d_flag.to_host(stream)
    stream.synchronize()
    print('flag:')
    print(flag)

    is_finish = numpy.zeros(length, dtype='int32')
    # Hops double up to (but excluding) 32, the bit length of a word.
    for hop in (1, 2, 4, 8, 16):
        reduce_by_key_gpu[1, tpb](d_literal, d_flag, is_finish, hop, length)
    d_literal.to_host(stream)
    d_chunk_id.to_host(stream)
    stream.synchronize()

    reduced_input_data = []
    reduced_chunk_id = []
    reduced_literal = []
    for i in xrange(length):
        if not flag[i]:
            continue
        reduced_input_data.append(input_data[i])
        reduced_chunk_id.append(chunk_id[i])
        reduced_literal.append(literal[i])
    return numpy.array(reduced_input_data), numpy.array(
        reduced_chunk_id), reduced_literal
Beispiel #17
0
def test_scan():
    """Run ``preScan`` 32 times on a descending input and print each timing.

    The input holds ``NUM_ELEMENTS - 1`` down to 0 as uint32.  Upload, the
    32 scan calls and the download are each bracketed by ``time()`` calls;
    only the per-iteration scan time is printed.
    """
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    out_h = np.zeros(NUM_ELEMENTS, dtype=np.uint32)

    # Descending ramp: in_h[i] = NUM_ELEMENTS - i - 1.
    in_h[:] = np.arange(NUM_ELEMENTS - 1, -1, -1)

    # Upload phase.
    tac1 = time()
    in_d = cuda.to_device(in_h)
    out_d = cuda.to_device(out_h)
    cuda.synchronize()
    tac2 = time()

    # Scan phase: time every call individually.
    tk1 = time()
    for run in range(0, 32):
        tk1 = time()
        preScan(out_d, in_d, NUM_ELEMENTS)
        cuda.synchronize()
        tk2 = time()
        print("%s %s" % (run, tk2 - tk1))
    tk2 = time()

    # Download phase.
    th1 = time()
    out_d.copy_to_host(out_h)
    cuda.synchronize()
    th2 = time()
Beispiel #18
0
def main():
    """Run the Jacobi relaxation kernel on a 4096 x 4096 mesh and time it.

    The first column of both grids is set to 1.0.  The kernel is launched
    repeatedly with 32 x 32 blocks until the largest absolute entry of the
    error grid drops to ``tol`` or ``iter_max`` iterations have run.
    Progress is printed every 100 iterations and the total time at the end.
    """
    NN = 4096
    NM = 4096
    n, m = NN, NM

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    iter_max = 1000
    tol = 1.0e-6
    error = 1.0

    # Boundary condition: first column of both grids is 1.0.
    A[:, 0] = 1.0
    Anew[:, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    t_start = time.time()
    iteration = 0

    blockdim = (32, 32)
    griddim = (NN / blockdim[0], NM / blockdim[1])

    # Full-size error grid, same shape and dtype as A.
    error_grid = np.zeros_like(A)

    stream = cuda.stream()

    # The two grids go to the device and stay there.
    dA = cuda.to_device(A, stream)
    dAnew = cuda.to_device(Anew, stream)
    derror_grid = cuda.to_device(error_grid, stream)

    relax = jocabi_relax_core[griddim, blockdim, stream]
    while error > tol and iteration < iter_max:
        assert error_grid.dtype == np.float64

        relax(dA, dAnew, derror_grid)

        # Bring the error grid back and wait until it is on the host.
        derror_grid.to_host(stream)
        stream.synchronize()

        error = np.abs(error_grid).max()

        # Swap the roles of the two device grids.
        dA, dAnew = dAnew, dA

        if iter_is_reported(iteration):
            elapsed = time.time() - t_start
            print("%5d, %0.6f (elapsed: %f s)" % (iteration, error, elapsed))

        iteration += 1

    runtime = time.time() - t_start
    print(" total: %f s" % runtime)


def iter_is_reported(iteration):
    """Return True for every 100th iteration (0, 100, 200, ...)."""
    return iteration % 100 == 0
def main():
    """Run the Jacobi relaxation kernel with ``tpb`` x ``tpb`` blocks.

    Same driver loop as a plain Jacobi solver: iterate until the maximum
    absolute value in the error grid is at most ``tol`` or ``iter_max``
    iterations have run.  Here the error grid has one entry per block
    (shape ``griddim``).  Progress is printed every 100 iterations.
    """
    NN = 4096
    NM = 4096

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM
    iter_max = 1000

    tol = 1.0e-6
    error = 1.0

    # First column of both grids is fixed at 1.0.
    for mesh in (A, Anew):
        mesh[:n, 0] = 1.0

    print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

    started_at = time.time()
    step_count = 0

    blockdim = (tpb, tpb)
    blocks_x = NN/blockdim[0]
    blocks_y = NM/blockdim[1]
    griddim = (blocks_x, blocks_y)

    # One error value per block.
    error_grid = np.zeros(griddim)

    stream = cuda.stream()

    dA = cuda.to_device(A, stream)          # to device and stays there
    dAnew = cuda.to_device(Anew, stream)    # to device and stays there
    derror_grid = cuda.to_device(error_grid, stream)

    while error > tol and step_count < iter_max:
        assert error_grid.dtype == np.float64

        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

        derror_grid.to_host(stream)

        # After the sync, error_grid is available on the host.
        stream.synchronize()

        error = np.abs(error_grid).max()

        # Exchange input and output grids for the next sweep.
        previous = dA
        dA = dAnew
        dAnew = previous

        if step_count % 100 == 0:
            print("%5d, %0.6f (elapsed: %f s)" % (step_count, error, time.time()-started_at))

        step_count += 1

    runtime = time.time() - started_at
    print(" total: %f s" % runtime)
Beispiel #20
0
def test(n=256, k=30, batch=100):
    """Set up the matrices for a cuBLAS multiply comparison.

    The body visible here only performs the setup: it creates a cuBLAS
    handle, draws an (n x n) and an (n x k) standard-normal matrix as
    Fortran-ordered float32, and uploads both to the device.  ``batch`` is
    not used in this part.
    """
    b = numbapro.cudalib.cublas.Blas()

    # Draw in the same order as before: the square matrix first.
    square = np.random.randn(n, n)
    G = np.array(square, dtype=np.float32, order='F')
    tall = np.random.randn(n, k)
    G2 = np.array(tall, dtype=np.float32, order='F')

    d_G = cuda.to_device(G)
    d_G2 = cuda.to_device(G2)
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Fill ``paths`` column by column using two CUDA streams.

    The rows of ``paths`` are split into ``num_streams`` contiguous
    partitions.  Each partition has its own stream, random number generator,
    noise buffer and configured ``cu_step`` kernel, so the partitions can be
    processed concurrently.  Column 0 is the starting state; every later
    column is computed from the previous one and copied back in place.

    Args:
        paths: 2-D host array, updated in place.
        dt: time increment handed to the kernel.
        interest: used in ``c0 = interest - 0.5 * volatility ** 2``.
        volatility: used in ``c0`` and in ``c1 = volatility * sqrt(dt)``.
    """
    n = paths.shape[0]
    num_streams = 2

    # Split the rows into equally wide partitions (the last may be shorter).
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate one device-side noise array per partition.
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel once per stream.
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # Free each partition's OWN previous buffer on its own stream.  The
        # earlier code reused the `d_last` left over from the loop above, so
        # only the last partition's buffer was freed (once per stream) and
        # the others were never returned to the pool.
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Beispiel #22
0
def getIdx(
    fill_word, reduced_literal, reduced_length, head, cardinality
):  #step 5: get index by interleaving fill_word and literal(also remove all-zeros word)
    """Step 5: build the index, per-key offsets and per-key lengths.

    ``getIdx_gpu`` interleaves fill words and literals and flags the slots
    to keep, the flags are scanned, and ``scatter_index`` writes the kept
    words to their scanned positions.  Intermediate values are printed for
    debugging.

    Args:
        fill_word: host array of fill words (uploaded to the device).
        reduced_literal: sequence of literal words (uploaded to the device).
        reduced_length: number of reduced entries; 2 * reduced_length slots
            are interleaved.
        head: per-entry truth values; an offset is recorded for each entry
            whose ``head[i]`` is true.
        cardinality: number of keys; ``head`` must mark at least this many
            entries, otherwise indexing ``offset`` raises IndexError.

    Returns:
        tuple: ``(out_index, offset array, key_length array)``.
    """
    slots = 2 * reduced_length

    # NOTE(review): len(bin(x)) also counts the '0b' prefix, so bin_length is
    # two larger than the bit width -- confirm before tightening.
    bin_length = max(len(bin(slots - 1)), len(bin(tpb - 1)))
    # The scan wants an array length that is a multiple of threads-per-block.
    thread_num = numpy.int64(math.pow(2, bin_length))
    compact_flag = numpy.ones(thread_num, dtype='int64')
    print(thread_num)
    print(reduced_length)
    index = numpy.ones(slots, dtype='uint32')
    d_index = cuda.to_device(index)
    d_fill_word = cuda.to_device(fill_word)
    d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
    d_compact_flag = cuda.to_device(compact_flag)

    block_num = reduced_length / tpb + 1

    getIdx_gpu[block_num, tpb](d_fill_word, d_reduced_literal, d_index,
                               d_compact_flag, reduced_length)
    # Keep the un-scanned flags on the host; scatter_index needs them.
    compact_flag = d_compact_flag.copy_to_host()
    print('compact:')
    print(compact_flag[0:28])

    useless_array = numpy.zeros(thread_num, dtype='int64')
    radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)

    # One host copy serves both the length lookup and the debug print.
    scanned = d_compact_flag.copy_to_host()
    out_index_length = scanned[slots - 1] + 1
    print(scanned[0:slots])
    print(out_index_length)
    out_index = numpy.zeros(out_index_length, dtype='uint32')

    new_block_num = slots / tpb + 1

    scatter_index[new_block_num, tpb](d_index, d_compact_flag, compact_flag,
                                      out_index, reduced_length)
    print(head[-100:-1])

    # Copy the scanned flags back ONCE after the scatter kernel instead of
    # transferring the whole array again for every head element.
    scanned = d_compact_flag.copy_to_host()
    offset = [scanned[2 * i] for i in xrange(reduced_length) if head[i]]

    key_length = numpy.zeros(cardinality, dtype='int64')

    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print("cardinality:%d" % cardinality)
    print("len(off_set)%d" % len(offset))
    for i in xrange(cardinality - 1):
        key_length[i] = offset[i + 1] - offset[i]
    key_length[cardinality - 1] = out_index_length - offset[cardinality - 1]
    print(key_length)

    return out_index, numpy.array(offset), numpy.array(key_length)
Beispiel #23
0
def get_indexList(path, attr_selected):
    """Look up the selected attribute values and compute the index list on GPU.

    Args:
        path: passed to ``bitmap_pickle.get_pic_path`` to locate the two
            pickle files and the number of attributes.
        attr_selected: per attribute, an iterable of selected values; the
            string ``'All'`` is mapped to -1.

    Returns:
        tuple: ``(index_list, elapsed_seconds)``.  ``index_list`` is the
        int32 host array copied back from the GPU, or an empty int32 array
        when the input did not match the number of attributes.
    """
    path1, path2, attr_num = bitmap_pickle.get_pic_path(path)

    # NOTE(security): pickle.load can execute arbitrary code; these files
    # must come from a trusted source.
    with open(path1, 'rb') as f1:  # read data_map.pkl
        attr_map = pickle.load(f1)
        # Unused here, but must be read to reach the next pickled object.
        _attr_list = pickle.load(f1)
        attr_total = pickle.load(f1)

    with open(path2, 'rb') as f2:  # read bitmap_pic.pkl
        lists = pickle.load(f2)
        key = pickle.load(f2)
        offset = pickle.load(f2)

    # attr_input is a list that stores the numbers of input attributes
    # attr_num is the total number of attributes
    # attr_total is the total number of data/31
    # (This section used to sit inside the `finally:` of the file handling.)
    attr_input = [[] for i in xrange(attr_num)]
    for i in xrange(attr_num):
        for attri in attr_selected[i]:
            if attri in attr_map[i]:
                attr_input[i].append(attr_map[i][attri])
            elif attri == 'All':
                attr_input[i].append(-1)
        # An explicit selection overrides 'All'.
        if len(attr_input[i]) > 1 and (-1 in attr_input[i]):
            attr_input[i].remove(-1)
    print(attr_input)

    search_start_time = time.time()
    # Bound up front so the "no eligible projects" path returns an empty
    # result instead of raising UnboundLocalError at the return statement.
    index_list = numpy.zeros(0, dtype='int32')
    if len(attr_input) != attr_num:  # there might be a wrong input in input_test.py
        print('No eligible projects')
    else:
        tpb = 1024
        blocknum = 1
        # attr_mul is the number of items each thread has to process
        attr_mul = (attr_total + (tpb * blocknum - 1)) / (tpb * blocknum)
        index_list = numpy.zeros(attr_total * 31, dtype='int32')
        bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key,
                               offset)
        stream = cuda.stream()
        d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream)
        d_index_list = cuda.to_device(numpy.array(index_list), stream)
        index_gpu[blocknum, tpb, stream](d_bitmap_list, d_index_list, attr_num,
                                         attr_total, attr_mul)
        # Wait for the kernel on `stream` before reading its output.
        stream.synchronize()
        index_list = d_index_list.copy_to_host()
    search_end_time = time.time()
    return index_list, search_end_time - search_start_time
Beispiel #24
0
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """Forward-euler integration using
    `NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_.

    A working :mod:`numbapro` installation is required. This kernel is
    typically slower than :func:`kern_MKL_sparse`, depending on the hardware.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: not used by this kernel
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        the numbapro CUDA libraries cannot be imported.
    """

    # Select the floating point type the arrays are converted to.
    precision_bits = config['CUDA_precision']
    if precision_bits == 32:
        calc_precision = np.float32
    elif precision_bits == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    #=======================================================================
    # Set up the GPU libraries and upload the data
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda, float32  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")

    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()

    def upload(host_array):
        # Convert to the selected precision and copy to the device.
        return cuda.to_device(host_array.astype(calc_precision), stream)

    cu_int_m = upload(int_m)
    cu_dec_m = upload(dec_m)
    cu_curr_phi = upload(phi)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    # NOTE(review): all scalars are cast with float32 even when the arrays
    # are float64 -- confirm this is intended in 64-bit mode.
    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # delta_phi = int_m^T . phi
        cubl.gemv(trans='T', m=m, n=n, alpha=float32(1.0), A=cu_int_m,
                  x=cu_curr_phi, beta=float32(0.0), y=cu_delta_phi)
        # delta_phi += rho_inv[step] * dec_m^T . phi
        decay_scale = float32(rho_inv[step])
        cubl.gemv(trans='T', m=m, n=n, alpha=decay_scale,
                  A=cu_dec_m, x=cu_curr_phi, beta=float32(1.0),
                  y=cu_delta_phi)
        # phi += dX[step] * delta_phi
        step_size = float32(dX[step])
        cubl.axpy(alpha=step_size, x=cu_delta_phi, y=cu_curr_phi)

    final_phi = cu_curr_phi.copy_to_host()
    return final_phi
def cuda_factor(number, primes):
    """Run the ``cu_fact`` kernel over ``primes`` and return its output buffer.

    Args:
        number: value forwarded unchanged to the kernel.
        primes: sequence converted with ``np.asarray`` and uploaded to the GPU.

    Returns:
        numpy.ndarray: host copy of the one-element buffer (seeded with 1)
        after the kernel launch.
    """
    # Kept from the original; the returned handle is not used below.
    device = cuda.get_current_device()

    # Fixed launch geometry: 334 blocks of 720 threads each.
    blocks_per_grid, threads_per_block = 334, 720

    result_dev = cuda.to_device(np.asarray([1]))
    primes_dev = cuda.to_device(np.asarray(primes))

    cu_fact[blocks_per_grid, threads_per_block](primes_dev, number, result_dev)
    return result_dev.copy_to_host()
def get_indexList(path, attr_selected):
    """Compute the index list for the selected attribute values on the GPU.

    Args:
        path: location passed to ``bitmap_pickle.get_pic_path`` to resolve the
            two pickle files (attribute map and bitmap data).
        attr_selected: one sequence of selected values per attribute; the
            string ``'All'`` is encoded as ``-1``.

    Returns:
        tuple: ``(index_list, elapsed_seconds)`` where ``index_list`` is the
        int32 array copied back from the device (empty when the input is
        rejected) and ``elapsed_seconds`` is the wall-clock search time.
    """
    path1, path2, attr_num = bitmap_pickle.get_pic_path(path)

    # NOTE(review): pickle.load can execute arbitrary code; these files must
    # come from a trusted source.
    with open(path1, 'rb') as f1:  # read data_map.pkl
        attr_map = pickle.load(f1)
        # Must be read to advance the stream even though it is unused here.
        attr_list = pickle.load(f1)
        attr_total = pickle.load(f1)

    with open(path2, 'rb') as f2:  # read bitmap_pic.pkl
        lists = pickle.load(f2)
        key = pickle.load(f2)
        offset = pickle.load(f2)

    # attr_input stores, per attribute, the encoded numbers of the selected
    # values; attr_num is the total number of attributes.
    attr_input = [[] for _ in range(attr_num)]
    for i in range(attr_num):
        for attri in attr_selected[i]:
            if attri in attr_map[i]:
                attr_input[i].append(attr_map[i][attri])
            elif attri == 'All':
                attr_input[i].append(-1)
        # 'All' (-1) is redundant once explicit values are present.
        if len(attr_input[i]) > 1 and (-1 in attr_input[i]):
            attr_input[i].remove(-1)
    print(attr_input)

    search_start_time = time.time()
    # Defined up front so the rejection branch returns an empty result
    # instead of failing on an unbound name.
    index_list = numpy.zeros(0, dtype='int32')
    if len(attr_input) != attr_num:  # there might be a wrong input in input_test.py
        print('No eligible projects')
    else:
        tpb = 1024
        blocknum = 1
        # Number of items each thread has to process (ceil division).
        attr_mul = (attr_total + (tpb * blocknum - 1)) // (tpb * blocknum)
        index_list = numpy.zeros(attr_total * 31, dtype='int32')
        bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key, offset)
        stream = cuda.stream()
        d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream)
        d_index_list = cuda.to_device(index_list, stream)
        index_gpu[blocknum, tpb, stream](d_bitmap_list, d_index_list, attr_num, attr_total, attr_mul)
        index_list = d_index_list.copy_to_host()
        stream.synchronize()
    search_end_time = time.time()
    return index_list, search_end_time - search_start_time
Beispiel #27
0
    def flush(self, metric_opt, supp_opt):
        """Process every buffered ``Vcs`` entry and update the running optimum.

        Column 0 of each buffered array is packed into one flat buffer, sorted
        per segment on the GPU, and the last ``k`` indices of each segment are
        scored with ``np.linalg.norm(V[idx, :]) ** 2``.

        Args:
            metric_opt: best metric found so far.
            supp_opt: index set that produced ``metric_opt``.

        Returns:
            tuple: the possibly updated ``(metric_opt, supp_opt)``.
        """
        if not self.Vcs:
            # Nothing buffered, so the optimum is unchanged.
            return metric_opt, supp_opt

        k, V = self.k, self.V
        nodect = V.shape[0]
        numseg = len(self.Vcs)
        assert nodect
        assert numseg

        # Lay the segments out back to back in a single float32 buffer.
        flat = np.zeros(nodect * numseg, dtype=np.float32)
        for seg, Vc in enumerate(self.Vcs):
            begin = seg * nodect
            flat[begin:begin + nodect] = Vc[:, 0]

        d_flat = cuda.to_device(flat)
        d_idx = cuda.device_array((numseg, nodect), dtype=np.uint32)

        blksz = 32
        grid = (divup(d_idx.shape[0], blksz), divup(d_idx.shape[1], blksz))
        init_indices[grid, (blksz, blksz)](d_idx)

        # Segment boundaries; a single segment uses the one-element array [0].
        if numseg == 1:
            segments = np.arange(1, dtype=np.int32)
        else:
            segments = (np.arange(numseg - 1, dtype=np.int32) + 1) * nodect

        segmented_sort(d_flat, d_idx, cuda.to_device(segments))

        # Last k entries of each row of the index array.
        topk_list = [d_idx[i, -k:].copy_to_host() for i in range(numseg)]

        # Reduce: keep the candidate with the largest metric.
        for candidate in topk_list:
            metric = np.linalg.norm(V[candidate, :]) ** 2
            if metric > metric_opt:
                metric_opt, supp_opt = metric, candidate

        self.Vcs.clear()
        return metric_opt, supp_opt
Beispiel #28
0
def main():
    """Benchmark the NumPy, Numba and two CUDA induced-velocity versions.

    Uses the module-level ``n``, ``dtype`` and ``blksize`` settings. Every
    implementation runs on the same random input; for each one the elapsed
    time, the maximum absolute difference from the first result and the
    speedup relative to it are printed.
    """
    vort = np.array(np.random.rand(2 * n), dtype=dtype).reshape((n, 2))
    gamma = np.array(np.random.rand(n), dtype=dtype)

    # Reference run; its output is the baseline for all comparisons.
    vel = np.zeros_like(vort)
    start = timer()
    induced_velocity(vort, vort, gamma, vel)
    numpy_time = timer() - start
    print("n = %d" % n)
    print("Numpy".center(40, "="))
    print("Time: %f seconds" % numpy_time)

    vel2 = np.zeros_like(vort)
    start = timer()
    induced_velocity2(vort, vort, gamma, vel2)
    numba_time = timer() - start
    print("Numba".center(40, "="))
    print("Time: %f seconds" % numba_time)
    error = np.max(np.max(np.abs(vel2 - vel)))
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / numba_time))

    # Upload the inputs once; both GPU kernels reuse them.
    stream = cuda.stream()
    d_vort = cuda.to_device(vort, stream)
    d_gamma = cuda.to_device(gamma, stream)
    vel3 = np.zeros_like(vort)
    d_vel = cuda.to_device(vel3, stream)
    griddim = (n - 1) // blksize + 1  # ceil(n / blksize)
    start = timer()
    induced_velocity3[griddim, blksize, stream](d_vort, d_vort, d_gamma, d_vel)
    d_vel.to_host(stream)
    # The launch and the copy-back are queued on `stream`; wait for them so
    # the timer covers the real work and vel3 is complete before it is read.
    stream.synchronize()
    gpu_time = timer() - start
    error = np.max(np.max(np.abs(vel3 - vel)))
    print("GPU".center(40, "="))
    print("Time: %f seconds" % gpu_time)
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / gpu_time))

    vel4 = np.zeros_like(vort)
    d_vel2 = cuda.to_device(vel4, stream)
    start = timer()
    induced_velocity4[griddim, blksize, stream](d_vort, d_vort, d_gamma,
                                                d_vel2)
    d_vel2.to_host(stream)
    # Same reasoning as above: finish the queued work before measuring.
    stream.synchronize()
    gpu2_time = timer() - start
    error = np.max(np.max(np.abs(vel4 - vel)))
    print("GPU smem".center(40, "="))
    print("Time: %f seconds" % gpu2_time)
    print("Difference: %f" % error)
    print("Speedup: %f" % (numpy_time / gpu2_time))
Beispiel #29
0
def make_fp_tree():
    #### Allocate host memory
    offsets, transactions, num_transactions, all_items_in_transactions = readFile(
        "data.txt")
    print num_transactions, all_items_in_transactions

    flist = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.uint32)

    #### Allocate and initialize GPU/Device memory
    d_offsets = cuda.to_device(offsets)
    d_transactions = cuda.to_device(transactions)
    d_flist = cuda.to_device(flist)
    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(num_transactions / (1.0 * threads_per_block[0])) +
                        1, 1)

    t1 = time()
    makeFlistGPU[number_of_blocks,
                 threads_per_block](d_offsets, d_transactions, d_flist,
                                    num_transactions,
                                    all_items_in_transactions)
    cuda.synchronize()
    t2 = time()

    d_flist.copy_to_host(flist)
    cuda.synchronize()
    #
    # for i in range(0, MAX_UNIQUE_ITEMS):
    #     print i, flist[i]

    t3 = time()
    flist_cpu = makeFlist(transactions, all_items_in_transactions)
    t4 = time()
    #

    match = 1
    for i in range(1, MAX_UNIQUE_ITEMS):
        if i not in flist_cpu and flist[i] == 0:
            continue
        #print i, flist[i], flist_cpu[i]
        if flist[i] != flist_cpu[i]:
            match = -1
            break
    if match == 1:
        print "Test Passed"
    else:
        print "Test Failed"

    print "Number of transactions = ", num_transactions
    print "All items in transactions = ", all_items_in_transactions
    print "GPU time = ", t2 - t1
    print "CPU TIME = ", t4 - t3
Beispiel #30
0
def encode(word,gpu=True):
    """Turn a word into its vector form; pass non-strings through unchanged.

    With embeddings enabled the word's ``vocab`` entry is uploaded to the
    GPU. Otherwise a ``(1, word_idx)`` float32 one-hot row is built and
    returned either as a device array (``gpu == True``) or as a host array.
    """
    if not isinstance(word, basestring):
        return word

    if using_embeddings == True:
        # `gpu` is not consulted on this path.
        return cuda.to_device(vocab[word])

    one_hot = np.zeros((1, word_idx), dtype='float32')
    one_hot[0][vocab[word]] = 1.
    if gpu == True:
        return cuda.to_device(one_hot)
    return one_hot
Beispiel #31
0
def main():
    """Time-step the flow fields on a 128 x 128 grid and report the run time.

    Each step calls ``ppe`` on the host arrays, launches the ``CudaU`` kernel
    on a stream, copies ``d_U`` and ``d_V`` back and synchronizes.
    """
    flowtime = 0.1
    nx = ny = 128
    dx = 2.0 / (nx - 1)
    dy = 2.0 / (ny - 1)

    # Time step tied to the mesh spacing (chosen for stability).
    dt = dx / 50

    rho = 1.0
    nu = .1

    # Number of steps needed to cover the requested flow time.
    nt = int(flowtime / dt)

    field_shape = (nx, ny)
    U = numpy.zeros(field_shape, dtype=numpy.float32)
    U[-1, :] = 1
    V = numpy.zeros(field_shape, dtype=numpy.float32)
    P = numpy.zeros((ny, nx), dtype=numpy.float32)
    UN = numpy.zeros(field_shape, dtype=numpy.float32)
    VN = numpy.zeros(field_shape, dtype=numpy.float32)

    griddim = nx, ny
    # NOTE(review): 768 * 768 threads per block looks far above the
    # per-block thread limit of CUDA devices; confirm this launch shape.
    blockdim = 768, 768, 1

    t1 = time.time()
    # Upload the fields once; all transfers and launches share one stream.
    stream = cuda.stream()
    d_U = cuda.to_device(U, stream)
    d_V = cuda.to_device(V, stream)
    d_UN = cuda.to_device(UN, stream)
    d_VN = cuda.to_device(VN, stream)

    for i in range(nt):
        P = ppe(rho, dt, dx, dy, U, V, P)
        launch = CudaU[griddim, blockdim, stream]
        launch(d_U, d_V, P, d_UN, d_VN, dx, dy, dt, rho, nu)
        d_U.to_host(stream)
        d_V.to_host(stream)
        stream.synchronize()

    t2 = time.time()

    print("Completed grid of %d by %d in %.6f seconds" % (nx, ny, t2 - t1))
    x = numpy.linspace(0, 2, nx)
    y = numpy.linspace(0, 2, ny)
    Y, X = numpy.meshgrid(y, x)
def produce_fill(reduced_input_data, reduced_chunk_id, reduced_length):
    """Step 4: run the head and fill kernels and return the chunk ids as int32.

    Args:
        reduced_input_data: host array uploaded for ``produce_head``.
        reduced_chunk_id: host array uploaded for ``produce_fill_gpu`` and
            overwritten by the copy back from the device.
        reduced_length: length of the ``head`` buffer, also passed to both
            kernels.

    Returns:
        numpy.ndarray: ``reduced_chunk_id`` converted to int32.
    """
    head_host = numpy.ones(reduced_length, dtype='int32')
    stream = cuda.stream()
    head_dev = cuda.to_device(head_host, stream)
    input_dev = cuda.to_device(reduced_input_data, stream)

    # Produce the head flags.
    produce_head[1, tpb](input_dev, head_dev, reduced_length)
    head_dev.to_host(stream)
    stream.synchronize()

    chunk_dev = cuda.to_device(reduced_chunk_id, stream)
    # NOTE(review): the host array is passed alongside its device copy, and
    # both kernels are launched without the stream; confirm this is intended.
    produce_fill_gpu[1, tpb](head_dev, chunk_dev, reduced_chunk_id, reduced_length)
    chunk_dev.to_host(stream)
    stream.synchronize()

    # Convert to int32 because a fill word can describe 0~(2^31-1).
    return numpy.array(reduced_chunk_id, dtype='int32')
Beispiel #33
0
def convolve():
    """Convolve the test image with a 5x5 filter on the CPU and on the GPU.

    The CPU path uses ``fftconvolve``; the GPU path uploads complex64 copies
    of the image and the zero-padded filter and hands them to ``task1``.
    Both elapsed times are printed.

    Returns:
        tuple: ``(cvimage_cpu, cvimage_gpu)``.
    """
    # 5x5 filter coefficients, parsed from whitespace-separated text.
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()
    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    image = get_image()
    print("Image size: %s" % (image.shape,))

    # Filter embedded in the top-left corner of an image-sized zero array.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU: SciPy FFT convolution.
    cpu_start = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    cpu_stop = timer()
    print('CPU: %.2fs' % (cpu_stop - cpu_start))

    # GPU launch configuration (only reported here).
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Create a plan up front so cuFFT is initialized before timing starts.
    cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    gpu_start = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    d_image_complex = cuda.to_device(image_complex)
    d_response_complex = cuda.to_device(response_complex)

    task1(cufft, d_image_complex, d_response_complex)

    # Copy back, keep the real part, and scale by the element count.
    gpu_result = d_image_complex.copy_to_host()
    cvimage_gpu = gpu_result.real / np.prod(image.shape)

    gpu_stop = timer()
    print('GPU: %.2fs' % (gpu_stop - gpu_start))

    return cvimage_cpu, cvimage_gpu
Beispiel #34
0
def convolve():
    """Run the same 5x5 filter convolution on CPU (SciPy) and GPU (cuFFT).

    Prints the image size, the kernel configuration and both timings.

    Returns:
        tuple: ``(cvimage_cpu, cvimage_gpu)``.
    """
    # Filter coefficients as whitespace-separated text.
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()
    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    image = get_image()
    print("Image size: %s" % (image.shape, ))

    # Image-sized zero array holding the filter in its top-left 5x5 corner.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU reference via SciPy.
    t_cpu0 = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    t_cpu1 = timer()
    print('CPU: %.2fs' % (t_cpu1 - t_cpu0))

    # Launch geometry, reported for information.
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Build a plan before the timed section to initialize cuFFT.
    cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    t_gpu0 = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    d_image_complex = cuda.to_device(image_complex)
    d_response_complex = cuda.to_device(response_complex)

    task1(cufft, d_image_complex, d_response_complex)

    # Real part of the device result, divided by the number of elements.
    host_result = d_image_complex.copy_to_host()
    cvimage_gpu = host_result.real / np.prod(image.shape)

    t_gpu1 = timer()
    print('GPU: %.2fs' % (t_gpu1 - t_gpu0))

    return cvimage_cpu, cvimage_gpu
Beispiel #35
0
def main():
    """Advance the flow fields on a 128 x 128 grid and print the elapsed time.

    Every iteration calls ``ppe`` on the host arrays, launches ``CudaU`` on a
    stream, copies ``d_U`` / ``d_V`` back to the host and synchronizes.
    """
    flowtime = 0.1
    nx = ny = 128
    dx = 2.0 / (nx - 1)
    dy = 2.0 / (ny - 1)

    # Time step derived from the mesh spacing (chosen for stability).
    dt = dx / 50

    rho = 1.0
    nu = .1

    # Steps required to reach the requested total flow time.
    nt = int(flowtime / dt)

    grid_shape = (nx, ny)
    U = numpy.zeros(grid_shape, dtype=numpy.float32)
    U[-1, :] = 1
    V = numpy.zeros(grid_shape, dtype=numpy.float32)
    P = numpy.zeros((ny, nx), dtype=numpy.float32)
    UN = numpy.zeros(grid_shape, dtype=numpy.float32)
    VN = numpy.zeros(grid_shape, dtype=numpy.float32)

    griddim = nx, ny
    # NOTE(review): 768 * 768 threads per block looks far above the
    # per-block thread limit of CUDA devices; confirm this launch shape.
    blockdim = 768, 768, 1

    t1 = time.time()
    # One stream carries every transfer and kernel launch.
    stream = cuda.stream()
    d_U = cuda.to_device(U, stream)
    d_V = cuda.to_device(V, stream)
    d_UN = cuda.to_device(UN, stream)
    d_VN = cuda.to_device(VN, stream)

    for i in range(nt):
        P = ppe(rho, dt, dx, dy, U, V, P)
        step_kernel = CudaU[griddim, blockdim, stream]
        step_kernel(d_U, d_V, P, d_UN, d_VN, dx, dy, dt, rho, nu)
        d_U.to_host(stream)
        d_V.to_host(stream)
        stream.synchronize()

    t2 = time.time()

    print("Completed grid of %d by %d in %.6f seconds" % (nx, ny, t2 - t1))
    x = numpy.linspace(0, 2, nx)
    y = numpy.linspace(0, 2, ny)
    Y, X = numpy.meshgrid(y, x)
    def symbolise(self, last_result=None):
        """Fit the factors with differential evolution using the CUDA evaluator.

        Builds the parameter bounds, allocates the working buffers, configures
        the ``compute_sample_gpu`` kernel and runs
        ``scipy.optimize.differential_evolution`` against
        ``self.evaluation_function_cuda``. The best solution is evaluated once
        more and the optimizer result is printed.

        Args:
            last_result: not used by this method.
        """
        self.show_visualisation()
        bounds = np.empty(shape=(self.height * self.depth, 2), dtype=np.float64)
        # NOTE(review): rows are filled in groups of three per k, so every
        # row of `bounds` is initialised only when self.depth == 3; confirm.
        for k in range(self.height):
            bounds[k * 3] = [0.001, 1]  # wavelength as a fraction of the window width
            bounds[k * 3 + 1] = [-1, 2]  # offset where window is 0,1
            bounds[k * 3 + 2] = [-1, 1]  # amplitude. Can be negative

        self._working_results = np.zeros(shape=(self.width, self.height), dtype=np.float64)
        # cuda init
        self._factors = np.zeros(shape=(self.height * self.depth,), dtype=np.float64)
        self._d_working_results = cuda.to_device(self._working_results)
        # Floor division keeps the grid dimension an integer under true
        # division as well (identical result for integer widths).
        grid_dim = self.width // 128, self.height - 1
        block_dim = 128, 1, 1

        self.compute_samples_configured = compute_sample_gpu.configure(grid_dim, block_dim)

        # Host buffers stay pinned for the duration of the optimization.
        with cuda.pinned(self._factors, self._working_results):
            output = scipy.optimize.differential_evolution(self.evaluation_function_cuda, bounds=bounds,
                                                           strategy='best2bin', maxiter=20, recombination=0.9,
                                                           mutation=(0.0001, 0.3), tol=0.01, init='latinhypercube',
                                                           popsize=15, disp=True, callback=self.step_callback)

        # Re-evaluate the best solution found by the optimizer.
        self.evaluation_function_cuda(output.x)
        print(output)
Beispiel #37
0
def _check_array(a):
    """Check that an array is usable on the GPU and convert it to float32.

    Parameters
    ----------
    a : array-like
        Array to prepare.

    Returns
    -------
    tuple
        ``(a, out_dtype)``. A ``numpy.ndarray`` input is converted to
        Fortran-ordered float32 and copied to the device. A
        ``DeviceNDArray`` is passed through unchanged. Any other array-like
        is converted to a Fortran-ordered float32 ``numpy.ndarray`` that
        stays on the host.

    Raises
    ------
    ValueError
        If a non-ndarray input converts to a dtype other than int, float32
        or float64.
    NotImplementedError
        If the resulting array is not float32.
    """
    # Builtin ``int`` is the object the removed ``np.int`` alias pointed to,
    # so the accepted dtypes are unchanged.
    ok_dtypes = [int, np.float32, np.float64]
    if isinstance(a, np.ndarray):
        a = cuda.to_device(np.array(a, dtype=np.float32, order='F'))
    elif isinstance(a, cuda.cudadrv.devicearray.DeviceNDArray):
        pass
    else:
        a = np.array(a)
        if a.dtype not in ok_dtypes:
            raise ValueError('input of type '+str(a.dtype)+
                             ' is not supported')
        else:
            a = np.array(a,dtype=np.float32, order='F')
    if a.dtype == np.float32:
        out_dtype = a.dtype
    else:
        # Only reachable for device arrays that are not float32.
        raise NotImplementedError('only float32 arrays are supported, got '
                                  + str(a.dtype))
    return (a, out_dtype)
Beispiel #38
0
    def test_relu_m_v(self):
        """relu on matrices with newaxis vectors for thresholds."""
        # Case 1: per-column thresholds, default replacement value.
        mat = self.rng.rand(129, 1025).astype(np.float32)
        thresh = self.rng.rand(1025).astype(np.float32)
        row_thresh = thresh[np.newaxis,:]
        expected = copy.deepcopy(mat)
        expected[expected<row_thresh] = 0.
        actual = self.gp.relu(mat, thresh=row_thresh).copy_to_host()
        assert(np.allclose(expected, actual))

        # Case 2: per-row thresholds with an explicit replacement value.
        mat = self.rng.rand(129, 1025).astype(np.float32)
        thresh = self.rng.rand(129).astype(np.float32)
        val = .5
        col_thresh = thresh[:,np.newaxis]
        expected = copy.deepcopy(mat)
        expected[expected<col_thresh] = val
        actual = self.gp.relu(mat, thresh=col_thresh, set_val=val).copy_to_host()
        assert(np.allclose(expected, actual))

        # Case 3: same as case 2, writing into a caller-supplied device array.
        mat = self.rng.rand(129, 1025).astype(np.float32)
        thresh = self.rng.rand(129).astype(np.float32)
        val = .5
        col_thresh = thresh[:,np.newaxis]
        expected = copy.deepcopy(mat)
        expected[expected<col_thresh] = val
        out_dev = cuda.to_device(mat)
        self.gp.relu(mat, thresh=col_thresh, set_val=val, out=out_dev)
        assert(np.allclose(expected, out_dev.copy_to_host()))
Beispiel #39
0
    def scal(self, a, alpha):
        """Scale a 1D or 2D array by alpha.

        Parameters
        ----------
        a : array-like
            Array to scale; it is first passed through ``_check_array``.
        alpha : float
            Scaling factor.

        Returns
        -------
        The scaled array. For 2D input this is the result of reshaping the
        scaled flat view back to the original shape and strides.

        Raises
        ------
        NotImplementedError
            If the array is neither 1D nor 2D.
        """
        a, out_dtype = _check_array(a)

        ndim = a.ndim
        if ndim not in (1, 2):
            raise NotImplementedError

        if ndim == 1:
            # Host arrays are uploaded before the BLAS call.
            if type(a) == np.ndarray:
                a = cuda.to_device(a)
            self.blas.scal(alpha, a)
            return a

        # 2D: scale through a flat view, then restore the original layout.
        shape, strides, dtype = a.shape, a.strides, a.dtype
        flat = _cu_reshape(a, (np.prod(shape),), (strides[0],), dtype)
        self.blas.scal(alpha, flat)
        return _cu_reshape(flat, shape, strides, dtype)
Beispiel #40
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Run the CUDA Monte Carlo pricing kernel and update ``paths`` in place.

    Args:
        paths: host array; its first dimension gives the number of paths and
            the device results are copied back into it.
        dt: time step, passed to the kernel and used for ``c1``.
        interest: interest rate used for the ``c0`` constant.
        volatility: volatility used for ``c0`` and ``c1``.
    """
    # Constants handed to the kernel.
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))  # enough blocks to cover n

    # All generators, buffers and the launch share one stream.
    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    normals_dev = cuda.device_array(n, dtype=np.double, stream=stream)
    seeds_dev = cuda.device_array(n, dtype=np.uint32, stream=stream)

    prng.normal(normals_dev, 0, 1)
    qrng.generate(seeds_dev)

    paths_dev = cuda.to_device(paths, stream=stream)

    launch = cu_monte_carlo_pricer[(gridsz, 1), (blksz, 1, 1), stream]
    launch(paths_dev, dt, c0, c1, normals_dev, seeds_dev)

    paths_dev.to_host(stream)

    stream.synchronize()
Beispiel #41
0
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Launch ``cu_monte_carlo_pricer`` and copy the result back into ``paths``.

    Args:
        paths: host array whose first dimension is the number of paths; it
            receives the device results.
        dt: time step, passed to the kernel and used for ``c1``.
        interest: interest rate used for the ``c0`` constant.
        volatility: volatility used for ``c0`` and ``c1``.
    """
    # Kernel constants.
    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))  # enough blocks to cover n

    # One stream for the generators, the buffers and the launch.
    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    d_normals = cuda.device_array(n, dtype=np.double, stream=stream)
    d_seeds = cuda.device_array(n, dtype=np.uint32, stream=stream)

    prng.normal(d_normals, 0, 1)
    qrng.generate(d_seeds)

    d_paths = cuda.to_device(paths, stream=stream)

    configured = cu_monte_carlo_pricer[(gridsz, 1), (blksz, 1, 1), stream]
    configured(d_paths, dt, c0, c1, d_normals, d_seeds)

    d_paths.to_host(stream)

    stream.synchronize()
def make_fp_tree():
    #### Allocate host memory
    offsets, transactions, num_transactions, all_items_in_transactions = readFile("data.txt")
    print num_transactions, all_items_in_transactions

    flist = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.uint32)

    #### Allocate and initialize GPU/Device memory
    d_offsets = cuda.to_device(offsets)
    d_transactions = cuda.to_device(transactions)
    d_flist = cuda.to_device(flist)
    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)

    t1 = time()
    makeFlistGPU [number_of_blocks, threads_per_block] (d_offsets, d_transactions, d_flist, num_transactions, all_items_in_transactions)
    cuda.synchronize()
    t2 = time()

    d_flist.copy_to_host(flist)
    cuda.synchronize()
    #
    # for i in range(0, MAX_UNIQUE_ITEMS):
    #     print i, flist[i]

    t3 = time()
    flist_cpu = makeFlist(transactions, all_items_in_transactions)
    t4 = time()
    #

    match = 1
    for i in range(1, MAX_UNIQUE_ITEMS):
        if i not in flist_cpu and flist[i] == 0:
            continue
        #print i, flist[i], flist_cpu[i]
        if flist[i] != flist_cpu[i]:
            match = -1
            break
    if match == 1:
        print "Test Passed"
    else:
        print "Test Failed"

    print "Number of transactions = ", num_transactions
    print "All items in transactions = ", all_items_in_transactions
    print "GPU time = ", t2 - t1
    print "CPU TIME = ", t4 - t3
Beispiel #43
0
def zeros(n,gpu=True):
    """Return a zero matrix of shape ``(n[0], n[1])``.

    When ``gpu`` is exactly ``True`` the matrix is float32 and is uploaded to
    the device; otherwise a host array with NumPy's default dtype is returned.
    """
    rows, cols = n[0], n[1]
    if gpu is True:
        return cuda.to_device(np.zeros((rows, cols), dtype='float32'))
    return np.zeros((rows, cols))
Beispiel #44
0
def produce_fill(reduced_input_data, reduced_chunk_id,
                 reduced_length):
    """Step 4: run the head and fill kernels; return the chunk ids as int32.

    Args:
        reduced_input_data: host array uploaded for ``produce_head``.
        reduced_chunk_id: host array uploaded for ``produce_fill_gpu`` and
            overwritten by the copy back from the device.
        reduced_length: length of the ``head`` buffer, also passed to both
            kernels.

    Returns:
        numpy.ndarray: ``reduced_chunk_id`` converted to int32.
    """
    head_host = numpy.ones(reduced_length, dtype='int32')
    stream = cuda.stream()
    d_head_flags = cuda.to_device(head_host, stream)
    d_input = cuda.to_device(reduced_input_data, stream)

    # Produce the head flags.
    produce_head[1, tpb](d_input, d_head_flags, reduced_length)
    d_head_flags.to_host(stream)
    stream.synchronize()

    d_chunk_id = cuda.to_device(reduced_chunk_id, stream)
    # NOTE(review): the host array is passed alongside its device copy, and
    # both kernels are launched without the stream; confirm this is intended.
    produce_fill_gpu[1, tpb](d_head_flags, d_chunk_id, reduced_chunk_id,
                             reduced_length)
    d_chunk_id.to_host(stream)
    stream.synchronize()

    # Convert to int32 because a fill word can describe 0~(2^31-1).
    return numpy.array(reduced_chunk_id, dtype='int32')
Beispiel #45
0
def main():
    """Multiply two N x N float32 matrices on GPU and CPU and compare them.

    Relies on the module-level ``N``, ``matrixMulGPU`` and ``matrixMulCPU``.
    Prints the first mismatching element, or ``Success!`` when there is none.
    """
    # Host buffers.
    a = np.empty([N,N], dtype=np.float32)
    b = np.empty_like(a)
    c_cpu = np.empty_like(a)
    c_gpu = np.empty_like(a)

    # a[r, c] = r and b[r, c] = c + 2; both result buffers start at zero.
    for idx in range(N):
        a[idx, :] = idx
        b[:, idx] = idx + 2
    c_cpu[...] = 0
    c_gpu[...] = 0

    # Device buffers; c_gpu is uploaded too, so d_c starts out zeroed.
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c_gpu)

    threads_per_block = (16, 16) # a 16 x 16 block of threads
    # NOTE(review): relies on `/` being integer division for an integer N.
    number_of_blocks = tuple((N / t) + 1 for t in threads_per_block)

    matrixMulGPU[number_of_blocks, threads_per_block](d_a, d_b, d_c)

    d_c.copy_to_host(c_gpu)

    # CPU reference to check the GPU result against.
    matrixMulCPU(a, b, c_cpu)

    # Report only the first mismatch, scanning in row-major order.
    error = False
    for row in range(N):
        for col in range(N):
            if c_cpu[row,col] != c_gpu[row,col]:
                print("FOUND ERROR at c[" + str(row) + "," + str(col) + "]")
                error = True
                break
        if error:
            break

    if not error:
        print("Success!")
Beispiel #46
0
def infer(dictionary, coeffs, stimuli, eta, lamb, nIter, softThresh, adapt):
    """Run ``nIter`` inference iterations on the GPU.

    Args:
        dictionary: 2D array; ``shape[0]`` is the number of dictionary rows.
        coeffs: uploaded to the device but not otherwise used here.
        stimuli: 2D array of shape ``(numStim, dataLength)``.
        eta, lamb, softThresh, adapt: forwarded to the ``iter`` kernel.
        nIter: number of update iterations.

    Returns:
        tuple: ``(s, u, thresh)`` where ``s`` and ``u`` are host copies of the
        device buffers and ``thresh`` is the per-row mean of ``|b|``.
    """
    bs = cublas.Blas()

    numDict = dictionary.shape[0]
    numStim, dataLength = stimuli.shape[0], stimuli.shape[1]

    def _device_zeros(rows, cols):
        # Fortran-ordered float32 zeros, uploaded to the device.
        return cuda.to_device(
            np.zeros((rows, cols), dtype=np.float32, order='F'))

    d_u = _device_zeros(numStim, numDict)
    d_s = _device_zeros(numStim, numDict)
    d_b = _device_zeros(numStim, numDict)
    d_ci = _device_zeros(numStim, numDict)
    d_c = _device_zeros(numDict, numDict)

    # Inputs, converted to Fortran-ordered float32 before upload.
    d_dictionary = cuda.to_device(
        np.array(dictionary, dtype=np.float32, order='F'))
    d_coeffs = cuda.to_device(np.array(coeffs, dtype=np.float32, order='F'))
    d_stimuli = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F'))

    # Launch geometry; sizes are truncated to whole blocks of 32.
    blockdim2 = (32, 32)
    blockdim1 = 32
    griddimcsub = int(numDict / blockdim1)
    griddimi = (int(numStim / blockdim2[0]), int(numDict / blockdim2[1]))

    # c = dictionary * dictionary^T, then adjusted by the csub kernel.
    bs.gemm('N', 'T', numDict, numDict, dataLength, 1., d_dictionary,
            d_dictionary, 0., d_c)
    csub[griddimcsub, blockdim1](d_c)

    # b = stimuli * dictionary^T.
    bs.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stimuli,
            d_dictionary, 0., d_b)
    thresh = np.mean(np.absolute(d_b.copy_to_host()), axis=1)
    d_thresh = cuda.to_device(thresh)

    # Each step: ci = s * c, then the iter kernel updates u and s.
    for _ in xrange(nIter):
        bs.gemm('N', 'N', numStim, numDict, numDict, 1., d_s, d_c, 0., d_ci)
        iter[griddimi, blockdim2](d_c, d_b, d_ci, d_u, d_s, eta, d_thresh,
                                  lamb, adapt, softThresh)

    u = d_u.copy_to_host()
    s = d_s.copy_to_host()
    return (s, u, thresh)
def main():
    """Run the N x N float32 matrix product on GPU and CPU and compare.

    Relies on the module-level ``N``, ``matrixMulGPU`` and ``matrixMulCPU``.
    Prints the first mismatching element, or ``Success!`` when there is none.
    """
    # Host buffers.
    a = np.empty([N,N], dtype=np.float32)
    b = np.empty_like(a)
    c_cpu = np.empty_like(a)
    c_gpu = np.empty_like(a)

    # a[r, c] = r and b[r, c] = c + 2; both result buffers start at zero.
    for k in range(N):
        a[k, :] = k
        b[:, k] = k + 2
    c_cpu[...] = 0
    c_gpu[...] = 0

    # Device buffers; c_gpu is uploaded too, so d_c starts out zeroed.
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c_gpu)

    threads_per_block = (16, 16) # a 16 x 16 block of threads
    # NOTE(review): relies on `/` being integer division for an integer N.
    number_of_blocks = tuple((N / t) + 1 for t in threads_per_block)

    matrixMulGPU[number_of_blocks, threads_per_block](d_a, d_b, d_c)

    d_c.copy_to_host(c_gpu)

    # CPU reference to check the GPU result against.
    matrixMulCPU(a, b, c_cpu)

    # Report only the first mismatch, scanning in row-major order.
    error = False
    for row in range(N):
        for col in range(N):
            if c_cpu[row,col] != c_gpu[row,col]:
                print("FOUND ERROR at c[" + str(row) + "," + str(col) + "]")
                error = True
                break
        if error:
            break

    if not error:
        print("Success!")
Beispiel #48
0
def cuda_factor(number, primes):
    device = cuda.get_current_device()
    ffactor = np.asarray([0] * len(primes))
    dfact = cuda.to_device(ffactor)
    d_primes = cuda.to_device(np.asarray(primes))

    tpb = 720
    bpg = 334
    start = timer()
    cu_fact[bpg, tpb](d_primes, number, dfact)
    total = timer() - start
    print "Time taken : ", total
    c = dfact.copy_to_host()
    k = []
    for d in c:
        if int(d) != 0:
            k.append(int(d))
    return k
Beispiel #49
0
    def compute_block(self):
        """Run ``cu_one_block`` for this block and return the host results.

        Returns:
            A ``(self.size, self.size)`` array filled by copying the device
            results back on the stream.
        """
        # NOTE(review): the uniforms are sized by the module-level N while
        # everything else uses self.size; confirm they agree.
        uniforms_dev = curand.uniform(size=N * N, device=True)
        host_results = zeros((self.size, self.size))

        # Transfers, launch and copy-back all share one stream.
        stream = cuda.stream()
        proposals_dev = cuda.to_device(self.host_proposals, stream=stream)
        omegas_dev = cuda.to_device(self.host_omegas, stream=stream)
        results_dev = cuda.device_array_like(host_results, stream=stream)

        kernel = cu_one_block[self.grid_dim, self.threads_per_block, stream]
        kernel(self.start, proposals_dev, omegas_dev, uniforms_dev,
               results_dev, self.size, self.size)

        results_dev.copy_to_host(host_results, stream=stream)
        stream.synchronize()

        return host_results
Beispiel #50
0
    def backward(dY, cache):
        """Backpropagate a batch through the generator and the image encoder.

        Reads ``Xe``, ``generator_str``, ``Ws_shape``, ``gen_caches`` and ``F``
        from ``cache``, calls ``Generator.backward`` once per batch item, and
        merges the results with ``accumNpDicts``. Timing information is
        printed along the way.

        Args:
            dY: indexable by batch position; ``dY[i]`` is passed to the
                generator's backward for item ``i``.
            cache: dict holding the entries listed above.

        Returns:
            dict: ``grads``, holding the merged per-item gradients plus the
            ``'We'``, ``'be'`` and ``'Ws'`` entries added at the end.
        """
        Xe = cache['Xe']
        generator_str = cache['generator_str']
        dWs = np.zeros(cache['Ws_shape'])
        gen_caches = cache['gen_caches']
        F = cache['F']
        dXe = np.zeros(Xe.shape)

        Generator = decodeGenerator(generator_str)
        # WLSTM is taken from the first item's cache and uploaded once
        # (Fortran order); the same device copy is reused for every item.
        dmmy, gen_cache = gen_caches[0]
        g_WLSTM = cuda.to_device(np.asfortranarray(gen_cache['WLSTM']))
        # backprop each item in the batch
        grads = {}
        dt1 = 0  # accumulated time spent in Generator.backward
        dt2 = 0  # accumulated time spent on the bookkeeping below
        t0 = time.time()
        for i in xrange(len(gen_caches)):
            t1 = time.time()
            ix, gen_cache = gen_caches[i]  # unpack
            local_grads = Generator.backward(dY[i], gen_cache, g_WLSTM)
            dt1 += time.time() - t1

            t2 = time.time()
            # dXs and dXi are removed from local_grads before it is merged,
            # so only the remaining entries reach accumNpDicts.
            dXs = local_grads['dXs']  # intercept the gradients wrt Xi and Xs
            del local_grads['dXs']
            dXi = local_grads['dXi']
            del local_grads['dXi']
            accumNpDicts(
                grads,
                local_grads)  # add up the gradients wrt model parameters
            # now backprop from dXs to the image vector and word vectors
            dXe[i, :] += dXi  # image vector
            for n, j in enumerate(ix):  # and now all the other words
                dWs[j, :] += dXs[n, :]

            dt2 += time.time() - t2

            #dt = time.time() - t0
            #print 'BP :%0.4f' %(dt)

        dt = time.time() - t0
        print 'Backward Pass:%0.4f Others :%0.4f' % (dt1, dt2)
        t0 = time.time()
        # finally backprop into the image encoder
        dWe = F.transpose().dot(dXe)
        dbe = np.sum(dXe, axis=0, keepdims=True)

        dt = time.time() - t0
        print 'MMult :%0.4f' % (dt)
        t0 = time.time()

        accumNpDicts(grads, {'We': dWe, 'be': dbe, 'Ws': dWs})
        dt = time.time() - t0
        print 'accum 2:%0.4f' % (dt)
        t0 = time.time()
        return grads
Beispiel #51
0
def infer(learner, stimuli, coeffs=None):
    """Run ``learner.niter`` inference iterations on the GPU.

    Args:
        learner: object providing ``Q`` (the dictionary), ``niter``,
            ``infrate``, ``min_thresh``, ``adapt`` and ``softthresh``.
        stimuli: 2D array of shape ``(numStim, dataLength)``.
        coeffs: optional initial values for ``u``; broadcast into a
            ``(numStim, numDict)`` array when given.

    Returns:
        tuple: ``(s.T, u.T, thresh)`` where ``s`` and ``u`` are host copies of
        the device buffers and ``thresh`` is the per-row mean of ``|b|``.
    """
    # Get Blas routines
    blas = cublas.Blas()
    # Initialize arrays
    numDict = learner.Q.shape[0]
    numStim = stimuli.shape[0]
    dataLength = stimuli.shape[1]
    u = np.zeros((numStim, numDict), dtype=np.float32, order='F')
    if coeffs is not None:
        u[:] = np.atleast_2d(coeffs)
    d_u = cuda.to_device(u)
    d_s = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_b = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_ci = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_c = cuda.to_device(
        np.zeros((numDict, numDict), dtype=np.float32, order='F'))

    # Move inputs to GPU
    d_dictionary = cuda.to_device(
        np.array(learner.Q, dtype=np.float32, order='F'))
    d_stimuli = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F'))

    blockdim2 = (32, 32)  # TODO: experiment, was all 32s
    blockdim1 = 32
    # Integer ceil-division: rounds up whether `/` is true or floor
    # division, so partial blocks are always covered.
    griddimcsub = (numDict + blockdim1 - 1) // blockdim1
    griddimi = ((numStim + blockdim2[0] - 1) // blockdim2[0],
                (numDict + blockdim2[1] - 1) // blockdim2[1])

    # Calculate c: overlap of basis functions with each other minus identity
    blas.gemm('N', 'T', numDict, numDict, dataLength, 1., d_dictionary,
              d_dictionary, 0., d_c)
    LCALearner.csub[griddimcsub, blockdim1](d_c)
    blas.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stimuli,
              d_dictionary, 0., d_b)
    thresh = np.mean(np.absolute(d_b.copy_to_host()), axis=1)
    d_thresh = cuda.to_device(thresh)
    # Update u[i] and s[i] for niter time steps
    for kk in range(learner.niter):
        # Calculate ci: amount other neurons are stimulated times overlap
        # with rest of basis
        blas.gemm('N', 'N', numStim, numDict, numDict, 1., d_s, d_c, 0., d_ci)
        LCALearner.iterate[griddimi,
                           blockdim2](d_c, d_b, d_ci, d_u, d_s,
                                      learner.infrate, d_thresh,
                                      learner.min_thresh, learner.adapt,
                                      learner.softthresh)
    u = d_u.copy_to_host()
    s = d_s.copy_to_host()
    return s.T, u.T, thresh
Beispiel #52
0
def test_histogram():
    #Allocate host memory
    input_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    bins_h = np.zeros(BIN_SIZE, dtype=np.uint32)
    myprint("Bin Size = " + str(bins_h.size))
    ## Initialize host memory
    for i in range(0, NUM_ELEMENTS):
        input_h[i] = randint(0, BIN_SIZE - 1)

    ## Allocate and initialize GPU/device memory
    input_d = cuda.to_device(input_h)
    bins_d = cuda.to_device(bins_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1)#((NUM_ELEMENTS / threads_per_block[0]) + 1, 1)

    t1 = time()
    histogramGPU [number_of_blocks, threads_per_block] (input_d, bins_d, NUM_ELEMENTS)
    cuda.synchronize()
    t2 = time()
    bins_d.copy_to_host(bins_h)

    t3 = time()
    bins_cpu = makeHist(input_h)
    t4 = time()

    # for i in range(0, BIN_SIZE):
    #     print i, bins_h[i], bins_cpu[i]

    print "GPU time = ", t2 - t1
    print "CPU TIME = ", t4 - t3

    match = 1
    for i in range(0, BIN_SIZE):
        if bins_h[i] != bins_cpu[i]:
            match = -1
            break
    if match == 1:
        print "Test Passed"
    else:
        print "Test Failed"
def radix_sort(arr, rid):
    """Drive the per-bit GPU passes of a radix sort over ``arr`` and ``rid``.

    For each of ``len(bin(ATTR_CARD_MAX))`` passes the function uploads
    ``arr``, ``rid`` and two scratch lists to the device, launches
    ``get_list``, reduces the zero list with ``sum_reduction``, hands both
    lists to ``Blelloch_scan_caller`` and finally launches ``array_adjust``.

    The function has no return statement.
    NOTE(review): results presumably reach the caller through the arrays
    written by ``array_adjust``, and ``rid`` presumably holds row ids that
    are permuted together with ``arr`` -- confirm against that kernel.
    """
    length = numpy.int64(len(arr))
    # NOTE(review): bin() returns a string with a leading "0b", so these
    # lengths are two larger than the actual bit counts -- confirm whether
    # the resulting 4x larger thread_num is intended.
    bin_length = max(len(bin(length - 1)), len(
        bin(TPB_MAX - 1)))  # length of the binary string of the array length
    # Total thread count: 2 ** bin_length.
    thread_num = numpy.int64(math.pow(2, bin_length))
    # At least one block. NOTE(review): relies on "/" being integer division
    # (Python 2 semantics; the xrange below suggests Python 2) -- confirm.
    block_num = max(thread_num / TPB_MAX, 1)

    stream = cuda.stream()
    # Host-side scratch lists, one int64 slot per thread.
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')

    # One pass per character of bin(ATTR_CARD_MAX) (again including "0b").
    iter_num = len(bin(ATTR_CARD_MAX))
    for i in range(iter_num):
        # Fresh device copies of the inputs and scratch lists on every pass.
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list, stream)
        d_one_list = cuda.to_device(one_list, stream)
        # NOTE(review): the kernel receives the host array ``arr`` rather
        # than ``d_arr``, and the launch configuration does not name
        # ``stream`` although the transfers above do -- confirm both.
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list,
                                     d_one_list)  #get one_list and zero_list
        # Presumably copies the device lists back into one_list / zero_list
        # (legacy to_host API) -- TODO confirm.
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        stream.synchronize()

        # Per-block partial sums of the zero list, one output slot per block.
        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num,
                      base_reduction_block_size](d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()
        # Add up the per-block partial sums on the host.
        base = 0  #base for the scan of one_list
        for j in xrange(base_reduction_block_num):
            base += tmp_out[j]

        Blelloch_scan_caller(d_zero_list, d_one_list, base)

        # The kernel is given both the host arrays and their device copies.
        array_adjust[block_num,
                     TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list,
                              d_zero_list, d_one_list, length)
Beispiel #54
0
def gradient_descent(X, Y, theta, alpha, num_iters):
    """Run ``num_iters`` gradient-descent updates with GPU error kernels.

    Each iteration launches ``cu_compute_error`` over ``X`` and ``Y``, copies
    the per-block partial errors back to the host, sums them, and updates the
    two parameters with step size ``alpha``.

    The starting values in ``theta`` are not read: both parameters start at
    zero. On return ``theta[0]`` and ``theta[1]`` hold the final values.
    ``X.size`` must be a multiple of 1024 (checked with ``assert``).
    """
    threads_per_block = 1024
    N = X.size
    num_blocks = N // threads_per_block
    assert num_blocks * threads_per_block == N

    # One partial-error slot per block, for each of the two parameters.
    partial_x = np.empty(num_blocks, dtype=X.dtype)
    partial_y = np.empty(num_blocks, dtype=X.dtype)

    param_x, param_y = 0, 0

    # Device-side inputs; the partial-error buffers are allocated without
    # uploading their (uninitialized) host contents.
    dev_x = cuda.to_device(X)
    dev_y = cuda.to_device(Y)
    dev_partial_x = cuda.to_device(partial_x, copy=False)
    dev_partial_y = cuda.to_device(partial_y, copy=False)

    grid_shape = (num_blocks,)
    block_shape = (threads_per_block,)

    # alpha * (1.0 / N) is the same in every iteration.
    step_scale = alpha * (1.0 / N)

    for _ in xrange(num_iters):
        # GPU: per-block partial errors for the current parameters.
        cu_compute_error[grid_shape, block_shape](
            dev_x, dev_y, dev_partial_x, dev_partial_y, param_x, param_y)

        dev_partial_x.to_host()
        dev_partial_y.to_host()

        # CPU: finish the reduction and take the descent step.
        total_error_x = partial_x.sum()
        total_error_y = partial_y.sum()

        param_x = param_x - step_scale * total_error_x
        param_y = param_y - step_scale * total_error_y

    theta[0] = param_x
    theta[1] = param_y
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Fill ``paths`` in place, one column (time step) after another.

    Column 0 of ``paths`` is the starting state. For every later column the
    function draws fresh normal samples on the device, calls ``step`` with
    the previous column as input, and copies the result back into that
    column of ``paths``. Nothing is returned.
    """
    num_paths = paths.shape[0]

    # Launch-size figures from the current device; computed here but not
    # passed to any call below.
    max_block = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    grid_count = int(math.ceil(float(num_paths) / max_block))

    # cuRAND generator and the device buffer it fills on every step.
    generator = curand.PRNG(curand.PRNG.MRG32K3A)
    noise_dev = cuda.device_array(num_paths, dtype=np.double)

    # Constants handed to ``step`` on every iteration.
    drift = interest - 0.5 * volatility**2
    vol_step = volatility * math.sqrt(dt)

    previous_dev = cuda.to_device(paths[:, 0])
    for col in range(1, paths.shape[1]):
        generator.normal(noise_dev, mean=0, sigma=1)
        column = paths[:, col]
        current_dev = cuda.to_device(column)
        step(previous_dev, dt, drift, vol_step, noise_dev, out=current_dev)
        current_dev.copy_to_host(column)
        # The column just computed is the input of the next step.
        previous_dev = current_dev
Beispiel #56
0
def task2():
    a = numpy.float32(2.)  # Force value to be float32
    x = numpy.arange(NELEM, dtype='float32')
    y = numpy.arange(NELEM, dtype='float32')

    ### Task2 ###
    # a) Complete the memory transfer for x -> dx, y -> dy
    # b) Allocate device memory for dout
    # c) Transfer for out <- dout
    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dout = cuda.device_array_like(x)

    griddim = NUM_BLOCKS
    blockdim = NUM_THREADS
    saxpy[griddim, blockdim](a, dx, dy, dout)

    out = dout.copy_to_host()
    print "out =", out

    if numpy.allclose(a * x + y, out):
        print "Correct result"
    else:
        print "Incorrect result"
def test_sort():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)  #4, 7, 2, 6, 3, 5, 1, 0
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)
    temp_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    tkg1 = time()

    for bit_shift in range(0, 32):
        tk1 = time()
        #radix_sort(in_d, out_d, temp_d, in_h[NUM_ELEMENTS - 1], bit_shift)
        preScan(out_d, in_d, NUM_ELEMENTS)
        tk2 = time()
        #print bit_shift, tk2 - tk1
        in_d = out_d
        out_d = temp_d
        temp_d = in_d

    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()

    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    #
    # print line

    in_cpu = [NUM_ELEMENTS - i - 1 for i in range(0, NUM_ELEMENTS)]
    tc1 = time()
    in_cpu.sort()
    tc2 = time()

    print "GPU Time = ", tkg2 - tkg1
    print "CPU Time = ", tc2 - tc1
def device_controller(cid):
    """Run the kernel on CUDA device ``cid`` and verify its output.

    Selects the device, compiles ``kernel`` for it while holding
    ``compiler_lock``, launches it in place on an ``int32`` array of 12345
    elements and checks that every element came back as its original value
    plus one.

    Raises:
        ValueError: if any element of the result differs from ``orig + 1``.
    """
    cuda.select_device(cid)  # bind device to thread
    device = cuda.get_current_device()  # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY)

    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:  # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data: a ramp scaled by (cid + 1), so each cid gets different
    # values; keep a copy to compare against after the kernel has run
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks: full blocks of max_thread
    # (rounded up) when N is large enough, otherwise one block of N threads
    if N >= max_thread:
        ngrid = int(ceil(float(N) / max_thread))
        nthread = max_thread
    else:
        ngrid = 1
        nthread = N

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # real CUDA work
    d_data = cuda.to_device(data)  # transfer to device
    # the same device array is passed for both kernel arguments
    cuda_kernel[ngrid, nthread](d_data, d_data)  # compute inplace
    d_data.copy_to_host(data)  # transfer to host

    # check result
    if not np.all(data == orig + 1):
        raise ValueError