def test2(): """ This function is aimed at testing the performance of initializing the CUDA kernel with differing values of blocks per grid and threads per block. Careful: It is very unstable and will most likely crash your display driver unless you set safe ranges. If you're on Windows 7 or newer, it will just be restarted, however, but you will need to restart the Python session to restart the device hook from NumbaPro if you're running this interactively. TODO: Generate a better visualization for this -- 3d plots are bad... """ n = 50e6 # See test1() for why this is an unbiased genome size trials = 20 w = 16 # window size pssm = np.random.rand(4 * w) # generate PSSM bpg_range = (16, cuda.get_current_device().MAX_GRID_DIM_X*0.5) tpb_range = (32, cuda.get_current_device().MAX_BLOCK_DIM_X*0.5) data = [] for t in range(trials): for bpg_exp in range(int(math.ceil(math.log(bpg_range[0], 2))), int(math.ceil(math.log(bpg_range[1], 2)) + 1)): bpg = 2 ** bpg_exp for tpb_exp in range(int(math.ceil(math.log(tpb_range[0], 2))), int(math.ceil(math.log(tpb_range[1], 2)) + 1)): tpb = 2 ** tpb_exp print t, bpg, tpb seq = np.random.randint(0, 3, int(n)) __, __, run_info = score_sequence(seq, pssm, False, True, bpg, tpb) data.append((bpg, tpb, run_info["genome_size"] / run_info["runtime"])) x = [pt[0] for pt in data] y = [pt[1] for pt in data] z = [pt[2] for pt in data] xlabel('Blocks per Grid'), ylabel('Threads per Block')
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] mm = MM(shape=n, dtype=np.double, prealloc=5) blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) stream = cuda.stream() prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double, stream=stream) c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) # Configure the kernel # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>> step_cfg = step[gridsz, blksz, stream] d_last = cuda.to_device(paths[:, 0], to=mm.get()) for j in range(1, paths.shape[1]): prng.normal(d_normdist, mean=0, sigma=1) d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get()) step_cfg(d_last, d_paths, dt, c0, c1, d_normdist) d_paths.copy_to_host(paths[:, j], stream=stream) mm.free(d_last, stream=stream) d_last = d_paths stream.synchronize()
def gpumulti(X,mu): device = cuda.get_current_device() n=len(X) X=np.array(X) x1 = np.array(X.T[0]) x2 = np.array(X.T[1]) bmk = np.arange(len(x1)) mu = np.array(mu) dx1 = cuda.to_device(x1) dx2 = cuda.to_device(x2) dmu = cuda.to_device(mu) dbmk = cuda.to_device(bmk) # Set up enough threads for kernel tpb = device.WARP_SIZE bpg = int(np.ceil(float(n)/tpb)) cu_worker[bpg,tpb](dx1,dx2,dmu,dbmk) bestmukey = dbmk.copy_to_host() return bestmukey
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] mm = MM(shape=n, dtype=np.double, prealloc=5) blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) stream = cuda.stream() prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double, stream=stream) c0 = interest - 0.5 * volatility**2 c1 = volatility * math.sqrt(dt) d_last = cuda.to_device(paths[:, 0], to=mm.get()) for j in range(1, paths.shape[1]): prng.normal(d_normdist, mean=0, sigma=1) d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get()) step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream) d_paths.copy_to_host(paths[:, j], stream=stream) mm.free(d_last) d_last = d_paths stream.synchronize()
def __init__(self, shape, dtype, prealloc): self.device = cuda.get_current_device() self.freelist = deque() self.events = {} for i in range(prealloc): gpumem = cuda.device_array(shape=shape, dtype=dtype) self.freelist.append(gpumem) self.events[gpumem] = cuda.event(timing=False)
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] num_streams = 2 part_width = int(math.ceil(float(n) / num_streams)) partitions = [(0, part_width)] for i in range(1, num_streams): begin, end = partitions[i - 1] begin, end = end, min(end + (end - begin), n) partitions.append((begin, end)) partlens = [end - begin for begin, end in partitions] mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams) device = cuda.get_current_device() blksz = device.MAX_THREADS_PER_BLOCK gridszlist = [int(math.ceil(float(partlen) / blksz)) for partlen in partlens] strmlist = [cuda.stream() for _ in range(num_streams)] prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm) for strm in strmlist] # Allocate device side array d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm) for partlen, strm in zip(partlens, strmlist)] c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) # Configure the kernel # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>> steplist = [cu_step[gridsz, blksz, strm] for gridsz, strm in zip(gridszlist, strmlist)] d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist)] for j in xrange(1, paths.shape[1]): for prng, d_norm in zip(prnglist, d_normlist): prng.normal(d_norm, mean=0, sigma=1) d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist)] for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)): d_last, d_paths, d_norm = args step(d_last, d_paths, dt, c0, c1, d_norm) for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions): d_paths.copy_to_host(paths[s:e, j], stream=strm) mm.free(d_last, stream=strm) d_lastlist = d_pathslist for strm in strmlist: strm.synchronize()
def cuda_factor(number, primes): device = cuda.get_current_device() ffactor = np.asarray([1]) dfact = cuda.to_device(ffactor) d_primes = cuda.to_device(np.asarray(primes)) tpb = 720 bpg = 334 cu_fact[bpg, tpb](d_primes, number, dfact) c = dfact.copy_to_host() return c
def cuda_factor(number, primes): device = cuda.get_current_device() ffactor = np.asarray([0] * len(primes)) dfact = cuda.to_device(ffactor) d_primes = cuda.to_device(np.asarray(primes)) tpb = 720 bpg = 334 start = timer() cu_fact[bpg, tpb](d_primes, number, dfact) total = timer() - start print "Time taken : ", total c = dfact.copy_to_host() k = [] for d in c: if int(d) != 0: k.append(int(d)) return k
def device_controller(cid): cuda.select_device(cid) # bind device to thread device = cuda.get_current_device() # get current device # print some information about the CUDA card prefix = '[%s]' % device print( prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY ) max_thread = device.MAX_THREADS_PER_BLOCK with compiler_lock: # lock the compiler # prepare function for this thread # the jitted CUDA kernel is loaded into the current context cuda_kernel = cuda.jit(signature)(kernel) # prepare data N = 12345 data = np.arange(N, dtype=np.int32) * (cid + 1) orig = data.copy() # determine number of threads and blocks if N >= max_thread: ngrid = int(ceil(float(N) / max_thread)) nthread = max_thread else: ngrid = 1 nthread = N print( prefix, 'grid x thread = %d x %d' % (ngrid, nthread) ) # real CUDA work d_data = cuda.to_device(data) # transfer to device cuda_kernel[ngrid, nthread](d_data, d_data) # compute inplace d_data.copy_to_host(data) # transfer to host # check result if not np.all(data == orig + 1): raise ValueError
def device_controller(cid): cuda.select_device(cid) # bind device to thread device = cuda.get_current_device() # get current device # print some information about the CUDA card prefix = '[%s]' % device print(prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY) max_thread = device.MAX_THREADS_PER_BLOCK with compiler_lock: # lock the compiler # prepare function for this thread # the jitted CUDA kernel is loaded into the current context cuda_kernel = cuda.jit(signature)(kernel) # prepare data N = 12345 data = np.arange(N, dtype=np.int32) * (cid + 1) orig = data.copy() # determine number of threads and blocks if N >= max_thread: ngrid = int(ceil(float(N) / max_thread)) nthread = max_thread else: ngrid = 1 nthread = N print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread)) # real CUDA work d_data = cuda.to_device(data) # transfer to device cuda_kernel[ngrid, nthread](d_data, d_data) # compute inplace d_data.copy_to_host(data) # transfer to host # check result if not np.all(data == orig + 1): raise ValueError
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) # Instantiate cuRAND PRNG prng = curand.PRNG(curand.PRNG.MRG32K3A) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double) c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) # Simulation loop d_last = cuda.to_device(paths[:, 0]) for j in range(1, paths.shape[1]): prng.normal(d_normdist, mean=0, sigma=1) d_paths = cuda.to_device(paths[:, j]) step(d_last, dt, c0, c1, d_normdist, out=d_paths) d_paths.copy_to_host(paths[:, j]) d_last = d_paths
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) # Instantiate cuRAND PRNG prng = curand.PRNG(curand.PRNG.MRG32K3A) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double) c0 = interest - 0.5 * volatility**2 c1 = volatility * math.sqrt(dt) # Simulation loop d_last = cuda.to_device(paths[:, 0]) for j in range(1, paths.shape[1]): prng.normal(d_normdist, mean=0, sigma=1) d_paths = cuda.to_device(paths[:, j]) step(d_last, dt, c0, c1, d_normdist, out=d_paths) d_paths.copy_to_host(paths[:, j]) d_last = d_paths
def mc_cuda(paths, dt, interest, volatility): n = paths.shape[0] blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) # instantiate a CUDA stream for queueing async CUDA cmds stream = cuda.stream() # instantiate a cuRAND PRNG prng = curand.PRNG(curand.PRNG.MRG32K3A) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double, stream=stream) c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) # configure the kernel # similar to CUDA-C: step_cuda<<<gridsz, blksz, 0, stream>>> step_cfg = step_cuda[gridsz, blksz, stream] # transfer the initial prices d_last = cuda.to_device(paths[:, 0], stream=stream) for j in range(1, paths.shape[1]): # call cuRAND to populate d_normdist with gaussian noises prng.normal(d_normdist, mean=0, sigma=1) # setup memory for new prices # device_array_like is like empty_like for GPU d_paths = cuda.device_array_like(paths[:, j], stream=stream) # invoke step kernel asynchronously step_cfg(d_last, d_paths, dt, c0, c1, d_normdist) # transfer memory back to the host d_paths.copy_to_host(paths[:, j], stream=stream) d_last = d_paths # wait for all GPU work to complete stream.synchronize()
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] num_streams = 2 part_width = int(math.ceil(float(n) / num_streams)) partitions = [(0, part_width)] for i in range(1, num_streams): begin, end = partitions[i - 1] begin, end = end, min(end + (end - begin), n) partitions.append((begin, end)) partlens = [end - begin for begin, end in partitions] mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams) device = cuda.get_current_device() blksz = device.MAX_THREADS_PER_BLOCK gridszlist = [ int(math.ceil(float(partlen) / blksz)) for partlen in partlens ] strmlist = [cuda.stream() for _ in range(num_streams)] prnglist = [ curand.PRNG(curand.PRNG.MRG32K3A, stream=strm) for strm in strmlist ] # Allocate device side array d_normlist = [ cuda.device_array(partlen, dtype=np.double, stream=strm) for partlen, strm in zip(partlens, strmlist) ] c0 = interest - 0.5 * volatility**2 c1 = volatility * math.sqrt(dt) # Configure the kernel # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>> steplist = [ cu_step[gridsz, blksz, strm] for gridsz, strm in zip(gridszlist, strmlist) ] d_lastlist = [ cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist) ] for j in xrange(1, paths.shape[1]): for prng, d_norm in zip(prnglist, d_normlist): prng.normal(d_norm, mean=0, sigma=1) d_pathslist = [ cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist) ] for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)): d_last, d_paths, d_norm = args step(d_last, d_paths, dt, c0, c1, d_norm) for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions): d_paths.copy_to_host(paths[s:e, j], stream=strm) mm.free(d_last, stream=strm) d_lastlist = d_pathslist for strm in strmlist: strm.synchronize()
def main(): device = cuda.get_current_device() maxtpb = device.MAX_THREADS_PER_BLOCK warpsize = device.WARP_SIZE # benchmark loop vary_warpsize = [] baseline = [] ilpx2 = [] ilpx4 = [] ilpx8 = [] # For OSX 10.8 where the GPU is used for graphic as well, # increasing the following to 10 * 2 ** 20 seems to be necessary to # produce consistent result. approx_data_size = 1.5 * 2**20 for multiplier in range(1, maxtpb // warpsize + 1): blksz = warpsize * multiplier gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8) print 'kernel config [%d, %d]' % (gridsz, blksz) N = blksz * gridsz A = np.arange(N, dtype=np.float32) B = np.arange(N, dtype=np.float32) print 'data size %dMB' % (N / 2.**20 * A.dtype.itemsize) dA = cuda.to_device(A) dB = cuda.to_device(B) assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz) vary_warpsize.append(blksz) dC = cuda.device_array_like(A) basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC)) expected_result = dC.copy_to_host() if basetime > 0: baseline.append(N / basetime) dC = cuda.device_array_like(A) x2time = time_this(vec_add_ilp_x2, gridsz // 2, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x2time > 0: ilpx2.append(N / x2time) dC = cuda.device_array_like(A) x4time = time_this(vec_add_ilp_x4, gridsz // 4, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x4time > 0: ilpx4.append(N / x4time) dC = cuda.device_array_like(A) x8time = time_this(vec_add_ilp_x8, gridsz // 8, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x8time > 0: ilpx8.append(N / x8time) pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline') pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2') pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4') pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8') pylab.legend(loc=4) pylab.title(cuda.get_current_device().name) pylab.xlabel('block size') pylab.ylabel('float per second') pylab.show()
@cuda.jit('void(float32[:], float32[:], float32[:])') def cu_add(a, b, c): # i = cuda.grid(1) tx = cuda.threadIdx.x bx = cuda.blockIdx.x bw = cuda.blockDim.x i = tx + bx * bw if i > c.size: return c[i] = a[i] + b[i] if __name__ == '__main__': gpu = cuda.get_current_device() n = 100 a = np.arange(n, dtype=np.float32) b = np.arange(n, dtype=np.float32) c = np.empty_like(a) nthreads = gpu.WARP_SIZE nblocks = int(np.ceil(float(n) / nthreads)) print 'Blocks per grid:', nblocks print 'Threads per block', nthreads cu_add[nblocks, nthreads](a, b, c) print c
def score_sequence(seq, pssm, verbose = False, keep_strands = True, benchmark = False, blocks_per_grid = -1, threads_per_block = -1): """ This function will score a sequence of nucleotides based on a PSSM by using a sliding window parallelized on a GPU. Args: seq: This must be an integer representation of the nucleotide sequence, where the alphabet is (A = 0, C = 1, G = 2, T = 3). It must be a vector (1D array) of integers that can be cast to int32 (See: numpy.int32). pssm: This must a vectorized PSSM where every four elements correspond to one position. Make sure this can be cast to an array of float64. verbose: Set this to True to print performance information. benchmark: If set to True, the function will return information about the run in a dictionary at the third output variable. keep_strands: Whether memory should be allocated for storing which strand the scores come from. Set this to False if you just want the scores and the strands array will not be returned. NOTE: If this and benchmark are set to False, then the scores will not be returned in a tuple, meaning: >>> score_sequence blocks_per_grid: This is the blocks per grid that will be assigned to the CUDA kernel. See this SO question for info on choosing this value: http://stackoverflow.com/questions/4391162/cuda-determining-threads-per-block-blocks-per-grid It defaults to the length of the sequence or the maximum number of blocks per grid supported by the GPU, whichever is lower. Set this to a negative number threads_per_block: Threads per block. See above. It defaults to 55% of the maximum number of threads per block supported by the GPU, a value determined experimentally. Higher values will likely result in failure to allocate resources to the kernel (since there will not be enough register space for each thread). Returns: scores: 1D float64 array of length (n - w + 1), where n is the length of the sequence and w is the window size. The value at index i of this array corresponds to the score of the n-mer at position i in the sequence. strands: 1D int32 array of length (n - w + 1). The value at position i is either 0 or 1 corresponding to the strand of the score at that position where 0 means the forward strand and 1 means reverse. run_info: This is a dictionary that is returned if the benchmark parameter is set to True. It contains the following: >>> run_info.keys() ['memory_used', 'genome_size', 'runtime', 'threads_per_block', 'blocks_per_grid'] Note that the memory_used is rather misleading if running the function more than once. CUDA is optimized to not transfer the same data from the host to the device so it will not always change. It may also unload other assets from memory, so the memory changed can be negative. TODO: Find a better method of calculating memory usage. Example: >>> pssm = np.random.uniform(-7.5, 2.0, 4 * 16) # Window size of 16 >>> seq = np.random.randint(0, 3, 30e6) # Generate random 30 million bp sequence >>> scores, strands, run_info = score_sequence(seq, pssm, benchmark=True, verbose=True) Threads per block = 563 Blocks per grid = 53286 Total threads = 30000018 Scoring... Done. Genome size: 3e+07 bp Time: 605.78 ms Speed: 4.95229e+07 bp/sec >>> scores array([-16.97089798, -33.48925866, -21.80381526, ..., -10.27919401, -32.64575614, -23.97110103]) >>> strands array([1, 1, 1, ..., 1, 1, 0]) >>> run_info {'memory_used': 426508288L, 'genome_size': 30000000, 'runtime': 0.28268090518054123, 'threads_per_block': 563, 'blocks_per_grid': 53286} A more interesting interpretation of the run information for performance analysis is the number of bases score per second: >>> print "%g bases/sec" % run_info["genome_size"] / run_info["runtime"] 1.06127e+08 bases/sec """ w = int(pssm.size / 4) # width of PSSM n = int(seq.size) # length of the sequence being scored # Calculate the reverse-complement of the PSSM pssm_r = np.array([pssm[i / 4 + (3 - (i % 4))] for i in range(pssm.size)][::-1]) # Calculate the appropriate threads per block and blocks per grid if threads_per_block <= 0 or blocks_per_grid <= 0: # We don't use the max number of threads to avoid running out of # register space by saturating the streaming multiprocessors # ~55% was found empirically, but your mileage may vary with different GPUs threads_per_block = int(cuda.get_current_device().MAX_BLOCK_DIM_X * 0.55) # We saturate our grid and let the dynamic scheduler assign the blocks # to the discrete CUDA cores/streaming multiprocessors blocks_per_grid = int(math.ceil(float(n) / threads_per_block)) if blocks_per_grid > cuda.get_current_device().MAX_GRID_DIM_X: blocks_per_grid = cuda.get_current_device().MAX_GRID_DIM_X if verbose: print "Threads per block = %d" % threads_per_block print "Blocks per grid = %d" % blocks_per_grid print "Total threads = %d" % (threads_per_block * blocks_per_grid) # Collect benchmarking info s = default_timer() start_mem = cuda.current_context().get_memory_info()[0] # Start a stream stream = cuda.stream() # Copy data to device d_pssm = cuda.to_device(pssm.astype(np.float64), stream) d_pssm_r = cuda.to_device(pssm_r.astype(np.float64), stream) d_seq = cuda.to_device(seq.astype(np.int32), stream) # Allocate memory on device to store results d_scores = cuda.device_array(n - w + 1, dtype=np.float64, stream=stream) if keep_strands: d_strands = cuda.device_array(n - w + 1, dtype=np.int32, stream=stream) # Run the kernel if keep_strands: cuda_score[blocks_per_grid, threads_per_block](d_pssm, d_pssm_r, d_seq, d_scores, d_strands) else: cuda_score_without_strands[blocks_per_grid, threads_per_block](d_pssm, d_pssm_r, d_seq, d_scores) # Copy results back to host scores = d_scores.copy_to_host(stream=stream) if keep_strands: strands = d_strands.copy_to_host(stream=stream) stream.synchronize() # Collect benchmarking info end_mem = cuda.current_context().get_memory_info()[0] t = default_timer() - s # Output info on the run if verbose parameter is true if verbose: print "Genome size: %g bp" % n print "Time: %.2f ms (using time.%s())" % (t * 1000, default_timer.__name__) print "Speed: %g bp/sec" % (n / t) print "Global memory: %d bytes used (%.2f%% of total)" % \ (start_mem - end_mem, float(start_mem - end_mem) * 100 / cuda.get_current_device().get_context().get_memory_info()[1]) # Return the run information for benchmarking run_info = {"genome_size": n, "runtime": t, "memory_used": start_mem - end_mem, \ "blocks_per_grid": blocks_per_grid, "threads_per_block": threads_per_block} # I'm so sorry BDFL, please don't hunt me down for returning different size # tuples in my function if keep_strands: if benchmark: return (scores, strands, run_info) else: return (scores, strands) else: if benchmark: return (scores, run_info) else: # Careful! This won't return a tuple, so you don't need to do # score_sequence[0] to get the scores return scores
def main(): device = cuda.get_current_device() maxtpb = device.MAX_THREADS_PER_BLOCK warpsize = device.WARP_SIZE # benchmark loop vary_warpsize = [] baseline = [] ilpx2 = [] ilpx4 = [] ilpx8 = [] # For OSX 10.8 where the GPU is used for graphic as well, # increasing the following to 10 * 2 ** 20 seems to be necessary to # produce consistent result. approx_data_size = 1.5 * 2**20 for multiplier in range(1, maxtpb // warpsize + 1): blksz = warpsize * multiplier gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8) print 'kernel config [%d, %d]' % (gridsz, blksz) N = blksz * gridsz A = np.arange(N, dtype=np.float32) B = np.arange(N, dtype=np.float32) print 'data size %dMB' % (N / 2.**20 * A.dtype.itemsize) dA = cuda.to_device(A) dB = cuda.to_device(B) assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz) vary_warpsize.append(blksz) dC = cuda.device_array_like(A) basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC)) expected_result = dC.copy_to_host() if basetime > 0: baseline.append(N / basetime) dC = cuda.device_array_like(A) x2time = time_this(vec_add_ilp_x2, gridsz//2, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x2time > 0: ilpx2.append(N / x2time) dC = cuda.device_array_like(A) x4time = time_this(vec_add_ilp_x4, gridsz//4, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x4time > 0: ilpx4.append(N / x4time) dC = cuda.device_array_like(A) x8time = time_this(vec_add_ilp_x8, gridsz//8, blksz, (dA, dB, dC)) assert np.allclose(expected_result, dC.copy_to_host()) if x8time > 0: ilpx8.append(N / x8time) pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline') pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2') pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4') pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8') pylab.legend(loc=4) pylab.title(cuda.get_current_device().name) pylab.xlabel('block size') pylab.ylabel('float per second') pylab.show()
## ## from: https://people.duke.edu/~ccc14/sta-663/CUDAPython.html#more-examples ## from numbapro import cuda, vectorize, guvectorize, check_cuda from numbapro import void, uint8 , uint32, uint64, int32, int64, float32, float64, f8 import numpy as np from timeit import default_timer as timer import numbapro.cudalib.cublas as cublas check_cuda() device = cuda.get_current_device() ### naive matrix multiplication """ x1 = np.random.random((4,4)) x2 = np.random.random((4,4)) np.dot(x1, x2).shape """ ### Kernel function (no shared memory) @cuda.jit('void(float32[:,:], float32[:,:], float32[:,:], int32)') def cu_matmul(a, b, c, n): x, y = cuda.grid(2)