def table_from_cursor(cursor):
    from pytools import Table

    tbl = Table()
    tbl.add_row([column[0] for column in cursor.description])
    for row in cursor:
        tbl.add_row(row)
    return tbl
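# Hedged usage sketch (not in the original source): any DB-API cursor works,
# since table_from_cursor only uses .description and row iteration. The
# sqlite3 in-memory database below is an illustrative assumption.
def _example_table_from_cursor():
    import sqlite3

    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()
    cur.execute("CREATE TABLE t (name TEXT, value REAL)")
    cur.executemany("INSERT INTO t VALUES (?, ?)", [("a", 1.5), ("b", 2.5)])
    cur.execute("SELECT * FROM t")
    # header row comes from cursor.description, data rows from the cursor
    print(table_from_cursor(cur))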
def __str__(self):
    from pytools import Table

    tbl = Table()
    tbl.add_row(("p", "error"))
    for p, err in zip(self.orders, self.errors):
        tbl.add_row((str(p), str(err)))
    return str(tbl)
def pretty_print(self, abscissa_label="N", error_label="Error", gliding_mean=2):
    from pytools import Table

    tbl = Table()
    tbl.add_row((abscissa_label, error_label, "Running EOC"))

    gm_eoc = self.estimate_order_of_convergence(gliding_mean)
    for i, (absc, err) in enumerate(self.history):
        if i < gliding_mean - 1:
            tbl.add_row((str(absc), str(err), ""))
        else:
            tbl.add_row((str(absc), str(err),
                str(gm_eoc[i - gliding_mean + 1, 1])))

    if len(self.history) > 1:
        return (str(tbl)
                + "\n\nOverall EOC: %s"
                % self.estimate_order_of_convergence()[0, 1])
    else:
        return str(tbl)
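# Hedged usage sketch (not in the original source): assumes this method
# belongs to pytools.convergence.EOCRecorder, or a class with the same
# add_data_point/history interface, recording error against mesh resolution.
def _example_pretty_print():
    from pytools.convergence import EOCRecorder

    eoc = EOCRecorder()
    for n in [10, 20, 40, 80]:
        h = 1.0 / n
        eoc.add_data_point(n, h**2)  # a second-order-accurate error model
    # the running EOC column should settle near 2 for this data
    print(eoc.pretty_print())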
def main():
    from pytools import Table

    tbl = Table()
    tbl.add_row(("size [MiB]", "time [s]", "mem.bw [GB/s]"))

    import pycuda.gpuarray as gpuarray

    # they're floats, i.e. 4 bytes each
    for power in range(10, 28):
        size = 1 << power
        print(size)

        a = gpuarray.empty((size,), dtype=numpy.float32)
        b = gpuarray.empty_like(a)
        a.fill(1)
        b.fill(2)

        if power > 20:
            count = 10
        else:
            count = 100

        elapsed = [0]

        def add_timer(_, time):
            elapsed[0] += time()

        for i in range(count):
            a.mul_add(1, b, 2, add_timer)

        # 3 accesses per element and pass: read a, read b, write result
        bytes = a.nbytes * count * 3

        tbl.add_row((a.nbytes / (1 << 20), elapsed[0] / count,
                     bytes / elapsed[0] / 1e9))

    print(tbl)
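# Hedged alternative sketch (not in the original source): the same
# per-call measurement using explicit CUDA events around mul_add, without
# relying on the add_timer callback.
def _example_event_timed_mul_add(a, b, count=10):
    import pycuda.driver as drv

    start = drv.Event()
    stop = drv.Event()
    start.record()
    for i in range(count):
        a.mul_add(1, b, 2)
    stop.record()
    stop.synchronize()
    # time_till returns milliseconds; convert to seconds per call
    return start.time_till(stop) * 1e-3 / count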
def main():
    import pycuda.gpuarray as gpuarray
    from pytools import Table

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        print(size)
        sizes.append(size)

        if power > 20:
            count = 100
        else:
            count = 1000

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cuda operation which fills the array with random numbers
        for i in range(count):
            curandom.rand((size,))

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        times.append(secs / count)
        flops.append(size)

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cpu operation which fills the array with random data
        for i in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        # add results to variable
        timesCPU.append(secs / count)
        flopsCPU.append(size)

    # calculate pseudo flops
    flops = [f / t for f, t in zip(flops, times)]
    flopsCPU = [f / t for f, t in zip(flopsCPU, timesCPU)]

    # print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU",
                 "Time CPU", "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f / fCpu))
    print(tbl)
def main():
    from pytools import Table

    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex
            print(sz)

            from pycuda.curandom import rand as curand

            a_gpu = curand((sz,))
            b_gpu = curand((sz,))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            from pycuda.reduction import get_dot_kernel

            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            elapsed = [0]

            def wrap_with_timer(f):
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for i in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10
            for i in range(cnt):
                krnl(a_gpu, b_gpu,
                        #krnl(a_gpu,
                        kernel_wrapper=wrap_with_timer)

            # each pass reads both input arrays once
            bytes = a_gpu.nbytes * 2 * cnt
            secs = elapsed[0] * 1e-3

            tbl.add_row((
                str(dtype_out),
                a_gpu.nbytes / (1 << 20),
                elapsed[0] / cnt,
                bytes / secs / 1e9,
            ))

    print(tbl)
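# Hedged sketch (not in the original source): the same dot-product reduction
# is also exposed as pycuda.gpuarray.dot; this checks it against numpy on
# random single-precision data.
def _example_gpuarray_dot(sz=1 << 20):
    import numpy
    import pycuda.gpuarray as gpuarray
    from pycuda.curandom import rand as curand

    a_gpu = curand((sz,))
    b_gpu = curand((sz,))

    dot_gpu = gpuarray.dot(a_gpu, b_gpu).get()
    dot_cpu = numpy.dot(a_gpu.get(), b_gpu.get())
    # loose tolerance: float32 summation order differs between GPU and CPU
    assert abs(dot_gpu - dot_cpu) / abs(dot_cpu) < 1e-3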
def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times_gpu = []
    flops_gpu = []
    flops_cpu = []
    times_cpu = []

    from pycuda.tools import bitlog2

    max_power = bitlog2(drv.mem_get_info()[0]) - 2
    # they're floats, i.e. 4 bytes each
    for power in range(10, max_power):
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # gpu -----------------------------------------------------------------
        start = drv.Event()
        end = drv.Event()
        start.record()

        for i in range(count):
            a + b

        end.record()
        end.synchronize()

        secs = start.time_till(end) * 1e-3

        times_gpu.append(secs / count)
        flops_gpu.append(size)
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        from time import time

        start = time()
        for i in range(count):
            a_cpu + b_cpu
        secs = time() - start

        times_cpu.append(secs / count)
        flops_cpu.append(size)

    # calculate pseudo flops
    flops_gpu = [f / t for f, t in zip(flops_gpu, times_gpu)]
    flops_cpu = [f / t for f, t in zip(flops_cpu, times_cpu)]

    from pytools import Table

    tbl = Table()
    tbl.add_row((
        "Size",
        "Time GPU",
        "Size/Time GPU",
        "Time CPU",
        "Size/Time CPU",
        "GPU vs CPU speedup",
    ))
    for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu,
                                     times_cpu, flops_cpu):
        tbl.add_row((s, t, f, t_cpu, f_cpu, f / f_cpu))
    print(tbl)
def main():
    drv.init()
    assert drv.Device.count() >= 1

    ctx = drv.Device(0).make_context()

    import pycuda.gpuarray as gpuarray

    # make sure all the kernels are compiled
    gpuarray.GPUArray.compile_kernels()
    print("done compiling")

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25):  # 24
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cuda operation which adds two arrays count times to get an average
        for i in range(count):
            a + b

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        times.append(secs / count)
        flops.append(size)

        # cpu operation which adds two arrays
        aCpu = numpy.random.randn(size).astype(numpy.float32)
        bCpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        # cpu operation which adds two arrays count times to get an average
        for i in range(count):
            aCpu + bCpu

        # stop timer
        end.record()
        end.synchronize()

        # calculate used time
        secs = start.time_till(end) * 1e-3

        # add results to variable
        timesCPU.append(secs / count)
        flopsCPU.append(size)

    # calculate pseudo flops
    flops = [f / t for f, t in zip(flops, times)]
    flopsCPU = [f / t for f, t in zip(flopsCPU, timesCPU)]

    # print the data out
    from pytools import Table

    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU",
                 "Time CPU", "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU):
        tbl.add_row((s, t, f, tCpu, fCpu, f / fCpu))
    print(tbl)
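# Hedged sketch (not in the original source): GPUArray.compile_kernels()
# comes from an older PyCUDA API. With pycuda.autoinit handling context
# creation, a throwaway operation serves as the warm-up, since elementwise
# kernels are compiled and cached on first use.
def _example_autoinit_warmup():
    import numpy
    import pycuda.autoinit  # noqa: F401 - creates a context as a side effect
    import pycuda.gpuarray as gpuarray

    warm = gpuarray.zeros((16,), dtype=numpy.float32)
    warm + warm  # first use triggers kernel compilation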
    # stop timer
    end.record()
    end.synchronize()

    # calculate used time (Event.time_till returns milliseconds)
    secs = start.time_till(end)

    return secs


# iterate over all methods and time the execution with different array sizes
print("compile kernels")
kernel._compile_kernels(kernel)

# generate our output tables, one for gpu, one for cpu, one for the speedup
tblCPU = Table()
tblGPU = Table()
tblSPD = Table()

# header row contains all the method names
methods = ["size"]
for name in dir(cuma):
    if not (name.startswith("__") and name.endswith("__")):
        method = getattr(cuma, name)
        if isinstance(method, types.FunctionType):
            methods.append(name)

tblCPU.add_row(methods)
tblGPU.add_row(methods)
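# Hedged sketch (not in the original source): assumes cuma is pycuda.cumath
# and reuses the module-level methods/tblGPU/drv names built above. Times
# each discovered method on the GPU with CUDA events; methods that reject a
# single array argument (e.g. fmod, ldexp) are recorded as "n/a". The row
# layout matches the header row built above.
def _example_time_cumath_methods(sizes=(1 << 10, 1 << 15, 1 << 20)):
    import pycuda.gpuarray as gpuarray

    for size in sizes:
        row = [size]
        a = gpuarray.empty((size,), dtype=numpy.float32)
        a.fill(0.5)  # inside the domain of most cumath functions
        for name in methods[1:]:
            start = drv.Event()
            end = drv.Event()
            start.record()
            try:
                getattr(cuma, name)(a)
            except Exception:
                row.append("n/a")
                continue
            end.record()
            end.synchronize()
            row.append(start.time_till(end))  # milliseconds
        tblGPU.add_row(row)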
def __call__(self, eval_dependency, lift_plan):
    discr = self.discr
    fplan = self.plan
    given = fplan.given

    elgroup, = discr.element_groups

    all_fluxes_on_faces = [
        gpuarray.empty(given.matmul_preimage_shape(lift_plan),
                       dtype=given.float_type,
                       allocator=discr.pool.allocate)
        for i in range(len(self.fluxes))
    ]

    fdata = self.flux_face_data_block(elgroup)
    ilist_data = self.index_list_data()

    block, gather, texref_map = self.get_kernel(fdata, ilist_data,
                                                for_benchmark=False)

    for dep_expr in self.all_deps:
        dep_field = eval_dependency(dep_expr)

        from hedge.tools import is_zero
        if is_zero(dep_field):
            if dep_expr in self.dep_to_tag:
                dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
            else:
                dep_field = discr.volume_zeros()

        assert dep_field.dtype == given.float_type, \
                "Wrong types: %s: %s, %s: %s" % (
                    dep_expr, dep_field.dtype, given, given.float_type)
        dep_field.bind_to_texref_ext(texref_map[dep_expr],
                                     allow_double_hack=True)

    if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
        debugbuf = gpuarray.zeros((10000,), dtype=given.float_type)
    else:
        from hedge.backends.cuda.tools import FakeGPUArray
        debugbuf = FakeGPUArray()

    if discr.instrumented:
        discr.flux_gather_timer.add_timer_callable(
            gather.prepared_timed_call(
                (len(discr.blocks), 1), block,
                debugbuf.gpudata,
                fdata.device_memory,
                *tuple(fof.gpudata for fof in all_fluxes_on_faces)))

        discr.gmem_bytes_gather.add(
            len(discr.blocks) * fdata.block_bytes
            + given.float_size() * (
                # fetch
                len(self.fluxes) * 2 * fdata.fp_count * fplan.dofs_per_face
                # store
                + len(discr.blocks) * len(self.fluxes)
                * fplan.microblocks_per_block()
                * fplan.aligned_face_dofs_per_microblock()))
    else:
        gather.prepared_call(
            (len(discr.blocks), 1), block,
            debugbuf.gpudata,
            fdata.device_memory,
            *tuple(fof.gpudata for fof in all_fluxes_on_faces))

    if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
        from hedge.tools import get_rank, wait_for_keypress
        if get_rank(discr) == 0:
            copied_debugbuf = debugbuf.get()
            print("DEBUG", len(discr.blocks))
            numpy.set_printoptions(linewidth=130)
            #print(numpy.reshape(copied_debugbuf, (32, 16)))
            print(copied_debugbuf[:50])

            #for i in range(len(discr.blocks)*6):
                #print(i, copied_debugbuf[i*16:(i+1)*16])
                #print(i, [x-10000 for x in
                #        sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0])

            wait_for_keypress(discr)

    if "cuda_flux" in discr.debug:
        from hedge.tools import get_rank, wait_for_keypress
        if get_rank(discr) == 0:
            numpy.set_printoptions(linewidth=130, precision=2,
                                   threshold=10**6)

            if True:
                cols = []
                for k in range(len(all_fluxes_on_faces)):
                    my_fof = all_fluxes_on_faces[k].get()

                    def sstruc(a):
                        # render each entry as a one-character code
                        result = ""
                        for i in a:
                            if i == 0:
                                result += "0"
                            elif abs(i) < 1e-10:
                                result += "-"
                            elif numpy.isnan(i):
                                result += "N"
                            elif i == 17:
                                result += "*"
                            else:
                                result += "#"
                        return result

                    useful_sz = given.block_count \
                            * given.microblocks_per_block \
                            * lift_plan.aligned_preimage_dofs_per_microblock

                    my_col = []
                    i = 0
                    while i < useful_sz:
                        my_col.append(sstruc(my_fof[i:i + 16]))
                        i += 16

                    cols.append(my_col)

                from pytools import Table
                tbl = Table()
                tbl.add_row(["num"] + list(range(len(cols))))

                for i, row in enumerate(zip(*cols)):
                    tbl.add_row((i,) + row)
                print(tbl)
            else:
                for i in range(len(all_fluxes_on_faces)):
                    print(i)
                    print(all_fluxes_on_faces[i].get())

            wait_for_keypress(discr)

    #print("B", [la.norm(fof.get()) for fof in all_fluxes_on_faces])

    return all_fluxes_on_faces