def __call__(self, op_class, field):
    """Apply the GPU differentiation kernel for *op_class* to *field*.

    Returns a list of ``discr.dimensions`` volume arrays, one per
    reference (r/s/t) axis.  When instrumentation is on, also records
    kernel timing and an estimated global-memory byte count.
    """
    discr = self.discr
    given = self.plan.given

    d = discr.dimensions
    elgroup, = discr.element_groups

    block, func = self.get_kernel(op_class, elgroup)

    assert field.dtype == given.float_type, "Wrong types: %s: %s, %s: %s" % (field, field.dtype, given, given.float_type)

    # Real debug buffer only when both debug flags are set; otherwise a
    # no-op stand-in so the kernel signature stays the same.
    use_debugbuf = set(["cuda_diff", "cuda_debugbuf"]) <= discr.debug
    if use_debugbuf:
        debugbuf = gpuarray.zeros((512,), dtype=given.float_type)
    else:
        from hedge.backends.cuda.tools import FakeGPUArray
        debugbuf = FakeGPUArray()

    # One output volume array per reference axis.
    rst_diff = [discr.volume_empty() for axis in range(d)]
    rst_diff_gpudata = [subarray.gpudata for subarray in rst_diff]

    if discr.instrumented:
        discr.diff_op_timer.add_timer_callable(
                func.prepared_timed_call(self.grid, block,
                    debugbuf.gpudata, field.gpudata, *rst_diff_gpudata))

        # Estimated global-memory traffic per block, in floats.
        block_gmem_floats = (
                # matrix fetch
                given.microblock.aligned_floats
                * discr.dimensions
                * given.dofs_per_el()
                * self.plan.parallelism.serial
                * self.plan.parallelism.parallel
                # field fetch
                + given.microblock.aligned_floats
                * self.plan.parallelism.total()
                )

        gmem_bytes = given.float_size() * (
                self.grid[0] * block_gmem_floats
                # field store
                + len(discr.nodes))

        discr.gmem_bytes_diff.add(gmem_bytes)
    else:
        func.prepared_call(self.grid, block,
                debugbuf.gpudata, field.gpudata, *rst_diff_gpudata)

    if use_debugbuf:
        copied_debugbuf = debugbuf.get()
        print "DEBUG"
        print field.shape
        #print numpy.reshape(copied_debugbuf, (len(copied_debugbuf)//16, 16))
        print copied_debugbuf
        raw_input()

    return rst_diff
def __call__(self, op_class, field): discr = self.discr given = self.plan.given d = discr.dimensions elgroup, = discr.element_groups block, func = self.get_kernel(op_class, elgroup) assert field.dtype == given.float_type use_debugbuf = set(["cuda_diff", "cuda_debugbuf"]) <= discr.debug if use_debugbuf: debugbuf = gpuarray.zeros((512,), dtype=given.float_type) else: from hedge.backends.cuda.tools import FakeGPUArray debugbuf = FakeGPUArray() rst_diff = [discr.volume_empty() for axis in range(d)] rst_diff_gpudata = [subarray.gpudata for subarray in rst_diff] if discr.instrumented: discr.diff_op_timer.add_timer_callable( func.prepared_timed_call(self.grid, block, debugbuf.gpudata, field.gpudata, *rst_diff_gpudata)) block_gmem_floats = ( # matrix fetch given.microblock.aligned_floats * discr.dimensions * given.dofs_per_el() * self.plan.parallelism.serial * self.plan.parallelism.parallel # field fetch + given.microblock.aligned_floats * self.plan.parallelism.total() ) gmem_bytes = given.float_size() * ( self.grid[0] * block_gmem_floats # field store + len(discr.nodes)) discr.gmem_bytes_diff.add(gmem_bytes) else: func.prepared_call(self.grid, block, debugbuf.gpudata, field.gpudata, *rst_diff_gpudata) if use_debugbuf: copied_debugbuf = debugbuf.get() print "DEBUG" print field.shape #print numpy.reshape(copied_debugbuf, (len(copied_debugbuf)//16, 16)) print copied_debugbuf raw_input() return rst_diff
def __call__(self, in_vector, prepped_mat, prepped_scaling, out_vector=None): discr = self.discr elgroup, = discr.element_groups given = self.plan.given kernel, in_vector_texref, scaling_texref = \ self.get_kernel(prepped_scaling is not None) if out_vector is None: out_vector = discr.volume_empty() in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True) if prepped_scaling is not None: prepped_scaling.bind_to_texref_ext(scaling_texref, allow_double_hack=True) if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug: debugbuf = gpuarray.zeros((1024,), dtype=given.float_type) else: debugbuf = FakeGPUArray() if discr.instrumented: discr.el_local_timer.add_timer_callable( kernel.prepared_timed_call( self.grid, out_vector.gpudata, prepped_mat, debugbuf.gpudata, len(discr.blocks)*given.microblocks_per_block, )) from pytools import product discr.gmem_bytes_el_local.add( given.float_size() * ( # matrix fetch self.plan.gpu_matrix_block_floats() * product(self.grid) # field fetch + self.plan.preimage_dofs_per_el * given.dofs_per_el() * given.microblock.elements * self.grid[1] * self.plan.parallelism.total() # field store + len(discr.nodes) )) else: kernel.prepared_call( self.grid, out_vector.gpudata, prepped_mat, debugbuf.gpudata, len(discr.blocks)*given.microblocks_per_block, ) if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug: copied_debugbuf = debugbuf.get()[:144*7].reshape((144,7)) print "DEBUG" numpy.set_printoptions(linewidth=100) copied_debugbuf.shape = (144,7) numpy.set_printoptions(threshold=3000) print copied_debugbuf raw_input() return out_vector
def __call__(self, in_vector, prepped_mat, out_vector=None): discr = self.discr elgroup, = discr.element_groups given = self.discr.given plan = self.plan kernel, block, mat_texref = self.get_kernel() mat_texref.set_array(prepped_mat) if out_vector is None: out_vector = discr.volume_empty() if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug: debugbuf = gpuarray.zeros((1024,), dtype=self.plan.given.float_type) else: debugbuf = FakeGPUArray() if discr.instrumented: discr.el_local_timer.add_timer_callable( kernel.prepared_timed_call(self.grid, block, out_vector.gpudata, in_vector.gpudata, debugbuf.gpudata, plan.microblock_count, )) block_gmem_floats = ( # matrix fetch given.microblock.aligned_floats * plan.preimage_dofs_per_el * plan.parallelism.serial * plan.parallelism.parallel # field fetch + plan.preimage_dofs_per_el * plan.elements_per_microblock * plan.parallelism.total() ) gmem_bytes = given.float_size() * ( self.grid[0] * block_gmem_floats # field store + len(discr.nodes)) discr.gmem_bytes_el_local.add(gmem_bytes) else: kernel.prepared_call(self.grid, block, out_vector.gpudata, in_vector.gpudata, debugbuf.gpudata, plan.microblock_count, ) if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug: copied_debugbuf = debugbuf.get()[:144*7].reshape((144,7)) print "DEBUG" numpy.set_printoptions(linewidth=100) copied_debugbuf.shape = (144,7) numpy.set_printoptions(threshold=3000) print copied_debugbuf raw_input() return out_vector
def __call__(self, eval_dependency, lift_plan): discr = self.discr fplan = self.plan given = fplan.given elgroup, = discr.element_groups all_fluxes_on_faces = [gpuarray.empty( given.matmul_preimage_shape(lift_plan), dtype=given.float_type, allocator=discr.pool.allocate) for i in range(len(self.fluxes))] fdata = self.flux_face_data_block(elgroup) ilist_data = self.index_list_data() block, gather, texref_map = self.get_kernel(fdata, ilist_data, for_benchmark=False) for dep_expr in self.all_deps: dep_field = eval_dependency(dep_expr) from hedge.tools import is_zero if is_zero(dep_field): if dep_expr in self.dep_to_tag: dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr]) else: dep_field = discr.volume_zeros() assert dep_field.dtype == given.float_type dep_field.bind_to_texref_ext(texref_map[dep_expr], allow_double_hack=True) if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug: debugbuf = gpuarray.zeros((10000,), dtype=given.float_type) else: from hedge.backends.cuda.tools import FakeGPUArray debugbuf = FakeGPUArray() if discr.instrumented: discr.flux_gather_timer.add_timer_callable(gather.prepared_timed_call( (len(discr.blocks), 1), block, debugbuf.gpudata, fdata.device_memory, *tuple(fof.gpudata for fof in all_fluxes_on_faces) )) discr.gmem_bytes_gather.add( len(discr.blocks) * fdata.block_bytes + given.float_size() * ( # fetch len(self.fluxes) * 2*fdata.fp_count * fplan.dofs_per_face # store + len(discr.blocks) * len(self.fluxes) * fplan.microblocks_per_block() * fplan.aligned_face_dofs_per_microblock() )) else: gather.prepared_call( (len(discr.blocks), 1), block, debugbuf.gpudata, fdata.device_memory, *tuple(fof.gpudata for fof in all_fluxes_on_faces) ) if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug: from hedge.tools import get_rank, wait_for_keypress if get_rank(discr) == 0: copied_debugbuf = debugbuf.get() print "DEBUG", len(discr.blocks) numpy.set_printoptions(linewidth=130) #print numpy.reshape(copied_debugbuf, (32, 16)) print 
copied_debugbuf[:50] #for i in range(len(discr.blocks)*6): #print i, copied_debugbuf[i*16:(i+1)*16] #print i, [x-10000 for x in sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0] wait_for_keypress(discr) if "cuda_flux" in discr.debug: from hedge.tools import get_rank, wait_for_keypress if get_rank(discr) == 0: numpy.set_printoptions(linewidth=130, precision=2, threshold=10**6) if True: cols = [] for k in range(len(all_fluxes_on_faces)): my_fof = all_fluxes_on_faces[k].get() def sstruc(a): result = "" for i in a: if i == 0: result += "0" elif abs(i) < 1e-10: result += "-" elif numpy.isnan(i): result += "N" elif i == 17: result += "*" else: result += "#" return result useful_sz = given.block_count \ * given.microblocks_per_block \ * lift_plan.aligned_preimage_dofs_per_microblock my_col = [] i = 0 while i < useful_sz: my_col.append(sstruc(my_fof[i:i+16])) i += 16 cols.append(my_col) from pytools import Table tbl = Table() tbl.add_row(["num"]+range(len(cols))) i = 0 for row in zip(*cols): tbl.add_row((i,)+row) i += 1 print tbl else: for i in range(len(all_fluxes_on_faces)): print i print all_fluxes_on_faces[i].get() wait_for_keypress(discr) #print "B", [la.norm(fof.get()) for fof in all_fluxes_on_faces] return all_fluxes_on_faces
def __call__(self, eval_dependency, lift_plan):
    """Run the flux-gather kernel.

    Evaluates each flux dependency via *eval_dependency*, binds it to a
    texture reference, launches the gather kernel once per block, and
    returns one fluxes-on-faces array per flux in ``self.fluxes``.
    """
    discr = self.discr
    fplan = self.plan
    given = fplan.given
    elgroup, = discr.element_groups

    # One output array per flux, shaped for the subsequent lift matmul.
    all_fluxes_on_faces = [
        gpuarray.empty(given.matmul_preimage_shape(lift_plan),
                       dtype=given.float_type,
                       allocator=discr.pool.allocate)
        for i in range(len(self.fluxes))
    ]

    fdata = self.flux_face_data_block(elgroup)
    ilist_data = self.index_list_data()

    block, gather, texref_map = self.get_kernel(fdata, ilist_data,
                                                for_benchmark=False)

    for dep_expr in self.all_deps:
        dep_field = eval_dependency(dep_expr)

        # A scalar zero dependency must be promoted to a full array of
        # the right kind (boundary or volume) before texture binding.
        from hedge.tools import is_zero
        if is_zero(dep_field):
            if dep_expr in self.dep_to_tag:
                dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
            else:
                dep_field = discr.volume_zeros()

        assert dep_field.dtype == given.float_type, "Wrong types: %s: %s, %s: %s" % (
            dep_expr, dep_field.dtype, given, given.float_type)
        dep_field.bind_to_texref_ext(texref_map[dep_expr],
                                     allow_double_hack=True)

    if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
        debugbuf = gpuarray.zeros((10000, ), dtype=given.float_type)
    else:
        from hedge.backends.cuda.tools import FakeGPUArray
        debugbuf = FakeGPUArray()

    if discr.instrumented:
        discr.flux_gather_timer.add_timer_callable(
            gather.prepared_timed_call(
                (len(discr.blocks), 1), block, debugbuf.gpudata,
                fdata.device_memory,
                *tuple(fof.gpudata for fof in all_fluxes_on_faces)))

        # Estimated global-memory traffic for this launch, in bytes.
        discr.gmem_bytes_gather.add(
            len(discr.blocks) * fdata.block_bytes + given.float_size() * (
                # fetch
                len(self.fluxes) * 2 * fdata.fp_count * fplan.dofs_per_face
                # store
                + len(discr.blocks) * len(self.fluxes) *
                fplan.microblocks_per_block() *
                fplan.aligned_face_dofs_per_microblock()))
    else:
        gather.prepared_call(
            (len(discr.blocks), 1), block, debugbuf.gpudata,
            fdata.device_memory,
            *tuple(fof.gpudata for fof in all_fluxes_on_faces))

    if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
        from hedge.tools import get_rank, wait_for_keypress
        if get_rank(discr) == 0:
            copied_debugbuf = debugbuf.get()
            print "DEBUG", len(discr.blocks)
            numpy.set_printoptions(linewidth=130)
            #print numpy.reshape(copied_debugbuf, (32, 16))
            print copied_debugbuf[:50]

            #for i in range(len(discr.blocks)*6):
                #print i, copied_debugbuf[i*16:(i+1)*16]
                #print i, [x-10000 for x in sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0]

            wait_for_keypress(discr)

    if "cuda_flux" in discr.debug:
        from hedge.tools import get_rank, wait_for_keypress
        if get_rank(discr) == 0:
            numpy.set_printoptions(linewidth=130, precision=2,
                                   threshold=10**6)
            if True:
                # Print a compact structure map of each fluxes-on-faces
                # array: one character per entry.
                cols = []
                for k in range(len(all_fluxes_on_faces)):
                    my_fof = all_fluxes_on_faces[k].get()

                    def sstruc(a):
                        result = ""
                        for i in a:
                            if i == 0:
                                result += "0"
                            elif abs(i) < 1e-10:
                                result += "-"
                            elif numpy.isnan(i):
                                result += "N"
                            elif i == 17:
                                result += "*"
                            else:
                                result += "#"
                        return result

                    useful_sz = given.block_count \
                            * given.microblocks_per_block \
                            * lift_plan.aligned_preimage_dofs_per_microblock

                    my_col = []
                    i = 0
                    while i < useful_sz:
                        my_col.append(sstruc(my_fof[i:i + 16]))
                        i += 16

                    cols.append(my_col)

                from pytools import Table
                tbl = Table()
                tbl.add_row(["num"] + range(len(cols)))

                i = 0
                for row in zip(*cols):
                    tbl.add_row((i, ) + row)
                    i += 1
                print tbl
            else:
                for i in range(len(all_fluxes_on_faces)):
                    print i
                    print all_fluxes_on_faces[i].get()

            wait_for_keypress(discr)

    #print "B", [la.norm(fof.get()) for fof in all_fluxes_on_faces]

    return all_fluxes_on_faces
def __call__(self, in_vector, prepped_mat, prepped_scaling, out_vector=None): discr = self.discr elgroup, = discr.element_groups given = self.plan.given kernel, in_vector_texref, scaling_texref = \ self.get_kernel(prepped_scaling is not None) if out_vector is None: out_vector = discr.volume_empty() in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True) if prepped_scaling is not None: prepped_scaling.bind_to_texref_ext(scaling_texref, allow_double_hack=True) if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug: debugbuf = gpuarray.zeros((1024, ), dtype=given.float_type) else: debugbuf = FakeGPUArray() if discr.instrumented: discr.el_local_timer.add_timer_callable( kernel.prepared_timed_call( self.grid, out_vector.gpudata, prepped_mat, debugbuf.gpudata, len(discr.blocks) * given.microblocks_per_block, )) from pytools import product discr.gmem_bytes_el_local.add(given.float_size() * ( # matrix fetch self.plan.gpu_matrix_block_floats() * product(self.grid) # field fetch + self.plan.preimage_dofs_per_el * given.dofs_per_el() * given.microblock.elements * self.grid[1] * self.plan.parallelism.total() # field store + len(discr.nodes))) else: kernel.prepared_call( self.grid, out_vector.gpudata, prepped_mat, debugbuf.gpudata, len(discr.blocks) * given.microblocks_per_block, ) if set([self.plan.debug_name, "cuda_debugbuf"]) <= discr.debug: copied_debugbuf = debugbuf.get()[:144 * 7].reshape((144, 7)) print "DEBUG" numpy.set_printoptions(linewidth=100) copied_debugbuf.shape = (144, 7) numpy.set_printoptions(threshold=3000) print copied_debugbuf raw_input() return out_vector