def parallel_sort(keys, values=None):
    N = keys.shape[0]

    # Batcher-style odd-even merge sort: |p| is the size of the already
    # sorted blocks and |k| is the comparison stride within a merge pass.
    num_stages = 0
    p = 1
    while p < N:
        k = p
        while k >= 1:
            invocations = int((N - k - k % p) / (2 * k)) + 1
            if values is None:
                sort_stage(keys, 0, keys, N, p, k, invocations)
            else:
                sort_stage(keys, 1, values, N, p, k, invocations)
            num_stages += 1
            sync()
            k = int(k / 2)
        p = int(p * 2)
    print(num_stages)  # number of sort_stage launches issued
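# A minimal usage sketch for parallel_sort (assumptions: this module's
# sort_stage kernel and sync() are in scope, and taichi/numpy are
# importable; the demo helper itself is hypothetical, not part of the
# original file):
def _demo_parallel_sort():
    import numpy as np
    import taichi as ti

    ti.init(arch=ti.cpu)
    keys = ti.field(ti.i32, shape=1024)
    keys.from_numpy(np.random.randint(0, 1 << 20, 1024, dtype=np.int32))
    parallel_sort(keys)
    # Keys come back in non-decreasing order.
    assert (np.diff(keys.to_numpy()) >= 0).all()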
def run_benchmark():
    compile_time = time.time()
    _func(*args)  # compile the kernel first
    sync()
    compile_time = time.time() - compile_time
    stat_write('compilation_time', compile_time)
    codegen_stat = _ti_core.stat()
    for line in codegen_stat.split('\n'):
        try:
            a, b = line.strip().split(':')
        except ValueError:
            continue
        a = a.strip()
        b = int(float(b))
        if a == 'codegen_kernel_statements':
            stat_write('compiled_inst', b)
        elif a == 'codegen_offloaded_tasks':
            stat_write('compiled_tasks', b)
        elif a == 'launched_tasks':
            stat_write('launched_tasks', b)
    # Use 3 initial iterations to warm up
    # instruction/data caches. Discussion:
    # https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
    for _ in range(3):
        _func(*args)
        sync()
    clear_kernel_profile_info()
    t = time.time()
    for _ in range(repeat):
        _func(*args)
        sync()
    elapsed = time.time() - t
    avg = elapsed / repeat
    stat_write('wall_clk_t', avg)
    device_time = kernel_profiler_total_time()
    avg_device_time = device_time / repeat
    stat_write('exec_t', avg_device_time)
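# The same compile-once, warm-up-then-time pattern can be reproduced with
# public Taichi APIs. A sketch (the kernel and sizes are illustrative;
# ti.sync() is the public counterpart of the sync() used above, and the
# demo helper is hypothetical):
def _demo_time_kernel(repeat=100):
    import time
    import taichi as ti

    ti.init(arch=ti.cpu)
    x = ti.field(ti.f32, shape=2**20)

    @ti.kernel
    def scale():
        for i in x:
            x[i] *= 1.0001

    scale()  # first launch compiles the kernel
    for _ in range(3):  # warm up instruction/data caches
        scale()
        ti.sync()
    t = time.time()
    for _ in range(repeat):
        scale()
        ti.sync()
    print('avg wall clock per launch:', (time.time() - t) / repeat)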
def func__(*args):
    assert len(args) == len(
        self.argument_annotations
    ), f'{len(self.argument_annotations)} arguments needed but {len(args)} provided'

    tmps = []
    callbacks = []
    has_external_arrays = False

    has_torch = has_pytorch()
    ndarray_use_torch = impl.get_runtime().ndarray_use_torch

    actual_argument_slot = 0
    launch_ctx = t_kernel.make_launch_context()
    for i, v in enumerate(args):
        needed = self.argument_annotations[i]
        if isinstance(needed, template):
            continue
        provided = type(v)
        # Note: do not use sth like "needed == f32". That would be slow.
        if id(needed) in primitive_types.real_type_ids:
            if not isinstance(v, (float, int)):
                raise TaichiRuntimeTypeError(i, needed.to_string(), provided)
            launch_ctx.set_arg_float(actual_argument_slot, float(v))
        elif id(needed) in primitive_types.integer_type_ids:
            if not isinstance(v, int):
                raise TaichiRuntimeTypeError(i, needed.to_string(), provided)
            launch_ctx.set_arg_int(actual_argument_slot, int(v))
        elif isinstance(needed, sparse_matrix_builder):
            # Pass only the base pointer of the
            # ti.linalg.sparse_matrix_builder() argument.
            launch_ctx.set_arg_int(actual_argument_slot, v._get_addr())
        elif isinstance(needed, any_arr) and isinstance(
                v, taichi.lang._ndarray.Ndarray):
            has_external_arrays = True
            v = v.arr
            if ndarray_use_torch:
                is_ndarray = True
                tmp, torch_callbacks = self.get_torch_callbacks(
                    v, has_torch, is_ndarray)
                callbacks += torch_callbacks
                launch_ctx.set_arg_external_array_with_shape(
                    actual_argument_slot, int(tmp.data_ptr()),
                    tmp.element_size() * tmp.nelement(), v.shape)
            else:
                launch_ctx.set_arg_ndarray(actual_argument_slot, v)
        elif isinstance(needed, any_arr) and self.match_ext_arr(v):
            has_external_arrays = True
            is_numpy = isinstance(v, np.ndarray)
            if is_numpy:
                tmp = np.ascontiguousarray(v)
                # Purpose: DO NOT GC |tmp|!
                tmps.append(tmp)
                launch_ctx.set_arg_external_array_with_shape(
                    actual_argument_slot, int(tmp.ctypes.data), tmp.nbytes,
                    v.shape)
            else:
                is_ndarray = False
                tmp, torch_callbacks = self.get_torch_callbacks(
                    v, has_torch, is_ndarray)
                callbacks += torch_callbacks
                launch_ctx.set_arg_external_array_with_shape(
                    actual_argument_slot, int(tmp.data_ptr()),
                    tmp.element_size() * tmp.nelement(), v.shape)
        elif isinstance(needed, MatrixType):
            if id(needed.dtype) in primitive_types.real_type_ids:
                for a in range(needed.n):
                    for b in range(needed.m):
                        if not isinstance(v[a, b], (int, float)):
                            raise TaichiRuntimeTypeError(
                                i, needed.dtype.to_string(), type(v[a, b]))
                        launch_ctx.set_arg_float(actual_argument_slot,
                                                 float(v[a, b]))
                        actual_argument_slot += 1
            elif id(needed.dtype) in primitive_types.integer_type_ids:
                for a in range(needed.n):
                    for b in range(needed.m):
                        if not isinstance(v[a, b], int):
                            raise TaichiRuntimeTypeError(
                                i, needed.dtype.to_string(), type(v[a, b]))
                        launch_ctx.set_arg_int(actual_argument_slot,
                                               int(v[a, b]))
                        actual_argument_slot += 1
            else:
                raise ValueError(
                    f'Matrix dtype {needed.dtype} is not integer type or real type.'
                )
            continue
        else:
            raise ValueError(
                f'Argument type mismatch. Expecting {needed}, got {type(v)}.')
        actual_argument_slot += 1

    # Both the class kernels and the plain-function kernels are unified now.
    # In both cases, |self.grad| is another Kernel instance that computes the
    # gradient. For class kernels, args[0] is always the kernel owner.
    if not self.is_grad and self.runtime.target_tape and not self.runtime.grad_replaced:
        self.runtime.target_tape.insert(self, args)

    t_kernel(launch_ctx)

    ret = None
    ret_dt = self.return_type
    has_ret = ret_dt is not None

    if has_ret or (impl.current_cfg().async_mode and has_external_arrays):
        runtime_ops.sync()

    if has_ret:
        if id(ret_dt) in primitive_types.integer_type_ids:
            ret = t_kernel.get_ret_int(0)
        else:
            ret = t_kernel.get_ret_float(0)

    if callbacks:
        for c in callbacks:
            c()

    return ret
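# Seen from the caller, the scalar branches above mean a kernel annotated
# with a primitive type accepts plain Python numbers, and
# get_ret_int/get_ret_float surface as the kernel's return value. A sketch
# with the public API (the demo helper is hypothetical):
def _demo_scalar_args():
    import taichi as ti

    ti.init(arch=ti.cpu)

    @ti.kernel
    def axpy(a: ti.f32, x: ti.f32, y: ti.f32) -> ti.f32:
        return a * x + y

    assert abs(axpy(2.0, 3.0, 1.0) - 7.0) < 1e-6
    # A non-numeric argument, e.g. axpy('2', 3.0, 1.0), fails the
    # isinstance checks above and raises TaichiRuntimeTypeError.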
def func__(*args):
    assert len(args) == len(
        self.arguments
    ), f'{len(self.arguments)} arguments needed but {len(args)} provided'

    tmps = []
    callbacks = []
    has_external_arrays = False

    has_torch = has_pytorch()
    has_pp = has_paddle()

    actual_argument_slot = 0
    launch_ctx = t_kernel.make_launch_context()
    for i, v in enumerate(args):
        needed = self.arguments[i].annotation
        if isinstance(needed, template):
            continue
        provided = type(v)
        # Note: do not use sth like "needed == f32". That would be slow.
        if id(needed) in primitive_types.real_type_ids:
            if not isinstance(v, (float, int)):
                raise TaichiRuntimeTypeError.get(i, needed.to_string(),
                                                 provided)
            launch_ctx.set_arg_float(actual_argument_slot, float(v))
        elif id(needed) in primitive_types.integer_type_ids:
            if not isinstance(v, int):
                raise TaichiRuntimeTypeError.get(i, needed.to_string(),
                                                 provided)
            launch_ctx.set_arg_int(actual_argument_slot, int(v))
        elif isinstance(needed, sparse_matrix_builder):
            # Pass only the base pointer of the
            # ti.types.sparse_matrix_builder() argument.
            launch_ctx.set_arg_int(actual_argument_slot, v._get_addr())
        elif isinstance(needed, ndarray_type.NdarrayType) and isinstance(
                v, taichi.lang._ndarray.Ndarray):
            has_external_arrays = True
            v = v.arr
            launch_ctx.set_arg_ndarray(actual_argument_slot, v)
        elif isinstance(needed, texture_type.TextureType) and isinstance(
                v, taichi.lang._texture.Texture):
            has_external_arrays = True
            v = v.tex
            launch_ctx.set_arg_texture(actual_argument_slot, v)
        elif isinstance(needed, texture_type.RWTextureType) and isinstance(
                v, taichi.lang._texture.Texture):
            has_external_arrays = True
            v = v.tex
            launch_ctx.set_arg_rw_texture(actual_argument_slot, v)
        elif isinstance(needed,
                        ndarray_type.NdarrayType) and self.match_ext_arr(v):
            has_external_arrays = True
            is_numpy = isinstance(v, np.ndarray)
            is_torch = isinstance(v, torch.Tensor) if has_torch else False

            # Element shapes are already specialized in Taichi codegen.
            # The shape information for element dims is no longer needed.
            # Therefore we strip the element shapes from the shape vector,
            # so that it only holds "real" array shapes.
            is_soa = needed.layout == Layout.SOA
            array_shape = v.shape
            element_dim = needed.element_dim
            if element_dim:
                array_shape = v.shape[
                    element_dim:] if is_soa else v.shape[:-element_dim]
            if is_numpy:
                tmp = np.ascontiguousarray(v)
                # Purpose: DO NOT GC |tmp|!
                tmps.append(tmp)
                launch_ctx.set_arg_external_array_with_shape(
                    actual_argument_slot, int(tmp.ctypes.data), tmp.nbytes,
                    array_shape)
            elif is_torch:
                is_ndarray = False
                tmp, torch_callbacks = self.get_torch_callbacks(
                    v, has_torch, is_ndarray)
                callbacks += torch_callbacks
                launch_ctx.set_arg_external_array_with_shape(
                    actual_argument_slot, int(tmp.data_ptr()),
                    tmp.element_size() * tmp.nelement(), array_shape)
            else:
                # For now, paddle.fluid.core.Tensor._ptr() is only available
                # on the develop branch.
                tmp, paddle_callbacks = self.get_paddle_callbacks(v, has_pp)
                callbacks += paddle_callbacks
                launch_ctx.set_arg_external_array_with_shape(
                    actual_argument_slot, int(tmp._ptr()),
                    v.element_size() * v.size, array_shape)
        elif isinstance(needed, MatrixType):
            if id(needed.dtype) in primitive_types.real_type_ids:
                for a in range(needed.n):
                    for b in range(needed.m):
                        if not isinstance(v[a, b], (int, float)):
                            raise TaichiRuntimeTypeError.get(
                                i, needed.dtype.to_string(), type(v[a, b]))
                        launch_ctx.set_arg_float(actual_argument_slot,
                                                 float(v[a, b]))
                        actual_argument_slot += 1
            elif id(needed.dtype) in primitive_types.integer_type_ids:
                for a in range(needed.n):
                    for b in range(needed.m):
                        if not isinstance(v[a, b], int):
                            raise TaichiRuntimeTypeError.get(
                                i, needed.dtype.to_string(), type(v[a, b]))
                        launch_ctx.set_arg_int(actual_argument_slot,
                                               int(v[a, b]))
                        actual_argument_slot += 1
            else:
                raise ValueError(
                    f'Matrix dtype {needed.dtype} is not integer type or real type.'
                )
            continue
        else:
            raise ValueError(
                f'Argument type mismatch. Expecting {needed}, got {type(v)}.')
        actual_argument_slot += 1

    # Both the class kernels and the plain-function kernels are unified now.
    # In both cases, |self.grad| is another Kernel instance that computes the
    # gradient. For class kernels, args[0] is always the kernel owner.
    if self.autodiff_mode == AutodiffMode.NONE and self.runtime.target_tape and not self.runtime.grad_replaced:
        self.runtime.target_tape.insert(self, args)

    if actual_argument_slot > 8 and (
            impl.current_cfg().arch == _ti_core.opengl
            or impl.current_cfg().arch == _ti_core.cc):
        raise TaichiRuntimeError(
            f"The number of elements in kernel arguments is too big! Do not exceed 8 on {_ti_core.arch_name(impl.current_cfg().arch)} backend."
        )
    if actual_argument_slot > 64 and (
            impl.current_cfg().arch != _ti_core.opengl
            and impl.current_cfg().arch != _ti_core.cc):
        raise TaichiRuntimeError(
            f"The number of elements in kernel arguments is too big! Do not exceed 64 on {_ti_core.arch_name(impl.current_cfg().arch)} backend."
        )

    try:
        t_kernel(launch_ctx)
    except Exception as e:
        e = handle_exception_from_cpp(e)
        raise e from None

    ret = None
    ret_dt = self.return_type
    has_ret = ret_dt is not None

    if has_ret:
        runtime_ops.sync()
        if id(ret_dt) in primitive_types.integer_type_ids:
            ret = t_kernel.get_ret_int(0)
        elif id(ret_dt) in primitive_types.real_type_ids:
            ret = t_kernel.get_ret_float(0)
        elif id(ret_dt.dtype) in primitive_types.integer_type_ids:
            it = iter(t_kernel.get_ret_int_tensor(0))
            ret = Matrix([[next(it) for _ in range(ret_dt.m)]
                          for _ in range(ret_dt.n)])
        else:
            it = iter(t_kernel.get_ret_float_tensor(0))
            ret = Matrix([[next(it) for _ in range(ret_dt.m)]
                          for _ in range(ret_dt.n)])

    if callbacks:
        for c in callbacks:
            c()

    return ret
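# External-array arguments take the path above: a numpy array (or a
# torch/paddle tensor) bound to a ti.types.ndarray() parameter is passed as
# a base pointer plus its shape, with any element dims stripped first. A
# minimal sketch using public APIs (the demo helper is hypothetical):
def _demo_ndarray_arg():
    import numpy as np
    import taichi as ti

    ti.init(arch=ti.cpu)

    @ti.kernel
    def double(v: ti.types.ndarray()):
        for i in v:
            v[i] *= 2

    a = np.arange(8, dtype=np.float32)
    double(a)  # mutates the numpy buffer in place
    assert (a == 2 * np.arange(8, dtype=np.float32)).all()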