# Stdlib/numpy imports needed below; tvm, topi, composite and the test helpers
# (_get_backend, gen_json_data, utils, _compare_func, ...) are provided by the
# surrounding AKG test harness.
import functools
import logging
from queue import Queue

import numpy as np


def get_result(desc, poly, attrs=None, profiling=True, need_compare=True):
    backend = _get_backend(desc)
    mod = composite.build(desc, attrs, poly=poly)
    if not need_compare:
        return True
    input_for_mod, expect, output_indexes = gen_json_data(desc)
    output = utils.mod_launch(mod, input_for_mod, output_indexes)
    # In profiling mode, mod_launch returns the compute outputs plus a
    # profiling dict; only the compute outputs are needed here.
    if isinstance(output, tuple) and len(output) > 0 and isinstance(output[-1], dict):
        output = output[0]
    output = list(output) if isinstance(output, (list, tuple)) else [output]
    expect = list(expect) if isinstance(expect, (list, tuple)) else [expect]
    for i, _ in enumerate(expect):
        if expect[i].dtype in ("complex64", "complex128"):
            # Complex results come back as a flat buffer of interleaved
            # (real, imag) pairs; rebuild the complex array from it.
            final_shape = functools.reduce(lambda x, y: x * y, output[i].shape)
            flatten_output = output[i].reshape((final_shape,))
            output_real = flatten_output[0::2]
            output_imag = flatten_output[1::2]
            output[i] = np.vectorize(complex)(output_real, output_imag)
            output[i] = output[i].reshape(expect[i].shape)
    if len(output) != len(expect):
        raise RuntimeError("output and expect have different lengths, {} vs {}".format(
            len(output), len(expect)))
    compare_tolerance = get_compare_tolerance(desc, output_indexes)
    compare_res = list(map(_compare_func, output, expect, compare_tolerance))
    if not all(compare_res):
        source = (mod.imported_modules[0] if backend == "cuda" else mod).get_source()
        logging.debug(source)
        _dump_info(desc, attrs, poly, input_for_mod, output, expect)
        logging.warning("Compare results: %s", str(compare_res))
        return False
    if profiling and backend in ["cuda", "cpu"]:
        ctx = tvm.context(backend, 0)
        # Skip profiling when any input is complex.
        has_complex = any(i.dtype in ("complex64", "complex128") for i in input_for_mod)
        if not has_complex:
            inputs = to_tvm_nd_array(input_for_mod, ctx)
            target_profiling(mod, *inputs, target=backend, repeat_time=1000)
    return True
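
# Hedged sketch (not part of the original module): how the interleaved-complex
# reconstruction in get_result behaves on a concrete buffer. Only numpy is
# assumed; the slicing mirrors the even/odd split above.
def _demo_complex_reconstruction():
    flat = np.arange(8, dtype=np.float32)  # [r0, i0, r1, i1, r2, i2, r3, i3]
    # Even indexes hold the real parts, odd indexes the imaginary parts.
    rebuilt = np.vectorize(complex)(flat[0::2], flat[1::2]).reshape((2, 2))
    print(rebuilt)  # [[0.+1.j 2.+3.j] [4.+5.j 6.+7.j]]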
def target_profiling(mod, *args, target="cuda", repeat_time=1, device_id=0, need_warm_up=True):
    """Do profiling on GPU/CPU for an op."""
    ctx = tvm.context(target, device_id)
    if target == "llvm":
        ftimer = mod.time_evaluator(mod.entry_name, ctx, number=repeat_time,
                                    repeat=3, min_repeat_ms=1000)
        if need_warm_up:
            # Run the kernel 1000 times first to warm up before timing.
            mod.time_evaluator(mod.entry_name, ctx, number=1000)(*args)
    else:
        ftimer = mod.time_evaluator(mod.entry_name, ctx, number=repeat_time)
    tcost = ftimer(*args).mean
    print("{}: exec={} ms/op".format(ctx, tcost * 1000))
    return tcost
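
# Hedged usage sketch (not part of the original module): profile a built
# module on CUDA. `mod` and `input_for_mod` are assumed to come from
# composite.build and gen_json_data as in get_result above.
def _demo_target_profiling(mod, input_for_mod):
    ctx = tvm.context("cuda", 0)
    tvm_args = [tvm.nd.array(arg, ctx) for arg in input_for_mod]
    # Average over 1000 runs; tcost is the mean runtime in seconds.
    tcost = target_profiling(mod, *tvm_args, target="cuda", repeat_time=1000)
    return tcost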
def gpu_schedule_Mul(outs):
    """
    Gpu schedule for mul.

    Args:
        outs (tvm.tensor.Tensor): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    device = 'cuda'
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        raise SystemError("Skip because %s is not enabled" % device)
    with tvm.target.create(device):
        sch = topi.cuda.schedule_broadcast(outs)
    return sch
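
# Hedged usage sketch (not part of the original module): build an elementwise
# multiply with the old-style tvm API and schedule it with gpu_schedule_Mul.
# The shapes and names here are illustrative only.
def _demo_gpu_schedule_mul():
    shape = (1024,)
    a = tvm.placeholder(shape, name="a", dtype="float32")
    b = tvm.placeholder(shape, name="b", dtype="float32")
    c = tvm.compute(shape, lambda *i: a(*i) * b(*i), name="c")
    sch = gpu_schedule_Mul([c])
    return tvm.build(sch, [a, b, c], "cuda", name="mul")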
def default_schedule(outs):
    """
    Default schedule function.

    Args:
        outs (Union[tvm.tensor.Tensor, list[tvm.tensor.Tensor]]): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    if not isinstance(outs, (tvm.tensor.Tensor, list)):
        raise ValueError(
            "outs should be a list of akg.tvm.tensor.Tensor or an akg.tvm.tensor.Tensor")
    device = 'cuda'
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        raise SystemError("Skip because %s is not enabled" % device)
    outs_list = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    with tvm.target.create(device):
        sch = tvm.create_schedule(outs_list[0].op)
        # Breadth-first walk over the producer graph, collecting every
        # ComputeOp reachable from the first output.
        outputs_tensor = Queue()
        outputs_tensor.put(outs_list[0])
        op_list = []
        while not outputs_tensor.empty():
            out = outputs_tensor.get()
            if out.op not in op_list and isinstance(out.op, tvm.tensor.ComputeOp):
                op_list.append(out.op)
                for input_tensor in out.op.input_tensors:
                    outputs_tensor.put(input_tensor)
        for op in op_list:
            # Bind the outermost axis of each stage to CUDA block/thread indexes.
            stage = sch[op.output(0)]
            bx, tx = stage.split(op.axis[0], factor=DEFAULT_GPU_THREAD)
            stage.bind(bx, tvm.thread_axis("blockIdx.x"))
            stage.bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
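
# Hedged usage sketch (not part of the original module): default_schedule
# walks the producer graph, so both stages of this illustrative two-op chain
# get blockIdx.x/threadIdx.x bindings.
def _demo_default_schedule():
    shape = (4096,)
    a = tvm.placeholder(shape, name="a", dtype="float32")
    t = tvm.compute(shape, lambda i: a[i] + 1.0, name="t")
    c = tvm.compute(shape, lambda i: t[i] * 2.0, name="c")
    sch = default_schedule(c)
    return tvm.build(sch, [a, c], "cuda", name="chain")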
def gpu_profiling(mod, *args, repeat_time=1, device_id=0):
    """Do profiling on GPU for a cuda op."""
    ctx = tvm.context("cuda", device_id)
    ftimer = mod.time_evaluator(mod.entry_name, ctx, number=repeat_time)
    tcost = ftimer(*args).mean
    print("{}: exec={} sec/op".format(ctx, tcost))