Example #1
def get_result(desc, poly, attrs=None, profiling=True, need_compare=True):
    backend = _get_backend(desc)

    mod = composite.build(desc, attrs, poly=poly)
    if not need_compare:
        return True
    input_for_mod, expect, output_indexes = gen_json_data(desc)
    output = utils.mod_launch(mod, input_for_mod, output_indexes)
    # In profiling mode, mod_launch returns a tuple of the compute outputs and a profiling dict; only the compute outputs are needed here.
    if isinstance(output, tuple) and len(output) > 0 and isinstance(
            output[-1], dict):
        output = output[0]
    output = output if isinstance(output, (list, tuple)) else [output]
    expect = expect if isinstance(expect, (list, tuple)) else [expect]
    output = list(output)
    expect = list(expect)
    if len(output) != len(expect):
        raise RuntimeError(
            "output and expect have different length, {} vs {}".format(
                len(output), len(expect)))
    for i, _ in enumerate(expect):
        if expect[i].dtype in ("complex64", "complex128"):
            # The kernel returns complex outputs as a flat array of
            # interleaved real/imag values; rebuild the complex ndarray.
            final_shape = functools.reduce(lambda x, y: x * y, output[i].shape)
            flattened_output = output[i].reshape((final_shape, ))
            output_real = []
            output_imag = []
            for k, _ in enumerate(flattened_output):
                if k % 2 == 0:
                    output_real.append(flattened_output[k])
                else:
                    output_imag.append(flattened_output[k])
            output[i] = np.vectorize(complex)(output_real, output_imag)
            output[i] = output[i].reshape(expect[i].shape)

    compare_tolerance = get_compare_tolerance(desc, output_indexes)
    compare_res = list(map(_compare_func, output, expect, compare_tolerance))
    if not all(compare_res):
        source = (mod.imported_modules[0]
                  if backend == "cuda" else mod).get_source()
        logging.debug(source)
        _dump_info(desc, attrs, poly, input_for_mod, output, expect)
        logging.warning("Compare results: %s", str(compare_res))
        return False
    if profiling and backend in ["cuda", "cpu"]:
        ctx = tvm.context(backend, 0)
        # Profiling is skipped when any input is complex.
        has_complex = any(
            i.dtype in ("complex64", "complex128") for i in input_for_mod)
        if not has_complex:
            inputs = to_tvm_nd_array(input_for_mod, ctx)
            target_profiling(mod, *inputs, target=backend, repeat_time=1000)
    return True
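
As a side note, the interleaved real/imag reconstruction above can be done directly with NumPy slicing. A minimal sketch; the helper name and sample data here are illustrative, not part of the original code:

import numpy as np

def interleaved_to_complex(flat, shape):
    # Even positions hold real parts, odd positions imaginary parts,
    # matching the element-wise loop in get_result above.
    return (flat[0::2] + 1j * flat[1::2]).reshape(shape)

raw = np.array([1.0, 2.0, 3.0, 4.0])      # (re, im, re, im)
print(interleaved_to_complex(raw, (2,)))  # [1.+2.j 3.+4.j]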
Example #2
def target_profiling(mod, *args, target="cuda", repeat_time=1, device_id=0, need_warm_up=True):
    """Do profiling on gpu/cpu for op"""
    ctx = tvm.context(target, device_id)
    if target == "llvm":
        # On CPU, repeat the measurement and enforce a minimum run time
        # so the timing is stable.
        ftimer = mod.time_evaluator(mod.entry_name, ctx, number=repeat_time,
                                    repeat=3, min_repeat_ms=1000)
        if need_warm_up:
            # Warm-up runs are not included in the measurement.
            mod.time_evaluator(mod.entry_name, ctx, number=1000)(*args)
    else:
        ftimer = mod.time_evaluator(mod.entry_name, ctx, number=repeat_time)
    tcost = ftimer(*args).mean
    print("{}: exec={} ms/op".format(ctx, tcost * 1000))
    return tcost
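
A minimal usage sketch for target_profiling, assuming the legacy TVM API these examples use (tvm.placeholder / tvm.create_schedule / tvm.build) and a trivial elementwise kernel of my own invention:

import numpy as np
import tvm

n = 1024
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] * 2, name="B")
s = tvm.create_schedule(B.op)
mod = tvm.build(s, [A, B], target="llvm")

ctx = tvm.context("llvm", 0)
a = tvm.nd.array(np.random.rand(n).astype("float32"), ctx)
b = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
# Skip the 1000-run warm-up to keep the sketch quick.
target_profiling(mod, a, b, target="llvm", repeat_time=10, need_warm_up=False)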
Example #3
def gpu_schedule_Mul(outs):
    """
    gpu schedule for mul.

    Args:
        outs (tvm.tensor.Tensor): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    device = 'cuda'
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        raise SystemError("Skip because %s is not enabled" % device)
    with tvm.target.create(device):
        sch = topi.cuda.schedule_broadcast(outs)
    return sch
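
A usage sketch, assuming the same legacy TVM/topi API (shapes are illustrative): build a broadcast multiply, schedule it with gpu_schedule_Mul, and compile for CUDA:

import tvm
import topi

A = tvm.placeholder((16, 16), name="A")
B = tvm.placeholder((16, 1), name="B")  # broadcast along the last axis
C = topi.multiply(A, B)
sch = gpu_schedule_Mul(C)
mod = tvm.build(sch, [A, B, C], target="cuda")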
Example #4
def default_schedule(outs):
    """
    default schedule function.

    Args:
        outs (Union[tvm.tensor.Tensor, list[tvm.tensor.Tensor]]): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    if not isinstance(outs, (tvm.tensor.Tensor, list)):
        raise ValueError(
            "outs should be list of akg.tvm.tensor.Tensor or akg.tvm.tensor.Tensor"
        )
    device = 'cuda'
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        raise SystemError("Skip because %s is not enabled" % device)
    outs_list = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    with tvm.target.create(device):
        sch = tvm.create_schedule(outs_list[0].op)
        # Traverse the compute graph breadth-first (Queue is queue.Queue),
        # collecting each ComputeOp exactly once.
        outputs_tensor = Queue()
        outputs_tensor.put(outs_list[0])
        op_list = []
        while not outputs_tensor.empty():
            out = outputs_tensor.get()
            if out.op not in op_list and isinstance(out.op,
                                                    tvm.tensor.ComputeOp):
                op_list.append(out.op)
                for input_tensor in out.op.input_tensors:
                    outputs_tensor.put(input_tensor)
        # Bind the outermost axis of every op to CUDA blocks and threads.
        for op in op_list:
            stage = sch[op.output(0)]
            bx, tx = stage.split(op.axis[0], factor=DEFAULT_GPU_THREAD)
            stage.bind(bx, tvm.thread_axis("blockIdx.x"))
            stage.bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
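
A usage sketch for default_schedule. DEFAULT_GPU_THREAD is assumed here to be 1024 purely for illustration; the real constant is defined elsewhere in the repository:

import tvm

DEFAULT_GPU_THREAD = 1024  # assumed value, for illustration only

A = tvm.placeholder((4096,), name="A")
B = tvm.compute((4096,), lambda i: A[i] + 1, name="B")
sch = default_schedule(B)  # binds blockIdx.x / threadIdx.x on B's op
mod = tvm.build(sch, [A, B], target="cuda")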
Example #5
def gpu_profiling(mod, *args, repeat_time=1, device_id=0):
    """Do profiling on gpu for cuda op"""
    ctx = tvm.context("cuda", device_id)
    ftimer = mod.time_evaluator(mod.entry_name, ctx, number=repeat_time)
    tcost = ftimer(*args).mean
    print("{}: exec={} sec/op".format(ctx, tcost))
    return tcost