예제 #1
0
def gpu_schedule_Mean(outs):
    """
    Build the cuda schedule for the mean operator.

    Args:
        outs: output tensor of the compute, or a list of tensors; when a
            list is given only its first element is scheduled.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    out = outs if not isinstance(outs, list) else outs[0]

    with tvm.target.create("cuda"):
        sch = tvm.create_schedule(out.op)

        # When the output op is not "T_divide" (the original code labels this
        # case "squeeze"), the tensor to schedule is the op's first input.
        divide_is_output = out.op.name == "T_divide"
        tensor_c = out if divide_is_output else out.op.input_tensors[0]
        tensor_b = tensor_c.op.input_tensors[0]

        # Attach the producer at the second axis when there is one,
        # otherwise at the only axis available.
        axes = tensor_c.op.axis
        attach_axis = axes[1] if len(axes) >= 2 else axes[0]
        sch[tensor_b].compute_at(sch[tensor_c], attach_axis)

        # Split the leading axis and bind the two parts to block / thread ids.
        block_axis, thread_axis = sch[tensor_c].split(
            tensor_c.op.axis[0], factor=DEFAULT_GPU_THREAD)
        sch[tensor_c].bind(block_axis, tvm.thread_axis("blockIdx.x"))
        sch[tensor_c].bind(thread_axis, tvm.thread_axis("threadIdx.x"))
    return sch
예제 #2
0
def my_dsl(dtype, kernel_name, attrs):
    """
    Build a 1-D element-wise kernel for the instruction named by ``insn``.

    ``insn`` and ``insnType`` are not parameters: they are read from the
    surrounding scope. ``insn`` selects the computation and ``insnType``
    decides whether the kernel is built with two inputs ("binary") or one.

    Args:
        dtype: data type of the placeholders and of the scalar constant
            used by "adds" / "muls".
        kernel_name: name forwarded to ``akg.build``.
        attrs: attributes forwarded to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        ValueError: if ``insn`` is not one of the supported instructions.
    """
    m = tvm.var("M")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    # Two-input instructions.
    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)

    # One-input instructions.
    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        # Previously this result was overwritten by topi.log(A).
        C = topi.sqrt(A)

    # Tensor-with-scalar instructions.
    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("unsupported insn: %s" % insn)

    s = tvm.create_schedule([C.op])
    # Binary kernels take both placeholders; every other kind only takes A.
    args = [A, B, C] if insnType == "binary" else [A, C]
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, args,
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
    return mod
예제 #3
0
def gpu_schedule_MeanGrad(outs):
    """
    Build the cuda schedule for MeanGrad.

    Args:
        outs: output tensor of the compute, or a list of tensors; when a
            list is given only its first element is scheduled.

    Returns:
        The created schedule.
    """
    tensor_c = outs if not isinstance(outs, list) else outs[0]

    with tvm.target.create("cuda"):
        sch = tvm.create_schedule(tensor_c.op)
        tensor_b = tensor_c.op.input_tensors[0]

        # Attach the producer at the second axis when there is one,
        # otherwise at the only axis available.
        axes = tensor_c.op.axis
        attach_axis = axes[1] if len(axes) >= 2 else axes[0]
        sch[tensor_b].compute_at(sch[tensor_c], attach_axis)

        # Split the leading axis and bind the two parts to block / thread ids.
        block_axis, thread_axis = sch[tensor_c].split(
            tensor_c.op.axis[0], factor=DEFAULT_GPU_THREAD)
        sch[tensor_c].bind(block_axis, tvm.thread_axis("blockIdx.x"))
        sch[tensor_c].bind(thread_axis, tvm.thread_axis("threadIdx.x"))

    return sch
예제 #4
0
def default_schedule(outs):
    """
    Create the default cuda schedule.

    Starting from the first output tensor, every compute op reachable
    through ``input_tensors`` has its first axis split by
    DEFAULT_GPU_THREAD and bound to blockIdx.x / threadIdx.x.

    Args:
        outs (Union[tvm.tensor.Tensor, list[tvm.tensor.Tensor]]): outputs of
            compute; for a list only the first element is used as the root.

    Returns:
        sch (schedule.Schedule): The created schedule.

    Raises:
        ValueError: if outs is neither a tensor nor a list.
        SystemError: if the cuda context does not exist.
    """
    if not isinstance(outs, (tvm.tensor.Tensor, list)):
        raise ValueError(
            "outs should be list of akg.tvm.tensor.Tensor or akg.tvm.tensor.Tensor"
        )

    device = 'cuda'
    if not tvm.context(device, 0).exist:
        raise SystemError("Skip because %s is not enabled" % device)

    candidates = outs if isinstance(outs, list) else [outs]
    with tvm.target.create(device):
        root = candidates[0]
        sch = tvm.create_schedule(root.op)

        # Collect each compute op once, walking from the root through inputs.
        pending = Queue()
        pending.put(root)
        visited_ops = []
        while not pending.empty():
            current_op = pending.get().op
            if current_op in visited_ops or not isinstance(
                    current_op, tvm.tensor.ComputeOp):
                continue
            visited_ops.append(current_op)
            for producer in current_op.input_tensors:
                pending.put(producer)

        # Apply the same split/bind pattern to every collected op.
        for compute_op in visited_ops:
            stage = sch[compute_op.output(0)]
            block_axis, thread_axis = stage.split(compute_op.axis[0],
                                                  factor=DEFAULT_GPU_THREAD)
            stage.bind(block_axis, tvm.thread_axis("blockIdx.x"))
            stage.bind(thread_axis, tvm.thread_axis("threadIdx.x"))
    return sch