from queue import Queue

import tvm

# Threads per CUDA block used when splitting the outermost axis. The constant is
# referenced but not defined in this section; 1024 is an assumed typical value.
DEFAULT_GPU_THREAD = 1024


def gpu_schedule_Mean(outs):
    """
    GPU schedule function for mean.

    Args:
        outs (Union[tvm.tensor.Tensor, list[tvm.tensor.Tensor]]): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    out = outs[0] if isinstance(outs, list) else outs

    device = "cuda"
    with tvm.target.create(device):
        sch = tvm.create_schedule(out.op)
        if out.op.name == "T_divide":
            tensor_c = out
        else:  # the final op is a squeeze; schedule its divide input instead
            tensor_c = out.op.input_tensors[0]

        # Fuse the reduction stage into the divide stage at its innermost
        # available axis.
        tensor_b = tensor_c.op.input_tensors[0]
        if len(tensor_c.op.axis) >= 2:
            sch[tensor_b].compute_at(sch[tensor_c], tensor_c.op.axis[1])
        else:
            sch[tensor_b].compute_at(sch[tensor_c], tensor_c.op.axis[0])

        # Split the outermost axis and bind it to CUDA blocks and threads.
        bx, tx = sch[tensor_c].split(tensor_c.op.axis[0], factor=DEFAULT_GPU_THREAD)
        sch[tensor_c].bind(bx, tvm.thread_axis("blockIdx.x"))
        sch[tensor_c].bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
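
# A minimal usage sketch for gpu_schedule_Mean (illustrative, not from the original
# source). It assumes the pre-0.7 TVM API used above, that topi is importable, and
# that topi.divide names its output "T_divide" so the schedule picks the divide
# stage directly.
def _mean_usage_example():
    import topi
    data = tvm.placeholder((32, 1024), name="data")
    total = topi.sum(data, axis=-1, keepdims=True)   # reduce stage (tensor_b)
    mean = topi.divide(total, 1024.0)                # "T_divide" stage (tensor_c)
    sch = gpu_schedule_Mean(mean)
    return tvm.build(sch, [data, mean], "cuda")
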
def gpu_schedule_MeanGrad(outs):
    """
    GPU schedule function for the gradient of mean.

    Args:
        outs (Union[tvm.tensor.Tensor, list[tvm.tensor.Tensor]]): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    out = outs[0] if isinstance(outs, list) else outs

    device = "cuda"
    with tvm.target.create(device):
        sch = tvm.create_schedule(out.op)
        tensor_c = out
        # Fuse the producer stage into the output stage, mirroring
        # gpu_schedule_Mean above.
        tensor_b = tensor_c.op.input_tensors[0]
        if len(tensor_c.op.axis) >= 2:
            sch[tensor_b].compute_at(sch[tensor_c], tensor_c.op.axis[1])
        else:
            sch[tensor_b].compute_at(sch[tensor_c], tensor_c.op.axis[0])

        bx, tx = sch[tensor_c].split(tensor_c.op.axis[0], factor=DEFAULT_GPU_THREAD)
        sch[tensor_c].bind(bx, tvm.thread_axis("blockIdx.x"))
        sch[tensor_c].bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
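
# A minimal usage sketch for gpu_schedule_MeanGrad (illustrative, assumptions as
# above). It assumes the backward compute is "broadcast the incoming gradient,
# then scale by 1/n", so the output op has exactly one producer stage to fuse.
def _mean_grad_usage_example():
    import topi
    dy = tvm.placeholder((32, 1), name="dy")
    expanded = topi.broadcast_to(dy, (32, 1024))  # producer stage (tensor_b)
    dx = topi.divide(expanded, 1024.0)            # output stage (tensor_c)
    sch = gpu_schedule_MeanGrad(dx)
    return tvm.build(sch, [dy, dx], "cuda")
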
def default_schedule(outs):
    """
    Default schedule function.

    Args:
        outs (Union[tvm.tensor.Tensor, list[tvm.tensor.Tensor]]): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    if not isinstance(outs, (tvm.tensor.Tensor, list)):
        raise ValueError(
            "outs should be a list of akg.tvm.tensor.Tensor or a single akg.tvm.tensor.Tensor")
    device = "cuda"
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        raise SystemError("Skip because %s is not enabled" % device)
    outs_list = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    with tvm.target.create(device):
        sch = tvm.create_schedule(outs_list[0].op)
        # Breadth-first traversal from the output to collect every ComputeOp in
        # the DAG exactly once.
        outputs_tensor = Queue()
        outputs_tensor.put(outs_list[0])
        op_list = []
        while not outputs_tensor.empty():
            out = outputs_tensor.get()
            if out.op not in op_list and isinstance(out.op, tvm.tensor.ComputeOp):
                op_list.append(out.op)
                for input_tensor in out.op.input_tensors:
                    outputs_tensor.put(input_tensor)
        # Split each op's outermost axis and bind it to CUDA blocks and threads.
        for op in op_list:
            stage = sch[op.output(0)]
            bx, tx = stage.split(op.axis[0], factor=DEFAULT_GPU_THREAD)
            stage.bind(bx, tvm.thread_axis("blockIdx.x"))
            stage.bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
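
# A minimal usage sketch for default_schedule (illustrative, assumptions as
# above). Any chain of injective ComputeOps works: every collected op is split
# along its outermost axis and bound to CUDA blocks/threads. Requires a visible
# CUDA device, since default_schedule checks the context before scheduling.
def _default_schedule_usage_example():
    import topi
    a = tvm.placeholder((4096,), name="a")
    b = topi.multiply(a, 2.0)
    c = topi.add(b, 1.0)
    sch = default_schedule(c)
    return tvm.build(sch, [a, c], "cuda")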