def test():
    # create an empty task, but with the correct key we want
    task = Task("yolo1", None, (1, 3, 448, 448, 64, 7, 2, 3, 1, 1), "llvm", 0)
    beg = time.time()
    # s, bufs, configs = schedule(task.key)
    end = time.time()
    # print(tvm.lower(s, bufs, simple_mode=True))
    # print("######################################")
    # print("op schedules:")
    # for config in configs.op_config_lst:
    #     print("----------------------------------")
    #     for name, value in config.items():
    #         if value:
    #             print(name, value)
    # print("graph schedules:")
    # for name, value in configs.graph_config.items():
    #     if value:
    #         print(name, value)
    op_configs = [
        {
            "spatial": [[1, 1, 1, 1], [1, 1, 1, 3], [454, 1, 1, 1], [1, 227, 2, 1]],
            "unroll": [[1500, 1]]
        },
        {
            "spatial": [[1, 1, 1, 1], [2, 4, 2, 4], [8, 1, 4, 7], [7, 1, 16, 2]],
            "reduce": [[1, 3, 1], [7, 1, 1], [7, 1, 1]],
            "unroll": [[1500, 1]]
        }
    ]
    graph_config = {"inline": [[0, 0]]}
    configs = Config(op_configs, graph_config)
    s, bufs = schedule_with_config(task.key, configs)
    time_cost = _evaluate(s, bufs, "llvm", 0, 10)
    print("Use", time_cost, "ms")
    print("Cost", end - beg, "s")
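# A hedged sketch, not part of the original script: saving the hand-written
# configs above in the same "task_key:json" line format that the optimize()
# drivers later in this collection emit, assuming Config serializes through
# json.dumps the way those drivers rely on. The file name is only an example.
import json

def dump_config_line(task_key, configs, path="hand_written_configs.log"):
    # one replayable line per task: "<task_key>:<json-serialized configs>"
    with open(path, "a") as logfile:
        print(task_key + ":" + json.dumps(configs), file=logfile, flush=True)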
def try_yolo_conv_cuda(batch_size=1):
    # get the compute
    yolo_conv = YoloConvLayer10()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    s = tvm.te.create_schedule(outputs.op)
    schedule_yolo_conv_cuda(s, outputs, inputs, weight)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)
    dev_id = 3
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.tir.ir_pass.VerifyGPUCode(stmt, kwargs)
    print(verify)

    time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
    print("Yolo conv10 use", time_cost, "ms")
def test(task_key, configs, dev_id=None):
    task = TASK_TABLE[task_key]
    s, bufs = schedule_with_config(task_key, configs)
    dev_id = dev_id if dev_id is not None else task.dev_id
    time_cost = _evaluate(s, bufs, task.target, dev_id, 10)
    print(task_key, "use", time_cost, "ms")
    print()
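# A hedged usage sketch for test(): replay one "task_key:json" line as written by
# the optimize() drivers in this collection. Config(op_config_lst, graph_config)
# mirrors the constructor used in the first script above; the assumption that the
# JSON decodes back into a two-element (op configs, graph config) pair is
# illustrative, not guaranteed by this file.
import json

def test_from_logline(line, dev_id=None):
    task_key, string = line.split(":", 1)
    op_config_lst, graph_config = json.loads(string)
    test(task_key, Config(op_config_lst, graph_config), dev_id=dev_id)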
def try_yolo_conv(config, parameter, fsch):
    # get the compute
    # (1, 3, 448, 448, 64, 3, 7, 7, 1, 2, 3, 1, 1)
    batch, CI, H, W, CO, _, kh, kw, _, st, pad, dilation, group = config
    inputs = tvm.placeholder((batch, CI, H, W), dtype="float32")
    weight = tvm.placeholder((CO, CI, kh, kw), dtype="float32")
    outputs = conv2d_nchw(inputs, weight, stride=st, padding=pad, dilation=dilation, groups=group)

    s = tvm.create_schedule(outputs.op)
    fsch(s, outputs, inputs, weight, parameter)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 2
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
    # print("config is:\n %s" % (str(config)))
    if verify:
        print("Valid kernel")
        time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
        print("Yolo conv use", time_cost, "ms\n")
    else:
        print("Invalid kernel")
        time_cost = float("inf")
    return time_cost
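# A hedged driver sketch for the function above, using the shape tuple from its
# comment. my_schedule_func is a placeholder, not a function from this repo; it
# only has to accept (s, outputs, inputs, weight, parameter) as fsch does above,
# and the parameter value here is likewise illustrative.
yolo_shape = (1, 3, 448, 448, 64, 3, 7, 7, 1, 2, 3, 1, 1)
cost = try_yolo_conv(yolo_shape, parameter=None, fsch=my_schedule_func)
print("observed cost:", cost, "ms")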
def optimize(prefix, from_, shapes, target="llvm", dev_id=0, trials=100, timeout=4.0,
             parallel=1, method="searching", use_model=False, logfile=sys.stdout):
    ret = dict()
    for i, shape in enumerate(shapes):
        print("Optimize {} conv_transpose2d layer {} shape {}".format(prefix, i + 1 + from_, shape), flush=True)
        batch, in_channel, height, width, out_channel, _, k_h, k_w, _, stride, padding, dilation, groups = shape
        rout_channel = in_channel
        rin_channel = out_channel
        rheight = (height + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
        rwidth = (width + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
        # create an empty task, but with the correct key we want
        task = Task(
            "conv_transpose2d",
            prefix + str(i + from_),
            None,
            (batch, rin_channel, rheight, rwidth, rout_channel, k_h, stride, padding, dilation, groups),
            target,
            dev_id)
        beg = time.time()
        s, bufs, configs = schedule(
            task.key,
            op_trial=trials,
            timeout=timeout,
            op_stop=30,
            parallel=parallel,
            method=method,
            use_model=use_model,
            trials=[trials // 10, trials // 10, trials])
        end = time.time()
        # print(tvm.lower(s, bufs, simple_mode=True))
        print("######################################")
        print("op schedules:")
        for config in configs.op_config_lst:
            print("----------------------------------")
            for name, value in config.items():
                if value:
                    print(name, value)
        print("graph schedules:")
        for name, value in configs.graph_config.items():
            if value:
                print(name, value)
        ret[task.key] = configs
        string = json.dumps(configs)
        line = task.key + ":" + string
        print(line, file=logfile, flush=True)
        s, bufs = schedule_with_config(task.key, configs)
        time_cost = _evaluate(s, bufs, target, task.dev_id, 10)
        print("Use", time_cost, "ms")
        print("Cost", end - beg, "s")
        print()
    return ret
def optimize(shapes, slevel=4, rlevel=3, target="llvm", dev_id=0, timeout=4.0, trials=100,
             parallel=1, method="searching", use_model=False, logfile=sys.stdout):
    ret = dict()
    for i, shape in enumerate(shapes):
        print("Optimize dilation conv2d shape {}".format(shape), flush=True)
        batch, in_channel, H, W, out_channel, k, _, stride, padding, dilation, groups = shape
        # create an empty task, but with the correct key we want
        task = Task(
            "conv2d",
            "dilation",
            None,
            (batch, in_channel, H, W, out_channel, k, stride, padding, dilation, groups),
            target,
            dev_id)
        beg = time.time()
        s, bufs, configs = schedule(
            task.key,
            slevel=slevel,
            rlevel=rlevel,
            op_trial=trials,
            timeout=timeout,
            op_stop=30,
            method=method,
            use_model=use_model,
            parallel=parallel)
        end = time.time()
        # print(tvm.lower(s, bufs, simple_mode=True))
        print("######################################")
        print("op schedules:")
        for config in configs.op_config_lst:
            print("----------------------------------")
            for name, value in config.items():
                if value:
                    print(name, value)
        print("graph schedules:")
        for name, value in configs.graph_config.items():
            if value:
                print(name, value)
        ret[task.key] = configs
        string = json.dumps(configs)
        line = task.key + ":" + string
        print(line, file=logfile, flush=True)
        s, bufs = schedule_with_config(task.key, configs)
        time_cost = _evaluate(s, bufs, target, task.dev_id, 10)
        print("Use", time_cost, "ms")
        print("Cost", end - beg, "s")
        print()
    return ret
def optimize(shapes, slevel=4, rlevel=3, target="llvm", dev_id=0, timeout=4.0, trials=100,
             parallel=1, method="searching", use_model=False, rpc_info=None, logfile=sys.stdout):
    ret = dict()
    for i, shape in enumerate(shapes):
        print("Optimize gemm shape %s [%.6f]" % (str(shape), time.time()), flush=True)
        N, K, M = shape
        # create an empty task, but with the correct key we want
        task = Task(
            "gemm",
            "gemm",
            None,
            (N, K, M),
            target,
            dev_id
        )
        beg = time.time()
        s, bufs, configs = schedule(
            task.key,
            slevel=slevel,
            rlevel=rlevel,
            op_trial=trials,
            timeout=timeout,
            op_stop=30,
            method=method,
            use_model=use_model,
            parallel=parallel,
            rpc_info=rpc_info
        )
        end = time.time()
        # print(tvm.lower(s, bufs, simple_mode=True))
        print("###################################### [%.6f]" % time.time())
        print("op schedules:")
        for config in configs.op_config_lst:
            print("----------------------------------")
            for name, value in config.items():
                if value:
                    print(name, value)
        print("graph schedules:")
        for name, value in configs.graph_config.items():
            if value:
                print(name, value)
        ret[task.key] = configs
        string = json.dumps(configs)
        line = task.key + ":" + string
        print(line, file=logfile, flush=True)
        s, bufs = schedule_with_config(task.key, configs)
        time_cost = _evaluate(s, bufs, target, task.dev_id, 10)
        print("Use", time_cost, "ms")
        print("Cost", end - beg, "s")
        print()
    return ret
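# A hedged example of driving the gemm optimizer above. The (N, K, M) sizes,
# trial budget, and log file name are chosen for illustration only.
if __name__ == "__main__":
    gemm_shapes = [
        (1024, 1024, 1024),
        (512, 256, 512),
    ]
    with open("gemm_opt_config.log", "a") as f:
        results = optimize(gemm_shapes, target="cuda", dev_id=0,
                           trials=200, parallel=4, logfile=f)
    print("optimized", len(results), "gemm tasks")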
def try_yolo_conv(batch_size, config):
    global __COUNTER__
    __COUNTER__ += 1
    # get the compute
    yolo_conv = YoloConvLayer17()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    s = tvm.create_schedule(outputs.op)
    schedule_yolo_conv_llvm(s, outputs, inputs, weight, config)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 0
    time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, 10)
    print("Yolo conv17 use", time_cost, "ms\n")
    return time_cost
def try_yolo_conv(batch_size=1):
    # get the compute
    yolo_conv = YoloConvLayer6()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)
    bias = yolo_conv.get_bias()

    s = tvm.create_schedule(outputs.op)
    schedule_yolo_conv_x86(s, outputs, inputs, weight)

    if bias is None:
        arg_bufs = [inputs, weight, outputs]
    else:
        arg_bufs = [inputs, weight, bias, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)
    dev_id = 1
    time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, 100)
    print("Yolo conv6 use", time_cost, "ms")
def try_yolo_conv(batch_size, config):
    global __COUNTER__
    __COUNTER__ += 1
    # get the compute
    yolo_conv = YoloConvLayer17()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    s = tvm.create_schedule(outputs.op)
    schedule_yolo_conv_cuda(s, outputs, inputs, weight, config)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 0
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
    print("%d. config is:\n %s" % (__COUNTER__, str(config)))
    if verify:
        print("Valid kernel")
        time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
        print("Yolo conv17 use", time_cost, "ms\n")
    else:
        print("Invalid kernel")
        time_cost = float("inf")
    return time_cost
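# A hedged sketch of how try_yolo_conv can serve as the cost function of a plain
# search: invalid kernels come back as float("inf"), so keeping the running
# minimum picks the fastest valid configuration. candidate_configs is
# hypothetical; its entries must be whatever schedule_yolo_conv_cuda expects.
def pick_best_config(candidate_configs, batch_size=1):
    best_config, best_cost = None, float("inf")
    for cfg in candidate_configs:
        cost = try_yolo_conv(batch_size, cfg)
        if cost < best_cost:
            best_config, best_cost = cfg, cost
    return best_config, best_cost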
"conv2d_nchwc", "yolo_conv6", args, target, dev_id=dev_id) for i in [1, 2, 4, 8, 16, 32]: ic_factors[0] = i ic_factors[1] = 256 // i # get compute inputs = tvm.placeholder([N, C // vlen // group, H, W, vlen], dtype="float32") weight = tvm.placeholder( [K // vlen, C // vlen // group, k, k, vlen, vlen], dtype="float32") if use_bias: bias = tvm.placeholder([K // vlen, vlen], dtype="float32") else: bias = None output = conv2d_nchwc(inputs, weight, bias, stride=st, padding=pad, dilation=dilation, groups=group) s = conv2d_nchwc_schedule_avx2_yolo_conv6(output) time_cost = _evaluate(s, [inputs, weight, output], target, dev_id, 10) print("Run time: %f ms, throughput: %f GFLOPS" % (time_cost, N * C * H * W * K * k * k / st / st / group / 1e6 / time_cost))
def try_yolo_conv(batch_size=2, number=100):
    # get the compute
    yolo_conv = SqueezeNetFire8Gemm()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32", name='inputs')
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)
    bias = yolo_conv.get_bias()

    s = tvm.te.create_schedule(outputs.op)
    schedule_yolo_conv_x86(s, outputs, inputs, weight, bias)

    arg_bufs = [inputs, weight, bias, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)
    dev_id = 0
    time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, number=number)
    print("Yolo conv24 use", time_cost, "ms")

    """ For pytorch """
    out_channel, in_channel, kernel_height, kernel_width = yolo_conv.weight_shape
    padding, stride, dilation, groups = (yolo_conv.padding, yolo_conv.stride,
                                         yolo_conv.dilation, yolo_conv.groups)
    conv2d_torch = torch.nn.Conv2d(in_channel, out_channel, (kernel_height, kernel_width),
                                   padding=padding, stride=stride, dilation=dilation, groups=groups)
    # warm up
    inputs = torch.rand(batch_size, *input_shape)
    res = conv2d_torch(inputs)

    times = time.time()
    for _ in range(number):
        res = conv2d_torch(inputs)
    times = time.time() - times
    print("Pytorch on cpu use: {}ms".format(times / number * 1e3))

    # to test the correctness; currently the result is wrong because of the schedule
    # if you change line 148 to 'outer = s[write_cache].fuse(gemm_g, gemm_go)'
    # the result is correct
    ctx = tvm.device("llvm", 0)
    inputs_np = np.random.random(inputs.shape).astype("float32") * 100
    weight_np = np.random.random(to_tuple(weight.shape)).astype(weight.dtype) * 100
    outputs_np = np.zeros(shape=to_tuple(outputs.shape), dtype=np.float32)
    bias_np = np.random.random(size=to_tuple(bias.shape)).astype(bias.dtype) * 100

    inputs_tvm = tvm.nd.array(inputs_np, ctx)
    weight_tvm = tvm.nd.array(weight_np, ctx)
    outputs_tvm = tvm.nd.array(outputs_np, ctx)
    bias_tvm = tvm.nd.array(bias_np, ctx)

    inputs_torch = torch.tensor(inputs_np)
    weight_torch = torch.tensor(weight_np)
    bias_torch = torch.tensor(bias_np)

    func_tvm = tvm.build(s, arg_bufs, "llvm")
    func_tvm(inputs_tvm, weight_tvm, bias_tvm, outputs_tvm)
    outputs_torch = torch.nn.functional.conv2d(inputs_torch, weight_torch, bias=bias_torch,
                                               padding=padding, stride=stride,
                                               dilation=dilation, groups=groups)
    the_same = test_allclose(outputs_tvm.asnumpy(), outputs_torch.numpy(), rtol=1e-5, print_diff=True)
    if the_same:
        print("The same!")
    else:
        print("Not the same!")
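# The test_allclose helper used above is defined elsewhere in the repo; a minimal
# stand-in (an assumption, not the repo's implementation) could be built on
# numpy.allclose and report the largest absolute difference when asked to.
import numpy as np

def test_allclose_sketch(result, expected, rtol=1e-5, print_diff=False):
    # True when every element matches within the relative tolerance
    close = np.allclose(result, expected, rtol=rtol)
    if print_diff and not close:
        print("max abs diff:", np.max(np.abs(result - expected)))
    return close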
        use_model=use_model,
        trials=[trials // 10, trials],
        force_inline=force_inline,
        rpc_info=rpc_info,
        slevel=2,
        rlevel=2)
    end = time.time()
    print("######################################")
    print("op schedules:")
    for config in configs.op_config_lst:
        print("----------------------------------")
        for name, value in config.items():
            if value:
                print(name, value)
    print("graph schedules:")
    for name, value in configs.graph_config.items():
        if value:
            print(name, value)
    string = json.dumps(configs)
    line = task.key + ":" + string
    print(line, file=logfile, flush=True)
    s, bufs = schedule_with_config(task.key, configs)
    time_cost = _evaluate(s, bufs, target, dev_id, 10)
    print("Use", time_cost, "ms",
          "throughput: %f GFLOPS" % (N * C * H * W * K * k * k / st / st / group / 1e6 / time_cost))
    print("Cost", end - beg, "s")
    logfile.close()
roots = set(groups)
for id in roots:
    tmp = []
    for i in range(g.num_nodes):
        if groups[i] == id:
            tmp.append(i)
    print("Group %d: %s" % (id, str(tmp)))

# predict
logits = {}
logits["spatial"] = [output[9], output[10]]
logits["reduce"] = [output[11]]
logits["unroll"] = unroll_output
config = gemm_config(M, N, K, logits)

# compute
A = tvm.placeholder((M, K))
B = tvm.placeholder((K, N))
C = gemm(A, B)

# schedule
s, bufs = schedule_ops([C.op], [A, B, C], config, target="cuda")

# build
func = tvm.build(s, bufs, "cuda")
print(func.imported_modules[0].get_source())

# run
cost = _evaluate(s, bufs, "cuda", 0, 10)
print("time cost is:", cost)
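# A hedged follow-up sketch: checking the built CUDA gemm against numpy. It
# assumes the computed C is the plain matrix product of A (M, K) and B (K, N),
# matching the placeholders above; M, N, K and func come from the surrounding
# script, and tvm.nd.context is the same helper the other scripts here use.
import numpy as np

a_np = np.random.uniform(size=(M, K)).astype("float32")
b_np = np.random.uniform(size=(K, N)).astype("float32")
c_np = np.zeros((M, N), dtype="float32")

ctx = tvm.nd.context("cuda", 0)
a_tvm = tvm.nd.array(a_np, ctx)
b_tvm = tvm.nd.array(b_np, ctx)
c_tvm = tvm.nd.array(c_np, ctx)
func(a_tvm, b_tvm, c_tvm)
print("matches numpy:", np.allclose(c_tvm.asnumpy(), a_np.dot(b_np), rtol=1e-4))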