def test_fused_relu_grad_bn_update_grad(shape, out_shape, dtype="float16", layout="NHWC", out_dtype="float32", poly_sch=False):
    """Build, launch, verify and profile the fused_relu_grad_bn_update_grad kernel.

    Args:
        shape: shape used for the last three kernel inputs.
        out_shape: shape used for the first kernel input.
        dtype: dtype paired with ``shape``.
        layout: passed to the operator as its single op attribute.
        out_dtype: dtype paired with ``out_shape``.
        poly_sch: if true the ``_auto`` operator is built with a cuda target,
            otherwise the ``_manual`` operator is built.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    if poly_sch:
        kernel, extra = fused_relu_grad_bn_update_grad_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_relu_grad_bn_update_grad_manual, {}
    mod = utils.op_build(kernel,
                         [out_shape] + [shape] * 3,
                         [out_dtype] + [dtype] * 3,
                         op_attrs=[layout],
                         **extra)

    head, data_sum, in_bn, in_active, output, expect = gen_data(shape, out_shape, dtype, out_dtype, layout)
    inputs = [data_sum, in_bn, head, in_active]
    # The same placeholder array fills both output slots.
    out_slots = [output, output]
    launched = utils.mod_launch(mod,
                                inputs + out_slots,
                                outputs=tuple(range(-len(out_slots), 0)),
                                expect=expect)

    res = np.allclose(launched, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        # Dump the generated source to help diagnose the mismatch.
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_inputs = to_tvm_nd_array(inputs)
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_inputs, *tvm_expect, 400)
def test_fused_bn_double_follow_relu(in_shape, in_dtype='float16', layout='NHWC', out_dtype='float16', poly_sch=False):
    """Build, launch, verify and profile the fused_bn_double_follow_relu kernel.

    Args:
        in_shape: forwarded to ``gen_data`` to create the inputs.
        in_dtype: dtype of the 5th and 10th kernel inputs.
        layout: must be "NHWC" or "NCHW"; also passed as an op attribute.
        out_dtype: passed to ``gen_data`` and as an op attribute.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        NotImplementedError: for any other layout.
        AssertionError: when the launched result is not close to ``expect``.
    """
    if layout not in ("NHWC", "NCHW"):
        raise NotImplementedError('Layout not supported {} '.format(layout))

    inter_dtype = 'float32'
    inputs, output, expect = gen_data(in_shape, in_dtype, inter_dtype, layout, out_dtype)
    shapes = [arr.shape for arr in inputs]
    # Two identical groups: four intermediate-dtype tensors followed by one input-dtype tensor.
    dtype_group = [inter_dtype] * 4 + [in_dtype]
    dtypes = dtype_group + dtype_group

    if poly_sch:
        kernel, extra = fused_bn_double_follow_relu_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_bn_double_follow_relu_manual, {}
    mod = utils.op_build(kernel, shapes, dtypes, op_attrs=[layout, out_dtype], **extra)

    output = utils.mod_launch(mod, inputs + [output], expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_inputs = to_tvm_nd_array(inputs)
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_inputs, tvm_expect, 400)
def test_fused_l2loss_grad(shape, layout, fill_data=4e-05, poly_sch=False):
    """Build, launch, verify and profile the fused_l2loss_grad kernel.

    Args:
        shape: shape shared by both inputs.
        layout: passed to ``compute_py`` and as an op attribute.
        fill_data: passed to ``compute_py`` and as an op attribute.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    half_in = gen_data(shape, 'float16')
    single_in = gen_data(shape, 'float32')
    expect, output = compute_py(half_in, single_in, layout, fill_data)

    if poly_sch:
        kernel, extra = fused_l2loss_grad_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_l2loss_grad_manual, {}
    mod = utils.op_build(kernel,
                         [shape, shape],
                         ['float16', 'float32'],
                         op_attrs=[layout, fill_data],
                         **extra)

    output = utils.mod_launch(mod, [half_in, single_in, output], expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data = to_tvm_nd_array([half_in, single_in])
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_data, tvm_expect, 400)
def test_fused_relu_grad(shape, c1=0, poly_sch=False):
    """Build, launch, verify and profile the fused_relu_grad kernel (float16).

    Args:
        shape: shape declared for all three kernel inputs and the output buffer.
        c1: passed to ``compute_expect`` and as the single op attribute.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    dtype = 'float16'
    operands = gen_data(shape, dtype)
    expect = compute_expect(operands, c1)

    if poly_sch:
        kernel, extra = fused_relu_grad_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_relu_grad_manual, {}
    mod = utils.op_build(kernel, [shape] * 3, [dtype] * 3, op_attrs=[c1], **extra)

    # NaN-filled placeholder for the kernel output.
    out_buf = np.full(shape, np.nan, dtype)
    output = utils.mod_launch(mod, (*operands, out_buf), expect=expect)

    res = np.allclose(output, expect, rtol=5e-3, atol=1e-8)
    verdict = "Pass" if res else "Failed"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data = to_tvm_nd_array(operands)
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_data, tvm_expect, 400)
def test_fused_pad(shape, pad_before, pad_after, layout='NHWC', pad_value=0.0, poly_sch=False):
    """Build, launch, verify and profile the fused_pad kernel (float32 input).

    Args:
        shape: shape of the single kernel input.
        pad_before, pad_after, layout, pad_value: passed, in this order, as the
            op attributes and to ``gen_data``.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    if poly_sch:
        kernel, extra = fused_pad_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_pad_manual, {}
    mod = utils.op_build(kernel,
                         [shape],
                         ['float32'],
                         op_attrs=[pad_before, pad_after, layout, pad_value],
                         **extra)

    data, out_buf, expect = gen_data(shape, pad_before, pad_after, layout, pad_value)
    output = utils.mod_launch(mod, (data, out_buf), expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data = to_tvm_nd_array(data)
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, tvm_data, tvm_expect, 400)
def cholesky_run(shape, dtype, attrs):
    """Build the cholesky operator and either run it or hand it back for tuning.

    Args:
        shape: shape of the single operator input.
        dtype: dtype of the single operator input.
        attrs: build attributes; the presence of a ``'tuning'`` key selects
            the tuning path.

    Returns:
        Without a ``'tuning'`` key: ``(inputs, acu_output, exp_output, result)``
        where ``result`` is the ``np.allclose`` comparison.
        With a ``'tuning'`` key: ``(mod, exp_output, (inputs, output))`` when the
        tuning value is truthy, otherwise just ``mod``.
    """
    if 'tuning' not in attrs.keys():
        # Regular path: build, launch and compare against the expected result.
        mod = utils.op_build(cholesky.cholesky, [shape], [dtype], kernel_name='cholesky', attrs=attrs)
        exp_output, inputs, output = gen_data(dtype, shape)
        acu_output = utils.mod_launch(mod, (inputs, output), expect=exp_output)
        TestCase_Result = np.allclose(acu_output, exp_output, rtol=5e-03, equal_nan=True)
        return inputs, acu_output, exp_output, TestCase_Result

    tuning = attrs.get("tuning", False)
    kernel_name = attrs.get("kernel_name", False)
    mod = utils.op_build(cholesky.cholesky, [shape], [dtype],
                         kernel_name=kernel_name, attrs=attrs, tuning=tuning)
    if not tuning:
        return mod
    exp_output, inputs, output = gen_data(dtype, shape)
    return mod, exp_output, (inputs, output)
def test_fused_mul_div_rsqrt_mul_isfinite_red(shape, dtype='float32', poly_sch=False):
    """Build, launch, verify and profile the fused_mul_div_rsqrt_mul_isfinite_red kernel.

    The kernel is launched with four output slots: one boolean array of shape
    ``(1,)`` followed by three slots of ``shape``/``dtype``.

    Args:
        shape: shape of both kernel inputs.
        dtype: dtype of both kernel inputs.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when any of the four outputs fails ``compare_tensor``.
    """
    operands = gen_data(shape, dtype)
    expect = compute_expect(operands)

    if poly_sch:
        kernel, extra = fused_mul_div_rsqrt_mul_isfinite_red_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_mul_div_rsqrt_mul_isfinite_red_manual, {}
    mod = utils.op_build(kernel, [shape, shape], [dtype, dtype], **extra)

    # The three trailing slots share one NaN-filled placeholder array.
    out_slots = [np.full((1, ), False, 'bool')] + [np.full(shape, np.nan, dtype)] * 3
    output = utils.mod_launch(mod,
                              [*operands, *out_slots],
                              outputs=list(range(-len(out_slots), 0)),
                              expect=expect)

    # Every output is compared; the results are and-ed together.
    ret = compare_tensor(output[0], expect[0], rtol=5e-03, atol=1.e-08)
    for idx in (1, 2, 3):
        ret &= compare_tensor(output[idx], expect[idx], rtol=5e-03, atol=1.e-08)

    verdict = "Pass" if ret else "Failed"
    print("Test {}".format(verdict))
    if not ret:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data = to_tvm_nd_array(operands)
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_data, *tvm_expect, 400)
def test_fused_relu_grad_bn_double_update_grad(shape_f16, shape_f32, layout='NHWC', poly_sch=False):
    """Build, launch, verify and profile the fused_relu_grad_bn_double_update_grad kernel.

    Seven inputs are generated: float32, float16, float32 and then four
    float16 tensors, using ``shape_f32`` / ``shape_f16`` respectively.

    Args:
        shape_f16: shape of the float16 inputs.
        shape_f32: shape of the float32 inputs.
        layout: passed to ``compute_py`` and as the single op attribute.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when any of the three outputs is not close to its
            expected value.
    """
    specs = [(shape_f32, 'float32'), (shape_f16, 'float16'), (shape_f32, 'float32')]
    specs += [(shape_f16, 'float16')] * 4
    data_list = [gen_data(shp, dt) for shp, dt in specs]
    shape_list = [shp for shp, _ in specs]
    dtype_list = [dt for _, dt in specs]

    data_tmp7, data_tmp15, data_tmp22, out_shape = compute_py(*data_list, layout)
    expect = [data_tmp7, data_tmp15, data_tmp22]
    # One NaN-filled placeholder array is reused for all three output slots.
    out_slots = [np.full(out_shape, np.nan, 'float32')] * 3

    if poly_sch:
        kernel, extra = fused_relu_grad_bn_double_update_grad_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_relu_grad_bn_double_update_grad_manual, {}
    mod = utils.op_build(kernel, shape_list, dtype_list, op_attrs=[layout], **extra)

    output = utils.mod_launch(mod,
                              (*data_list, *out_slots),
                              outputs=tuple(range(-len(out_slots), 0)),
                              expect=expect)

    res = True
    for idx in range(3):
        res &= np.allclose(output[idx], expect[idx], rtol=5e-03, atol=1e-8)

    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data = to_tvm_nd_array(data_list)
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_data, *tvm_expect, 400)
def test_fused_relu_grad_bn_reduce_grad(shape_1, shape_2, layout='NHWC', poly_sch=False):
    """Build, launch, verify and profile the fused_relu_grad_bn_reduce_grad kernel.

    Nine inputs are generated: six float32 tensors of ``shape_1`` followed by
    three float16 tensors of ``shape_2``.

    Args:
        shape_1: shape of the float32 inputs.
        shape_2: shape of the float16 inputs.
        layout: passed to ``compute_py`` and as the single op attribute.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    specs = [(shape_1, 'float32')] * 6 + [(shape_2, 'float16')] * 3
    tensors = [gen_data(shp, dt) for shp, dt in specs]
    expect, output = compute_py(*tensors, layout)

    input_list = [shp for shp, _ in specs]
    dtype_list = [dt for _, dt in specs]
    if poly_sch:
        kernel, extra = fused_relu_grad_bn_reduce_grad_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_relu_grad_bn_reduce_grad_manual, {}
    mod = utils.op_build(kernel, input_list, dtype_list, op_attrs=[layout], **extra)

    output = utils.mod_launch(mod, [*tensors, output], expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1e-08)
    verdict = "Pass" if res else "Failed"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_inputs = to_tvm_nd_array(list(tensors))
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_inputs, tvm_expect, 400)
def test_ms_select(shape_cond, shape_x, dtype_cond, dtype_x, poly_sch=False):
    """Build, launch and verify the select kernel.

    Args:
        shape_cond: shape of the condition input.
        shape_x: shape shared by the two value inputs.
        dtype_cond: dtype of the condition input.
        dtype_x: dtype shared by the two value inputs.
        poly_sch: if true build ``select_auto`` with a cuda target,
            otherwise ``select_manual``.

    Raises:
        AssertionError: when ``compare_tensor`` reports a mismatch.
    """
    if poly_sch:
        kernel, extra = select_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = select_manual, {}
    mod = utils.op_build(kernel,
                         [shape_cond, shape_x, shape_x],
                         [dtype_cond, dtype_x, dtype_x],
                         **extra)

    expect, cond, x1, x2, out_buf = gen_data(shape_cond, shape_x, dtype_cond, dtype_x)
    output = utils.mod_launch(mod, (cond, x1, x2, out_buf), expect=expect)

    res = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
def test_ms_resize_grad(shape, size, dtype, align_corners, poly_sch=False):
    """Build, launch and verify the resize_nearest_neighbor_grad kernel.

    Args:
        shape: shape of the single kernel input.
        size: first op attribute; also passed to ``gen_data``.
        dtype: dtype of the single kernel input.
        align_corners: second op attribute; also passed to ``gen_data``.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when ``compare_tensor`` reports a mismatch.
    """
    op_attr = [size, align_corners]
    if poly_sch:
        mod = utils.op_build(resize_nearest_neighbor_grad_auto, [shape], [dtype], op_attr,
                             attrs={"target": "cuda"})
    else:
        mod = utils.op_build(resize_nearest_neighbor_grad_manual, [shape], [dtype], op_attr)

    data, output, expect = gen_data(shape, size, align_corners, dtype)
    output = utils.mod_launch(mod, (data, output), expect=expect)

    # The comparison result used to be computed and dropped, so this test could
    # never fail; report it and raise like the sibling tests do.
    compare_res = compare_tensor(output, expect, rtol=5e-03, atol=1e-08)
    print("Test {}".format("Pass" if compare_res else "Fail"))
    if not compare_res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
def _compilewithjson_cuda(op_func):
    """Build ``op_func`` from the kernel description held in ``kernel_info``.

    ``kernel_info`` and ``attrs`` are not parameters of this function; they
    are resolved from a surrounding scope that is not visible here.

    Args:
        op_func: operator function handed to ``kernel_exec.op_build``.

    Returns:
        Always ``True`` once the build call has returned.
    """
    input_shapes = []
    input_types = []
    for input_desc in kernel_info['input_desc']:
        # Only the first entry of each input description is consulted.
        first = input_desc[0]
        input_shapes.append(first['shape'])
        input_types.append(first['data_type'])

    ext_args = kernel_info['attr']
    op_attrs = [ext_arg['value'] for ext_arg in ext_args] if ext_args else []

    # Dumping is enabled only when the corresponding environment variable is "on".
    dump_ir = os.getenv(get_dump_ir_flag()) == "on"
    dump_code = os.getenv(get_dump_code_flag()) == "on"

    kernel_exec.op_build(op_func, input_shapes, input_types, op_attrs, kernel_info['op'],
                         attrs=attrs, dump_ir=dump_ir, dump_code=dump_code)
    return True
def test_ms_neg(shape, dtype, poly_sch=False):
    """Build, launch, verify and profile the neg kernel.

    Args:
        shape: shape of the single kernel input.
        dtype: dtype of the single kernel input.
        poly_sch: if true build ``neg_auto`` with a cuda target,
            otherwise ``neg_manual``.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    if poly_sch:
        kernel, extra = neg_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = neg_manual, {}
    mod = utils.op_build(kernel, [shape], [dtype], **extra)

    data, out_buf, expect = gen_data(shape, dtype)
    output = utils.mod_launch(mod, (data, out_buf), expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data, tvm_expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, tvm_data, tvm_expect, 400)
def batch_cholesky_trsm_run(shape1, shape2, dtype, attrs):
    """Build and run the batch_cholesky_trsm operator and compare its result.

    The first batch entry of the input, the actual output and the expected
    output are printed for inspection.

    Args:
        shape1: shape of the first operator input.
        shape2: shape of the second operator input.
        dtype: dtype of both operator inputs.
        attrs: build attributes forwarded to ``utils.op_build``.

    Returns:
        ``(inputs1, acu_output, exp_output, result)`` where ``result`` is the
        ``np.allclose`` comparison (NaNs compare equal).
    """
    mod = utils.op_build(batch_cholesky_trsm.batch_cholesky_trsm,
                         [shape1, shape2],
                         [dtype, dtype],
                         kernel_name='batch_cholesky_trsm',
                         attrs=attrs)
    exp_output, inputs1, inputs2, out_buf = gen_data(dtype, shape1, shape2)
    acu_output = utils.mod_launch(mod, (inputs1, inputs2, out_buf))

    # Print the first batch entry of input, actual and expected output.
    for first_entry in (inputs1[0, :, :], acu_output[0, :, :], exp_output[0, :, :]):
        print("====")
        print(first_entry)

    TestCase_Result = np.allclose(acu_output, exp_output, rtol=5e-03, equal_nan=True)
    return inputs1, acu_output, exp_output, TestCase_Result
def triplet_loss_ad_run(shape, dtype, margin=12.0, kernel_name="triplet_loss_grad", attrs=None):
    """Build and run the triplet_loss_ad operator for each of its three inputs.

    Args:
        shape: shape of the anchor / positive / negative tensors.
        dtype: "float16" or "float32".
        margin: margin added to the forward distance difference; also an op
            attribute.
        kernel_name: replaced by ``attrs["kernel_name"]`` in tuning mode and not
            used otherwise (the kernel is then named 'triplet_loss_ad').
        attrs: build attributes; a ``'tuning'`` key selects the tuning path.
            ``None`` (the default) means an empty dict.

    Returns:
        Tuning path: ``(mod, expect, (grad, anchor, pos, neg, output))`` when the
        tuning value is truthy, otherwise ``mod``.
        Regular path: ``(grad, tuple(output), tuple(expect), assert_res)``.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if attrs is None:
        attrs = {}

    support_list = {"float16": np.float16, "float32": np.float32}
    anchor = np.arange(np.prod(shape)).reshape(shape).astype(dtype)
    pos = anchor + 0.5
    neg = anchor + 2.0

    # Forward result, turned into a 0/1 mask.
    d_pos = np.sum((anchor - pos) * (anchor - pos), -1)
    d_neg = np.sum((anchor - neg) * (anchor - neg), -1)
    output_forward = margin + d_pos - d_neg
    output_forward[output_forward < 0.0] = 0.0
    output_forward[output_forward > 0.0] = 1.0

    d_pos1 = anchor - pos
    d_neg1 = anchor - neg
    input_shapes = None
    input_dtypes = [dtype, dtype, dtype, dtype]

    if 'tuning' in attrs.keys():
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        grad = random_gaussian(shape[:-1], miu=1, sigma=0.1).astype(support_list[dtype])
        input_shapes = [grad.shape, shape, shape, shape]
        # NOTE(review): both branches return during the first iteration, so only
        # input_id 0 is ever built in tuning mode; confirm this is intended.
        for input_id in range(3):
            mod = utils.op_build(triplet_loss_ad.triplet_loss_ad, input_shapes, input_dtypes,
                                 op_attrs=[margin, input_id], kernel_name=kernel_name,
                                 attrs=attrs, tuning=t)
            if t:
                expect, output = gen_data(d_neg1, d_pos1, dtype, grad, input_id, output_forward)
                return mod, expect, (grad, anchor, pos, neg, output)
            return mod

    grad = random_gaussian(shape[:-1], miu=1, sigma=0.1).astype(support_list[dtype])
    input_shapes = [grad.shape, shape, shape, shape]
    assert_res = True
    output_all = list()
    expect_all = list()
    # Testing AD for 3 inputs of the triplet_loss op:
    # 0 - for "anchor_output"
    # 1 - for "positive_output"
    # 2 - for "negative_output"
    for input_id in range(3):
        mod = utils.op_build(triplet_loss_ad.triplet_loss_ad, input_shapes, input_dtypes,
                             op_attrs=[margin, input_id], kernel_name='triplet_loss_ad',
                             attrs=attrs)
        expect, output = gen_data(d_neg1, d_pos1, dtype, grad, input_id, output_forward)
        output = utils.mod_launch(mod, [grad, anchor, pos, neg, output])
        assert_res &= compare_tensor(output, expect, rtol=5e-03, atol=5e-2, equal_nan=True)
        output_all.append(output)
        expect_all.append(expect)

    # NOTE(review): the return uses the last iteration's output/expect, not the
    # accumulated output_all/expect_all; left as is because callers may rely on it.
    return grad, tuple(output), tuple(expect), assert_res
def test_ms_rsqrt(shape1, dtype, poly_sch=False):
    """Build, launch, verify and profile the rsqrt kernel.

    Args:
        shape1: shape of the single kernel input.
        dtype: dtype of the single kernel input.
        poly_sch: if true build ``rsqrt_auto`` with a cuda target,
            otherwise ``rsqrt_manual``.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    if poly_sch:
        kernel, extra = rsqrt_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = rsqrt_manual, {}
    mod = utils.op_build(kernel, (shape1,), (dtype,), **extra)

    expect, input1, out_buf = gen_data(dtype, shape1)
    output = utils.mod_launch(mod, (input1, out_buf), expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_input, tvm_expect = to_tvm_nd_array([input1, expect])
    gpu_profiling(mod, tvm_input, tvm_expect, 400)
def test_ms_minimum(shape1, shape2, dtype, poly_sch=False):
    """Build, launch, verify and profile the minimum kernel.

    Args:
        shape1: shape of the left-hand input.
        shape2: shape of the right-hand input.
        dtype: dtype shared by both inputs.
        poly_sch: if true build ``minimum_auto`` with a cuda target,
            otherwise ``minimum_manual``.

    Raises:
        AssertionError: when ``compare_tensor`` reports a mismatch.
    """
    if poly_sch:
        kernel, extra = minimum_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = minimum_manual, {}
    mod = utils.op_build(kernel, (shape1, shape2), (dtype, dtype), **extra)

    lhs, rhs, out_buf, expect = gen_data(shape1, shape2, dtype)
    output = utils.mod_launch(mod, (lhs, rhs, out_buf), expect=expect)

    res = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_lhs, tvm_rhs, tvm_expect = to_tvm_nd_array([lhs, rhs, expect])
    gpu_profiling(mod, tvm_lhs, tvm_rhs, tvm_expect, 400)
def test_ms_one_hot(shape, depth, dtype, on_value, off_value, axis, poly_sch=False):
    """Build, launch, verify and profile the one_hot kernel.

    Args:
        shape: shape of the single kernel input.
        depth, on_value, off_value, axis: op attributes (passed in the order
            on_value, off_value, depth, axis, dtype) and ``gen_data`` arguments.
        dtype: dtype of the kernel input; also the last op attribute.
        poly_sch: if true build ``one_hot_auto`` with a cuda target,
            otherwise ``one_hot_manual``.

    Raises:
        AssertionError: when ``compare_tensor`` reports a mismatch.
    """
    op_attrs = [on_value, off_value, depth, axis, dtype]
    if poly_sch:
        kernel, extra = one_hot_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = one_hot_manual, {}
    mod = utils.op_build(kernel, [shape], [dtype], op_attrs=op_attrs, **extra)

    # The on/off value tensors returned by gen_data are not needed here.
    expect, raw_data, _on_tensor, _off_tensor, out_buf = gen_data(axis, depth, dtype, shape, on_value, off_value)
    data = raw_data.astype(dtype)
    output = utils.mod_launch(mod, (data, out_buf), expect=expect)

    ret = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8, equal_nan=True)
    verdict = "Pass" if ret else "Failed"
    print("Test {}".format(verdict))
    if not ret:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data, tvm_expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, tvm_data, tvm_expect, 400)
def test_fused_is_finite(shape, layout='NHWC', poly_sch=False):
    """Build, launch, verify and profile the fused_is_finite kernel (float32 input).

    Args:
        shape: shape of the single kernel input.
        layout: passed to ``gen_data`` and as the single op attribute.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    if poly_sch:
        kernel, extra = fused_is_finite_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_is_finite_manual, {}
    mod = utils.op_build(kernel, [shape], ['float32'], op_attrs=[layout], **extra)

    data, expect, out_buf = gen_data(shape, 'float32', layout)
    output = utils.mod_launch(mod, (data, out_buf), expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data, tvm_expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, tvm_data, tvm_expect, 400)
def test_ms_trans_data(shape, axes, dtype, poly_sch=False):
    """Build, launch, verify and profile the trans_data kernel.

    Args:
        shape: shape of the single kernel input.
        axes: passed to ``gen_data`` and as the single op attribute.
        dtype: dtype of the single kernel input.
        poly_sch: if true build ``trans_data_auto`` with a cuda target,
            otherwise ``trans_data_manual``.

    Raises:
        AssertionError: when ``compare_tensor`` reports a mismatch.
    """
    if poly_sch:
        mod = utils.op_build(trans_data_auto, [shape], [dtype], op_attrs=[axes],
                             attrs={"target": "cuda"})
    else:
        mod = utils.op_build(trans_data_manual, [shape], [dtype], op_attrs=[axes])

    data, output, expect = gen_data(shape, axes, dtype)
    output = utils.mod_launch(mod, (data, output), expect=expect)

    ret = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8, equal_nan=True)
    print("Test {}".format("Pass" if ret else "Failed"))
    if not ret:
        # Previously a mismatch was only printed and the test still succeeded;
        # fail loudly like the sibling tests.
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_ms_log(in_shape, in_dtype, poly_sch=False):
    """Build, launch, verify and profile the log kernel.

    Args:
        in_shape: shape of the single kernel input.
        in_dtype: dtype of the single kernel input.
        poly_sch: if true build ``log_auto`` with a cuda target,
            otherwise ``log_manual``.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    if poly_sch:
        kernel, extra = log_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = log_manual, {}
    mod = utils.op_build(kernel, (in_shape, ), (in_dtype, ), **extra)

    data, out_buf, expect = gen_data(in_shape, in_dtype)
    output = utils.mod_launch(mod, (data, out_buf), expect=expect)

    # The absolute tolerance here is 1e-7 (it was changed from 1e-8).
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-7)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data, tvm_expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, tvm_data, tvm_expect, 400)
def test_ms_equal(shapes, dtype, poly_sch=False):
    """Build, launch, verify and profile the equal kernel.

    When both input shapes match, the kernel is launched a second time with
    the first input used on both sides, so the expected result is
    ``np.equal(x, x)``.

    Args:
        shapes: the two input shapes.
        dtype: dtype shared by both inputs.
        poly_sch: if true build ``equal_auto`` with a cuda target,
            otherwise ``equal_manual``.

    Raises:
        AssertionError: when any launched result is not close to its
            expected value.
    """
    if poly_sch:
        mod = utils.op_build(equal_auto, shapes, [dtype, dtype], attrs={"target": "cuda"})
    else:
        mod = utils.op_build(equal_manual, shapes, [dtype, dtype])

    inputs1, output1, expect1 = gen_data(shapes, dtype)
    output1 = utils.mod_launch(mod, (*inputs1, output1), expect=expect1)

    same_shape = shapes[0] == shapes[1]
    if same_shape:
        inputs2 = [inputs1[0], inputs1[0]]
        expect2 = np.equal(inputs2[0], inputs2[1])
        output2 = np.full(expect2.shape, 0, bool)
        # Fixed: this launch used to be given expect1, the reference of the
        # first launch, instead of the reference for these inputs.
        output2 = utils.mod_launch(mod, (*inputs2, output2), expect=expect2)

    res = np.allclose(output1, expect1, rtol=5e-03, atol=1.e-8)
    if same_shape:
        res = res and np.allclose(output2, expect2, rtol=5e-03, atol=1.e-8)

    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    if same_shape:
        inputs1 = to_tvm_nd_array(inputs1)
        inputs2 = to_tvm_nd_array(inputs2)
        expect1 = to_tvm_nd_array(expect1)
        expect2 = to_tvm_nd_array(expect2)
        # NOTE(review): both argument sets go to a single gpu_profiling call;
        # confirm that helper accepts more tensors than one launch uses.
        gpu_profiling(mod, *inputs1, expect1, *inputs2, expect2, 400)
    else:
        inputs1 = to_tvm_nd_array(inputs1)
        expect1 = to_tvm_nd_array(expect1)
        gpu_profiling(mod, *inputs1, expect1, 400)
def test_ms_divide(shape, dtype, poly_sch=False):
    """Build, launch, verify and profile the divide kernel.

    Args:
        shape: shape shared by both inputs.
        dtype: dtype shared by both inputs.
        poly_sch: if true build ``divide_auto`` with a cuda target,
            otherwise ``divide_manual``.

    Raises:
        AssertionError: when ``compare_tensor`` reports a mismatch.
    """
    if poly_sch:
        mod = utils.op_build(divide_auto, [shape, shape], [dtype, dtype], attrs={"target": "cuda"})
    else:
        mod = utils.op_build(divide_manual, [shape, shape], [dtype, dtype])

    lhs, rhs, output, expect = gen_data(shape, dtype)
    output = utils.mod_launch(mod, (lhs, rhs, output), expect=expect)

    ret = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8, equal_nan=True)
    print("Test {}".format("Pass" if ret else "Failed"))
    if not ret:
        print("Error cuda:==========================")
        # Fixed typo: was get_soure(), which raised AttributeError on failure
        # instead of dumping the source and raising AssertionError.
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    lhs, rhs, expect = to_tvm_nd_array([lhs, rhs, expect])
    gpu_profiling(mod, lhs, rhs, expect, 400)
def test_ms_addn(shape, dtype, n, poly_sch=False):
    """Build, launch, verify and profile the addn kernel.

    Args:
        shape: shape of every addend.
        dtype: dtype of the addends.
        n: number of addends; ``shape`` is repeated ``n`` times.
        poly_sch: if true build ``addn_auto`` with a cuda target,
            otherwise ``addn_manual``.

    Raises:
        AssertionError: when ``compare_tensor`` reports a mismatch.
    """
    shapes = [shape for _ in range(n)]

    if poly_sch:
        kernel, extra = addn_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = addn_manual, {}
    # The whole list of shapes is passed as one nested entry.
    mod = utils.op_build(kernel, [shapes], [dtype], **extra)

    expect, inputs, out_buf = gen_data(shape, shapes, dtype, n)
    output = utils.mod_launch(mod, (*inputs, out_buf), expect=expect)

    res = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_inputs = to_tvm_nd_array(inputs)
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_inputs, tvm_expect, 400)
def test_ms_reduce_max(in_shape, in_dtype, axis=None, keepdims=False, poly_sch=False):
    """Build, launch, verify and profile the reduce_max kernel.

    Args:
        in_shape: shape of the single kernel input.
        in_dtype: dtype of the single kernel input.
        axis: first op attribute; also passed to ``gen_data``.
        keepdims: second op attribute; also passed to ``gen_data``.
        poly_sch: if true build ``reduce_max_auto`` with a cuda target,
            otherwise ``reduce_max_manual``.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    if poly_sch:
        kernel, extra = reduce_max_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = reduce_max_manual, {}
    mod = utils.op_build(kernel, (in_shape, ), (in_dtype, ), op_attrs=[axis, keepdims], **extra)

    data, out_buf, expect = gen_data(in_shape, in_dtype, axis, keepdims)
    output = utils.mod_launch(mod, (data, out_buf), expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Fail"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data, tvm_expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, tvm_data, tvm_expect, 400)
def test_fused_bn_update(shape, dtype, c1=(1 / (256 * 7 * 7)), c2=1.001e-05, c3=1.00007975, c4=0.100000024, poly_sch=False):
    """Build, launch, verify and profile the fused_bn_update kernel.

    Args:
        shape: passed to ``gen_data``; also the shape of the output buffers.
        dtype: dtype of the four inputs; also the first op attribute.
        c1, c2, c3, c4: constants passed to ``compute_expect`` and as op
            attributes after ``dtype``.
        poly_sch: if true build the ``_auto`` operator with a cuda target,
            otherwise the ``_manual`` operator.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    operands = gen_data(shape, dtype)
    expect = compute_expect(operands, c1, c2, c3, c4)

    # All four declared inputs use the shape of the first generated array.
    shapes = [operands[0].shape] * 4
    dtypes = [dtype] * 4
    if poly_sch:
        kernel, extra = fused_bn_update_auto, {"attrs": {"target": "cuda"}}
    else:
        kernel, extra = fused_bn_update_manual, {}
    mod = utils.op_build(kernel, shapes, dtypes, op_attrs=[dtype, c1, c2, c3, c4], **extra)

    # One NaN-filled placeholder array is reused for all three output slots.
    out_slots = [np.full(shape, np.nan, dtype)] * 3
    output = utils.mod_launch(mod,
                              operands + out_slots,
                              outputs=(range(-len(out_slots), 0)),
                              expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    verdict = "Pass" if res else "Failed"
    print("Test {}".format(verdict))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    tvm_data = to_tvm_nd_array(operands)
    tvm_expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *tvm_data, *tvm_expect, 400)
def test_ms_bmm(shape1, shape2, dtype, shape_bias=None, poly_sch=False):
    """Build, launch, verify and profile the batch_matmul kernel.

    Args:
        shape1: shape of the left-hand input.
        shape2: shape of the right-hand input.
        dtype: dtype shared by all inputs.
        shape_bias: shape of the optional bias input; ``None`` means no bias.
        poly_sch: if true build ``batch_matmul_auto`` with a cuda target,
            otherwise ``batch_matmul_manual``.

    Raises:
        AssertionError: when the launched result is not close to ``expect``.
    """
    has_bias = shape_bias is not None
    dtypes = (dtype, dtype, dtype) if has_bias else (dtype, dtype)
    # shape_bias is passed through even when it is None, as before.
    shapes = (shape1, shape2, shape_bias)
    if poly_sch:
        mod = utils.op_build(batch_matmul_auto, shapes, dtypes, attrs={"target": "cuda"})
    else:
        mod = utils.op_build(batch_matmul_manual, shapes, dtypes)

    lhs, rhs, bias, output, expect = gen_data(shape1, shape2, dtype, shape_bias)
    inputs = (lhs, rhs, bias) if has_bias else (lhs, rhs)
    output = utils.mod_launch(mod, (*inputs, output), expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    # Fixed: profiling used to drop the bias tensor, so with a bias the
    # profiled argument list did not match the one used for the launch.
    profile_args = to_tvm_nd_array([*inputs, expect])
    gpu_profiling(mod, *profile_args, 400)
def gen_kernel_conv_bn1(op_desc: ConvDesc, input_shape, index_table, config: ConvConfig = None, idx=None, gen_tiling_spaces=False):
    """Compile the kernel module for conv_bn1.

    Args:
        op_desc: convolution description; its fmap/filter shapes, pad, stride,
            dilation and use_bias fields become op attributes.
        input_shape: indexable; the first three entries are used when
            ``op_desc.use_bias`` is true, otherwise the first two.
        index_table: must be ``None``.
        config: optional tiling configuration; when ``None`` the build
            attributes are ``{'dim': ""}``.
        idx: optional suffix appended to the kernel name.
        gen_tiling_spaces: forwarded as the ``tuning`` build argument.

    Returns:
        Whatever ``utils.op_build`` returns.

    Raises:
        RuntimeError: if ``index_table`` is not ``None``.
    """
    if index_table is not None:
        raise RuntimeError('index_table should be none')

    kernel_name = "conv_bn1_poly"
    if idx is not None:
        kernel_name = kernel_name + str(idx)

    if config is not None:
        tiling_param = [config.tile_h, config.tile_co, config.tile_m,
                        config.tile_k, config.tile_n, config.tile_w]
        attrs = {'conv_tile': tiling_param, 'bypass': config.bypass}
    else:
        attrs = {'dim': ""}

    entry_count = 3 if op_desc.use_bias else 2
    shape = [input_shape[pos] for pos in range(entry_count)]
    conv_dtype = 'float16'

    # The attrs dict is passed both as the last op attribute and as the build attrs.
    op_attrs = [op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride,
                op_desc.dilation, op_desc.use_bias, attrs]
    return utils.op_build(conv_bn1.conv_bn1, [shape], [conv_dtype],
                          op_attrs=op_attrs,
                          kernel_name=kernel_name,
                          attrs=attrs,
                          polyhedral=True,
                          tuning=gen_tiling_spaces)
def trace_extract_run(shape, dtype, attrs):
    """Build and run the trace_extract operator and compare its result.

    Both the actual and the expected output are printed.

    Args:
        shape: shape of the single operator input.
        dtype: dtype of the single operator input.
        attrs: build attributes forwarded to ``utils.op_build``.

    Returns:
        ``(inputs, acu_output, exp_output, result)`` where ``result`` is the
        ``np.allclose`` comparison (NaNs compare equal).
    """
    mod = utils.op_build(SecondOrder_trace_extract.trace_extract,
                         [shape],
                         [dtype],
                         kernel_name='trace',
                         attrs=attrs)
    exp_output, inputs, out_buf = gen_data(dtype, shape)
    acu_output = utils.mod_launch(mod, (inputs, out_buf), expect=exp_output)

    for banner, value in (('----result----', acu_output), ('----compare---', exp_output)):
        print(banner)
        print(value)

    TestCase_Result = np.allclose(acu_output, exp_output, rtol=5e-03, equal_nan=True)
    return inputs, acu_output, exp_output, TestCase_Result
def diag_split_matrix_run(shape, dtype, attrs):
    """Build and run a diag_split_matrix operator variant and compare its result.

    The variant is chosen from ``dim = shape[0]``:
      * ``dim // split_dim > 32``: the ``_4608`` variant, whose two outputs are
        concatenated along axis 0 before comparison;
      * ``dim == 576``: the ``_576`` variant, whose two outputs are each
        compared with their own expected array;
      * otherwise: the ``_small`` variant with a single output.

    ``split_dim`` is a name resolved outside this function.

    Args:
        shape: shape of the single operator input.
        dtype: dtype of the single operator input.
        attrs: build attributes forwarded to the build helper.

    Returns:
        ``(inputs, actual_output, expected_output, result)``. For the 576
        variant the first output pair is returned and ``result`` covers both
        outputs.
    """
    dim = shape[0]
    if (dim // split_dim) > 32:
        mod = utils.op_build_test(SecondOrder_diag_split_matrix.diag_split_matrix_4608, [shape], [dtype],
                                  kernel_name='trace', attrs=attrs)
        exp_output, inputs, out1, out2 = gen_data1(dtype, shape)
        acu_output1, acu_output2 = utils.mod_launch(mod, (inputs, out1, out2), (-2, -1), expect=exp_output)
        print("=====",dim," compare====")
        print(acu_output1.shape)
        print(acu_output2.shape)
        print("=====",dim," compare====")
        acu_output = np.concatenate((acu_output1, acu_output2), axis=0)
        TestCase_Result = np.allclose(acu_output, exp_output, rtol=5e-03, equal_nan=True)
        return inputs, acu_output, exp_output, TestCase_Result

    if dim == 576:
        mod = utils.op_build_test(SecondOrder_diag_split_matrix.diag_split_matrix_576, [shape], [dtype],
                                  kernel_name='trace', attrs=attrs)
        exp_output1, exp_output2, inputs, out1, out2 = gen_data3(dtype, shape)
        acu_output1, acu_output2 = utils.mod_launch(mod, (inputs, out1, out2), (-2, -1), expect=exp_output1)
        print("=====",dim," compare====")
        print(acu_output1.shape)
        print(acu_output2.shape)
        print("=====",dim," compare====")
        # Fixed: the first comparison used to be overwritten by the second, so a
        # mismatch in the first output went unnoticed. Both must pass now.
        first_ok = np.allclose(acu_output1, exp_output1, rtol=5e-03, equal_nan=True)
        second_ok = np.allclose(acu_output2, exp_output2, rtol=5e-03, equal_nan=True)
        TestCase_Result = first_ok and second_ok
        return inputs, acu_output1, exp_output1, TestCase_Result

    mod = utils.op_build(SecondOrder_diag_split_matrix.diag_split_matrix_small, [shape], [dtype],
                         kernel_name='trace01', attrs=attrs)
    exp_output, inputs, out1 = gen_data2(dtype, shape)
    acu_output = utils.mod_launch(mod, (inputs, out1), expect=exp_output)
    print("=====",dim," compare====")
    print(acu_output.shape)
    print("=====",dim," compare====")
    TestCase_Result = np.allclose(acu_output, exp_output, rtol=5e-03, equal_nan=True)
    return inputs, acu_output, exp_output, TestCase_Result