def conv2d_nchwc_compute_avx2(N, C, H, W, K, k=3, use_bias=False, st=1, pad=0, dilation=1, group=1):
    """Declare float32 placeholders for a channel-blocked conv2d and build its compute.

    The channel dimensions are split into blocks of ``vlen = 8`` elements.

    Args:
        N, C, H, W: Leading extent, total input channels, height and width
            used to shape the input placeholder
            ``[N, C // vlen // group, H, W, vlen]``.
        K: Total output channels; the weight placeholder has shape
            ``[K // vlen, C // vlen // group, k, k, vlen, vlen]``.
        k: Square kernel extent.
        use_bias: When true, a ``[K // vlen, vlen]`` bias placeholder is
            created and passed on; otherwise ``None`` is passed as the bias.
        st, pad, dilation, group: Forwarded to ``conv2d_nchwc`` as
            ``stride``, ``padding``, ``dilation`` and ``groups``.

    Returns:
        Whatever ``conv2d_nchwc`` returns for these placeholders.
    """
    vlen = 8  # AVX2 provides 256 bit operations
    in_blocks = C // vlen // group
    out_blocks = K // vlen

    # Placeholders are created in the order inputs -> weight -> bias.
    inputs = tvm.placeholder([N, in_blocks, H, W, vlen], dtype="float32")
    weight = tvm.placeholder([out_blocks, in_blocks, k, k, vlen, vlen], dtype="float32")
    bias = tvm.placeholder([out_blocks, vlen], dtype="float32") if use_bias else None

    return conv2d_nchwc(
        inputs,
        weight,
        bias,
        stride=st,
        padding=pad,
        dilation=dilation,
        groups=group,
    )
def conv2d_nchwc_compute_avx2(N, C, H, W, K, k=3, use_bias=False, st=1, pad=0, dilation=1, group=1,
                              vlen1=8, vlen2=8):
    """Declare float32 placeholders for a channel-blocked conv2d and build its compute.

    Input channels are split into blocks of ``vlen1`` elements and output
    channels into blocks of ``vlen2`` elements.

    Args:
        N, C, H, W: Leading extent, total input channels, height and width
            used to shape the input placeholder
            ``[N, C // vlen1 // group, H, W, vlen1]``.
        K: Total output channels; the weight placeholder has shape
            ``[K // vlen2, C // vlen1 // group, k, k, vlen1, vlen2]``.
        k: Square kernel extent.
        use_bias: When true, a ``[K // vlen2, vlen2]`` bias placeholder is
            created, passed to ``conv2d_nchwc`` and included in the returned
            buffer list; otherwise ``None`` is passed as the bias.
        st, pad, dilation, group: Forwarded to ``conv2d_nchwc`` as
            ``stride``, ``padding``, ``dilation`` and ``groups``.
        vlen1: Block size of the input-channel dimension (default 8).
        vlen2: Block size of the output-channel dimension (default 8).

    Returns:
        A pair ``(ops, bufs)``. ``ops`` is always ``[output.op]``. ``bufs``
        is ``[inputs, weight, bias, output]`` when ``use_bias`` is true and
        ``[inputs, weight, output]`` otherwise.
    """
    in_blocks = C // vlen1 // group
    out_blocks = K // vlen2

    inputs = tvm.te.placeholder([N, in_blocks, H, W, vlen1], dtype="float32")
    weight = tvm.te.placeholder(
        [out_blocks, in_blocks, k, k, vlen1, vlen2], dtype="float32")
    bias = tvm.te.placeholder([out_blocks, vlen2], dtype="float32") if use_bias else None

    output = conv2d_nchwc(inputs, weight, bias, stride=st, padding=pad,
                          dilation=dilation, groups=group)

    # Both branches return the operation list in the first slot. The bias
    # branch previously returned the bare tensor, so the type of the first
    # element depended on `use_bias`.
    if use_bias:
        bufs = [inputs, weight, bias, output]
    else:
        bufs = [inputs, weight, output]
    return [output.op], bufs
"conv2d_nchwc", "yolo_conv6", args, target, dev_id=dev_id) for i in [1, 2, 4, 8, 16, 32]: ic_factors[0] = i ic_factors[1] = 256 // i # get compute inputs = tvm.placeholder([N, C // vlen // group, H, W, vlen], dtype="float32") weight = tvm.placeholder( [K // vlen, C // vlen // group, k, k, vlen, vlen], dtype="float32") if use_bias: bias = tvm.placeholder([K // vlen, vlen], dtype="float32") else: bias = None output = conv2d_nchwc(inputs, weight, bias, stride=st, padding=pad, dilation=dilation, groups=group) s = conv2d_nchwc_schedule_avx2_yolo_conv6(output) time_cost = _evaluate(s, [inputs, weight, output], target, dev_id, 10) print("Run time: %f ms, throughput: %f GFLOPS" % (time_cost, N * C * H * W * K * k * k / st / st / group / 1e6 / time_cost))