def conv2d_nchwc_compute_avx2(N,
                              C,
                              H,
                              W,
                              K,
                              k=3,
                              use_bias=False,
                              st=1,
                              pad=0,
                              dilation=1,
                              group=1):
    """Declare blocked float32 placeholders and apply ``conv2d_nchwc`` to them.

    The channel extents ``C`` and ``K`` are split into blocks of 8 elements,
    so the placeholders have these shapes:

    * input:  ``[N, C // 8 // group, H, W, 8]``
    * weight: ``[K // 8, C // 8 // group, k, k, 8, 8]``
    * bias:   ``[K // 8, 8]`` (created only when ``use_bias`` is true)

    Floor division is used throughout, so any remainder of ``C`` or ``K``
    is silently dropped from the declared shapes.

    Args:
        N, C, H, W: extents used for the input placeholder (presumably
            batch, channels, height, width -- confirm against callers).
        K: extent used for the weight/bias placeholders (presumably the
            number of output channels).
        k: extent of both kernel axes of the weight placeholder.
        use_bias: whether a bias placeholder is created and passed on.
        st: forwarded to ``conv2d_nchwc`` as ``stride``.
        pad: forwarded to ``conv2d_nchwc`` as ``padding``.
        dilation: forwarded to ``conv2d_nchwc`` as ``dilation``.
        group: forwarded as ``groups``; also divides the channel-block
            count of the input and weight placeholders.

    Returns:
        Whatever ``conv2d_nchwc`` returns for these operands. The
        placeholders themselves are not returned.
    """
    lanes = 8  # a 256-bit AVX2 register holds 8 float32 values
    in_blocks = C // lanes // group
    out_blocks = K // lanes

    data = tvm.placeholder([N, in_blocks, H, W, lanes], dtype="float32")
    kernel = tvm.placeholder(
        [out_blocks, in_blocks, k, k, lanes, lanes], dtype="float32")
    bias_term = (tvm.placeholder([out_blocks, lanes], dtype="float32")
                 if use_bias else None)

    return conv2d_nchwc(data,
                        kernel,
                        bias_term,
                        stride=st,
                        padding=pad,
                        dilation=dilation,
                        groups=group)
def conv2d_nchwc_compute_avx2(N,
                              C,
                              H,
                              W,
                              K,
                              k=3,
                              use_bias=False,
                              st=1,
                              pad=0,
                              dilation=1,
                              group=1,
                              vlen1=8,
                              vlen2=8):
    """Declare blocked float32 placeholders and apply ``conv2d_nchwc`` to them.

    ``C`` is split into blocks of ``vlen1`` and ``K`` into blocks of
    ``vlen2``, so the placeholders have these shapes:

    * input:  ``[N, C // vlen1 // group, H, W, vlen1]``
    * weight: ``[K // vlen2, C // vlen1 // group, k, k, vlen1, vlen2]``
    * bias:   ``[K // vlen2, vlen2]`` (created only when ``use_bias`` is true)

    Floor division is used throughout, so any remainder of ``C`` or ``K``
    is silently dropped from the declared shapes.

    Args:
        N, C, H, W: extents used for the input placeholder (presumably
            batch, channels, height, width -- confirm against callers).
        K: extent used for the weight/bias placeholders (presumably the
            number of output channels).
        k: extent of both kernel axes of the weight placeholder.
        use_bias: whether a bias placeholder is created and passed on.
        st: forwarded to ``conv2d_nchwc`` as ``stride``.
        pad: forwarded to ``conv2d_nchwc`` as ``padding``.
        dilation: forwarded to ``conv2d_nchwc`` as ``dilation``.
        group: forwarded as ``groups``; also divides the channel-block
            count of the input and weight placeholders.
        vlen1: block size applied to ``C``.
        vlen2: block size applied to ``K``.

    Returns:
        A pair ``(ops, bufs)``: ``ops`` is ``[output.op]`` and ``bufs`` is
        ``[inputs, weight, output]``, with ``bias`` inserted before
        ``output`` when ``use_bias`` is true.
    """
    inputs = tvm.te.placeholder([N, C // vlen1 // group, H, W, vlen1],
                                dtype="float32")
    weight = tvm.te.placeholder(
        [K // vlen2, C // vlen1 // group, k, k, vlen1, vlen2], dtype="float32")
    if use_bias:
        bias = tvm.te.placeholder([K // vlen2, vlen2], dtype="float32")
    else:
        bias = None
    output = conv2d_nchwc(inputs,
                          weight,
                          bias,
                          stride=st,
                          padding=pad,
                          dilation=dilation,
                          groups=group)
    # The first element is a list of operations on both paths. The bias
    # path used to return the bare output tensor here, so callers received
    # a different type depending on ``use_bias``.
    if use_bias:
        return [output.op], [inputs, weight, bias, output]
    return [output.op], [inputs, weight, output]
             "conv2d_nchwc",
             "yolo_conv6",
             args,
             target,
             dev_id=dev_id)

    for i in [1, 2, 4, 8, 16, 32]:
        ic_factors[0] = i
        ic_factors[1] = 256 // i
        # get compute
        inputs = tvm.placeholder([N, C // vlen // group, H, W, vlen],
                                 dtype="float32")
        weight = tvm.placeholder(
            [K // vlen, C // vlen // group, k, k, vlen, vlen], dtype="float32")
        if use_bias:
            bias = tvm.placeholder([K // vlen, vlen], dtype="float32")
        else:
            bias = None
        output = conv2d_nchwc(inputs,
                              weight,
                              bias,
                              stride=st,
                              padding=pad,
                              dilation=dilation,
                              groups=group)
        s = conv2d_nchwc_schedule_avx2_yolo_conv6(output)
        time_cost = _evaluate(s, [inputs, weight, output], target, dev_id, 10)
        print("Run time: %f ms, throughput: %f GFLOPS" %
              (time_cost,
               N * C * H * W * K * k * k / st / st / group / 1e6 / time_cost))