def test():
    # create an empty task, but with the correct key we want
    task = Task("yolo1", None, (1, 3, 448, 448, 64, 7, 2, 3, 1, 1), "llvm", 0)
    beg = time.time()
    # s, bufs, configs = schedule(task.key)
    end = time.time()
    # print(tvm.lower(s, bufs, simple_mode=True))
    # print("######################################")
    # print("op schedules:")
    # for config in configs.op_config_lst:
    #     print("----------------------------------")
    #     for name, value in config.items():
    #         if value:
    #             print(name, value)
    # print("graph schedules:")
    # for name, value in configs.graph_config.items():
    #     if value:
    #         print(name, value)
    op_configs = [
        {
            "spatial": [[1, 1, 1, 1], [1, 1, 1, 3], [454, 1, 1, 1], [1, 227, 2, 1]],
            "unroll": [[1500, 1]]
        },
        {
            "spatial": [[1, 1, 1, 1], [2, 4, 2, 4], [8, 1, 4, 7], [7, 1, 16, 2]],
            "reduce": [[1, 3, 1], [7, 1, 1], [7, 1, 1]],
            "unroll": [[1500, 1]]
        }
    ]
    graph_config = {"inline": [[0, 0]]}
    configs = Config(op_configs, graph_config)
    s, bufs = schedule_with_config(task.key, configs)
    time_cost = _evaluate(s, bufs, "llvm", 0, 10)
    print("Use", time_cost, "ms")
    print("Cost", end - beg, "s")
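# A hedged sketch, not part of the original script: saving the hand-written
# configs above in the same "task_key:json" line format that the optimize()
# drivers later in this collection emit, assuming Config serializes through
# json.dumps the way those drivers rely on. The file name is only an example.
import json

def dump_config_line(task_key, configs, path="hand_written_configs.log"):
    # one replayable line per task: "<task_key>:<json-serialized configs>"
    with open(path, "a") as logfile:
        print(task_key + ":" + json.dumps(configs), file=logfile, flush=True)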
def try_yolo_conv_cuda(batch_size=1):
    # get the compute
    yolo_conv = YoloConvLayer10()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    s = tvm.te.create_schedule(outputs.op)
    schedule_yolo_conv_cuda(s, outputs, inputs, weight)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)
    dev_id = 3
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.tir.ir_pass.VerifyGPUCode(stmt, kwargs)
    print(verify)

    time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
    print("Yolo conv10 use", time_cost, "ms")
def test(task_key, configs, dev_id=None):
    task = TASK_TABLE[task_key]
    s, bufs = schedule_with_config(task_key, configs)
    dev_id = dev_id if dev_id is not None else task.dev_id
    time_cost = _evaluate(s, bufs, task.target, dev_id, 10)
    print(task_key, "use", time_cost, "ms")
    print()
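# A hedged usage sketch for test(): replay one "task_key:json" line as written by
# the optimize() drivers in this collection. Config(op_config_lst, graph_config)
# mirrors the constructor used in the first script above; the assumption that the
# JSON decodes back into a two-element (op configs, graph config) pair is
# illustrative, not guaranteed by this file.
import json

def test_from_logline(line, dev_id=None):
    task_key, string = line.split(":", 1)
    op_config_lst, graph_config = json.loads(string)
    test(task_key, Config(op_config_lst, graph_config), dev_id=dev_id)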
def try_yolo_conv(config, parameter, fsch):
    # get the compute
    # (1, 3, 448, 448, 64, 3, 7, 7, 1, 2, 3, 1, 1)
    batch, CI, H, W, CO, _, kh, kw, _, st, pad, dilation, group = config
    inputs = tvm.placeholder((batch, CI, H, W), dtype="float32")
    weight = tvm.placeholder((CO, CI, kh, kw), dtype="float32")
    outputs = conv2d_nchw(inputs, weight, stride=st, padding=pad, dilation=dilation, groups=group)

    s = tvm.create_schedule(outputs.op)
    fsch(s, outputs, inputs, weight, parameter)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 2
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
    # print("config is:\n %s" % (str(config)))
    if verify:
        print("Valid kernel")
        time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
        print("Yolo conv use", time_cost, "ms\n")
    else:
        print("Invalid kernel")
        time_cost = float("inf")
    return time_cost
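# A hedged driver sketch for the function above, using the shape tuple from its
# comment. my_schedule_func is a placeholder, not a function from this repo; it
# only has to accept (s, outputs, inputs, weight, parameter) as fsch does above,
# and the parameter value here is likewise illustrative.
yolo_shape = (1, 3, 448, 448, 64, 3, 7, 7, 1, 2, 3, 1, 1)
cost = try_yolo_conv(yolo_shape, parameter=None, fsch=my_schedule_func)
print("observed cost:", cost, "ms")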
def optimize(prefix, from_, shapes, target="llvm", dev_id=0, trials=100, timeout=4.0,
             parallel=1, method="searching", use_model=False, logfile=sys.stdout):
    ret = dict()
    for i, shape in enumerate(shapes):
        print("Optimize {} conv_transpose2d layer {} shape {}".format(prefix, i + 1 + from_, shape), flush=True)
        batch, in_channel, height, width, out_channel, _, k_h, k_w, _, stride, padding, dilation, groups = shape
        rout_channel = in_channel
        rin_channel = out_channel
        rheight = (height + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
        rwidth = (width + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
        # create an empty task, but with the correct key we want
        task = Task(
            "conv_transpose2d",
            prefix + str(i + from_),
            None,
            (batch, rin_channel, rheight, rwidth, rout_channel, k_h, stride, padding, dilation, groups),
            target,
            dev_id)
        beg = time.time()
        s, bufs, configs = schedule(
            task.key,
            op_trial=trials,
            timeout=timeout,
            op_stop=30,
            parallel=parallel,
            method=method,
            use_model=use_model,
            trials=[trials // 10, trials // 10, trials])
        end = time.time()
        # print(tvm.lower(s, bufs, simple_mode=True))
        print("######################################")
        print("op schedules:")
        for config in configs.op_config_lst:
            print("----------------------------------")
            for name, value in config.items():
                if value:
                    print(name, value)
        print("graph schedules:")
        for name, value in configs.graph_config.items():
            if value:
                print(name, value)
        ret[task.key] = configs
        string = json.dumps(configs)
        line = task.key + ":" + string
        print(line, file=logfile, flush=True)
        s, bufs = schedule_with_config(task.key, configs)
        time_cost = _evaluate(s, bufs, target, task.dev_id, 10)
        print("Use", time_cost, "ms")
        print("Cost", end - beg, "s")
        print()
    return ret
def optimize(shapes, slevel=4, rlevel=3, target="llvm", dev_id=0, timeout=4.0, trials=100,
             parallel=1, method="searching", use_model=False, logfile=sys.stdout):
    ret = dict()
    for i, shape in enumerate(shapes):
        print("Optimize dilation conv2d shape {}".format(shape), flush=True)
        batch, in_channel, H, W, out_channel, k, _, stride, padding, dilation, groups = shape
        # create an empty task, but with the correct key we want
        task = Task(
            "conv2d",
            "dilation",
            None,
            (batch, in_channel, H, W, out_channel, k, stride, padding, dilation, groups),
            target,
            dev_id)
        beg = time.time()
        s, bufs, configs = schedule(
            task.key,
            slevel=slevel,
            rlevel=rlevel,
            op_trial=trials,
            timeout=timeout,
            op_stop=30,
            method=method,
            use_model=use_model,
            parallel=parallel)
        end = time.time()
        # print(tvm.lower(s, bufs, simple_mode=True))
        print("######################################")
        print("op schedules:")
        for config in configs.op_config_lst:
            print("----------------------------------")
            for name, value in config.items():
                if value:
                    print(name, value)
        print("graph schedules:")
        for name, value in configs.graph_config.items():
            if value:
                print(name, value)
        ret[task.key] = configs
        string = json.dumps(configs)
        line = task.key + ":" + string
        print(line, file=logfile, flush=True)
        s, bufs = schedule_with_config(task.key, configs)
        time_cost = _evaluate(s, bufs, target, task.dev_id, 10)
        print("Use", time_cost, "ms")
        print("Cost", end - beg, "s")
        print()
    return ret
def optimize(shapes, slevel=4, rlevel=3, target="llvm", dev_id=0, timeout=4.0, trials=100,
             parallel=1, method="searching", use_model=False, rpc_info=None, logfile=sys.stdout):
    ret = dict()
    for i, shape in enumerate(shapes):
        print("Optimize gemm shape %s [%.6f]" % (str(shape), time.time()), flush=True)
        N, K, M = shape
        # create an empty task, but with the correct key we want
        task = Task(
            "gemm",
            "gemm",
            None,
            (N, K, M),
            target,
            dev_id
        )
        beg = time.time()
        s, bufs, configs = schedule(
            task.key,
            slevel=slevel,
            rlevel=rlevel,
            op_trial=trials,
            timeout=timeout,
            op_stop=30,
            method=method,
            use_model=use_model,
            parallel=parallel,
            rpc_info=rpc_info
        )
        end = time.time()
        # print(tvm.lower(s, bufs, simple_mode=True))
        print("###################################### [%.6f]" % time.time())
        print("op schedules:")
        for config in configs.op_config_lst:
            print("----------------------------------")
            for name, value in config.items():
                if value:
                    print(name, value)
        print("graph schedules:")
        for name, value in configs.graph_config.items():
            if value:
                print(name, value)
        ret[task.key] = configs
        string = json.dumps(configs)
        line = task.key + ":" + string
        print(line, file=logfile, flush=True)
        s, bufs = schedule_with_config(task.key, configs)
        time_cost = _evaluate(s, bufs, target, task.dev_id, 10)
        print("Use", time_cost, "ms")
        print("Cost", end - beg, "s")
        print()
    return ret
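# A hedged example of driving the gemm optimizer above. The (N, K, M) sizes,
# trial budget, and log file name are chosen for illustration only.
if __name__ == "__main__":
    gemm_shapes = [
        (1024, 1024, 1024),
        (512, 256, 512),
    ]
    with open("gemm_opt_config.log", "a") as f:
        results = optimize(gemm_shapes, target="cuda", dev_id=0,
                           trials=200, parallel=4, logfile=f)
    print("optimized", len(results), "gemm tasks")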
def try_yolo_conv(batch_size, config):
    global __COUNTER__
    __COUNTER__ += 1
    # get the compute
    yolo_conv = YoloConvLayer17()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    s = tvm.create_schedule(outputs.op)
    schedule_yolo_conv_llvm(s, outputs, inputs, weight, config)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 0
    time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, 10)
    print("Yolo conv17 use", time_cost, "ms\n")
    return time_cost
def try_yolo_conv(batch_size=1):
    # get the compute
    yolo_conv = YoloConvLayer6()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)
    bias = yolo_conv.get_bias()

    s = tvm.create_schedule(outputs.op)
    schedule_yolo_conv_x86(s, outputs, inputs, weight)

    if bias is None:
        arg_bufs = [inputs, weight, outputs]
    else:
        arg_bufs = [inputs, weight, bias, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)
    dev_id = 1
    time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, 100)
    print("Yolo conv6 use", time_cost, "ms")
def try_yolo_conv(batch_size, config):
    global __COUNTER__
    __COUNTER__ += 1
    # get the compute
    yolo_conv = YoloConvLayer17()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.placeholder((batch_size, *input_shape), dtype="float32")
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)

    s = tvm.create_schedule(outputs.op)
    schedule_yolo_conv_cuda(s, outputs, inputs, weight, config)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 0
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
    print("%d. config is:\n %s" % (__COUNTER__, str(config)))
    if verify:
        print("Valid kernel")
        time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
        print("Yolo conv17 use", time_cost, "ms\n")
    else:
        print("Invalid kernel")
        time_cost = float("inf")
    return time_cost
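# A hedged sketch of how try_yolo_conv can serve as the cost function of a plain
# search: invalid kernels come back as float("inf"), so keeping the running
# minimum picks the fastest valid configuration. candidate_configs is
# hypothetical; its entries must be whatever schedule_yolo_conv_cuda expects.
def pick_best_config(candidate_configs, batch_size=1):
    best_config, best_cost = None, float("inf")
    for cfg in candidate_configs:
        cost = try_yolo_conv(batch_size, cfg)
        if cost < best_cost:
            best_config, best_cost = cfg, cost
    return best_config, best_cost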
"conv2d_nchwc", "yolo_conv6", args, target, dev_id=dev_id) for i in [1, 2, 4, 8, 16, 32]: ic_factors[0] = i ic_factors[1] = 256 // i # get compute inputs = tvm.placeholder([N, C // vlen // group, H, W, vlen], dtype="float32") weight = tvm.placeholder( [K // vlen, C // vlen // group, k, k, vlen, vlen], dtype="float32") if use_bias: bias = tvm.placeholder([K // vlen, vlen], dtype="float32") else: bias = None output = conv2d_nchwc(inputs, weight, bias, stride=st, padding=pad, dilation=dilation, groups=group) s = conv2d_nchwc_schedule_avx2_yolo_conv6(output) time_cost = _evaluate(s, [inputs, weight, output], target, dev_id, 10) print("Run time: %f ms, throughput: %f GFLOPS" % (time_cost, N * C * H * W * K * k * k / st / st / group / 1e6 / time_cost))
def try_yolo_conv(batch_size=2, number=100):
    # get the compute
    yolo_conv = SqueezeNetFire8Gemm()
    input_shape = yolo_conv.get_intput_shape()
    inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32", name='inputs')
    weight = yolo_conv.get_weight()
    outputs = yolo_conv(inputs)
    bias = yolo_conv.get_bias()

    s = tvm.te.create_schedule(outputs.op)
    schedule_yolo_conv_x86(s, outputs, inputs, weight, bias)

    arg_bufs = [inputs, weight, bias, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    print(stmt)
    dev_id = 0
    time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, number=number)
    print("Yolo conv24 use", time_cost, "ms")

    """ For pytorch """
    out_channel, in_channel, kernel_height, kernel_width = yolo_conv.weight_shape
    padding, stride, dilation, groups = (yolo_conv.padding, yolo_conv.stride,
                                         yolo_conv.dilation, yolo_conv.groups)
    conv2d_torch = torch.nn.Conv2d(in_channel, out_channel, (kernel_height, kernel_width),
                                   padding=padding, stride=stride, dilation=dilation, groups=groups)
    # warm up
    inputs = torch.rand(batch_size, *input_shape)
    res = conv2d_torch(inputs)

    times = time.time()
    for _ in range(number):
        res = conv2d_torch(inputs)
    times = time.time() - times
    print("Pytorch on cpu use: {}ms".format(times / number * 1e3))

    # to test the correctness; currently the result is wrong because of the schedule
    # if you change line 148 to 'outer = s[write_cache].fuse(gemm_g, gemm_go)'
    # the result is correct
    ctx = tvm.device("llvm", 0)
    inputs_np = np.random.random(inputs.shape).astype("float32") * 100
    weight_np = np.random.random(to_tuple(weight.shape)).astype(weight.dtype) * 100
    outputs_np = np.zeros(shape=to_tuple(outputs.shape), dtype=np.float32)
    bias_np = np.random.random(size=to_tuple(bias.shape)).astype(bias.dtype) * 100

    inputs_tvm = tvm.nd.array(inputs_np, ctx)
    weight_tvm = tvm.nd.array(weight_np, ctx)
    outputs_tvm = tvm.nd.array(outputs_np, ctx)
    bias_tvm = tvm.nd.array(bias_np, ctx)

    inputs_torch = torch.tensor(inputs_np)
    weight_torch = torch.tensor(weight_np)
    bias_torch = torch.tensor(bias_np)

    func_tvm = tvm.build(s, arg_bufs, "llvm")
    func_tvm(inputs_tvm, weight_tvm, bias_tvm, outputs_tvm)
    outputs_torch = torch.nn.functional.conv2d(inputs_torch, weight_torch, bias=bias_torch,
                                               padding=padding, stride=stride,
                                               dilation=dilation, groups=groups)
    the_same = test_allclose(outputs_tvm.asnumpy(), outputs_torch.numpy(), rtol=1e-5, print_diff=True)
    if the_same:
        print("The same!")
    else:
        print("Not the same!")
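# The test_allclose helper used above is defined elsewhere in the repo; a minimal
# stand-in (an assumption, not the repo's implementation) could be built on
# numpy.allclose and report the largest absolute difference when asked to.
import numpy as np

def test_allclose_sketch(result, expected, rtol=1e-5, print_diff=False):
    # True when every element matches within the relative tolerance
    close = np.allclose(result, expected, rtol=rtol)
    if print_diff and not close:
        print("max abs diff:", np.max(np.abs(result - expected)))
    return close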
        use_model=use_model,
        trials=[trials // 10, trials],
        force_inline=force_inline,
        rpc_info=rpc_info,
        slevel=2,
        rlevel=2)
    end = time.time()
    print("######################################")
    print("op schedules:")
    for config in configs.op_config_lst:
        print("----------------------------------")
        for name, value in config.items():
            if value:
                print(name, value)
    print("graph schedules:")
    for name, value in configs.graph_config.items():
        if value:
            print(name, value)
    string = json.dumps(configs)
    line = task.key + ":" + string
    print(line, file=logfile, flush=True)
    s, bufs = schedule_with_config(task.key, configs)
    time_cost = _evaluate(s, bufs, target, dev_id, 10)
    print("Use", time_cost, "ms",
          "throughput: %f GFLOPS" % (N * C * H * W * K * k * k / st / st / group / 1e6 / time_cost))
    print("Cost", end - beg, "s")
    logfile.close()
roots = set(groups)
for id in roots:
    tmp = []
    for i in range(g.num_nodes):
        if groups[i] == id:
            tmp.append(i)
    print("Group %d: %s" % (id, str(tmp)))

# predict
logits = {}
logits["spatial"] = [output[9], output[10]]
logits["reduce"] = [output[11]]
logits["unroll"] = unroll_output
config = gemm_config(M, N, K, logits)

# compute
A = tvm.placeholder((M, K))
B = tvm.placeholder((K, N))
C = gemm(A, B)

# schedule
s, bufs = schedule_ops([C.op], [A, B, C], config, target="cuda")

# build
func = tvm.build(s, bufs, "cuda")
print(func.imported_modules[0].get_source())

# run
cost = _evaluate(s, bufs, "cuda", 0, 10)
print("time cost is:", cost)
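# A hedged follow-up sketch: checking the built CUDA gemm against numpy. It
# assumes the computed C is the plain matrix product of A (M, K) and B (K, N),
# matching the placeholders above; M, N, K and func come from the surrounding
# script, and tvm.nd.context is the same helper the other scripts here use.
import numpy as np

a_np = np.random.uniform(size=(M, K)).astype("float32")
b_np = np.random.uniform(size=(K, N)).astype("float32")
c_np = np.zeros((M, N), dtype="float32")

ctx = tvm.nd.context("cuda", 0)
a_tvm = tvm.nd.array(a_np, ctx)
b_tvm = tvm.nd.array(b_np, ctx)
c_tvm = tvm.nd.array(c_np, ctx)
func(a_tvm, b_tvm, c_tvm)
print("matches numpy:", np.allclose(c_tvm.asnumpy(), a_np.dot(b_np), rtol=1e-4))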