Example #1
0
def helper(symbol, inputs, dtype,
           np_forward, np_backward=None):
    """Compare an NNVM symbol's forward (and optionally gradient) output with NumPy.

    Parameters
    ----------
    symbol
        Symbol handed to ``nnvm.compiler.build``.
    inputs : dict
        Maps an input name to a sequence whose first element is used as the
        input shape. When a second element is present it is collected and
        later written to the ``grad_xs`` graph attribute.
    dtype
        Dtype of the random inputs and of the output buffers.
    np_forward : callable
        Called with the random inputs as keyword arguments; its result is the
        reference for output 0 of the compiled graph.
    np_backward : callable, optional
        When truthy, the gradient graph is built as well and its output 0 is
        compared with ``head_grads * np_backward(**inputs)``.
    """
    shapes = {}
    grad_syms = []
    feed = {}
    for name, spec in inputs.items():
        shapes[name] = spec[0]
        feed[name] = np.random.uniform(size=spec[0]).astype(dtype)
        if len(spec) > 1:
            grad_syms.append(spec[1])

    for target, ctx in ctx_list():
        # forward
        graph, lib, _ = nnvm.compiler.build(symbol, target, shapes)
        runner = graph_runtime.create(graph, lib, ctx)
        runner.run(**feed)
        expected = np_forward(**feed)
        result = runner.get_output(0, tvm.nd.empty(expected.shape, dtype))
        np.testing.assert_allclose(result.asnumpy(), expected, atol=1e-5, rtol=1e-5)

        if not np_backward:
            continue

        # backward
        graph._set_symbol_list_attr("grad_ys", symbol)
        for grad_sym in grad_syms:
            graph._set_symbol_list_attr("grad_xs", grad_sym)
        graph._set_symbol_list_attr("grad_ys_out_grad", sym.Variable("head_grads"))
        graph = graph.apply("Gradient")
        # The head-gradient shape stays in the shared shape dict for later targets.
        shapes["head_grads"] = expected.shape
        graph, lib, _ = nnvm.compiler.build(graph, target, shapes)
        runner = graph_runtime.create(graph, lib, ctx)
        head_grads = np.random.uniform(size=expected.shape).astype(dtype)
        expected = head_grads * np_backward(**feed)
        runner.run(head_grads=head_grads, **feed)
        result = runner.get_output(0, tvm.nd.empty(expected.shape, dtype))

        np.testing.assert_allclose(result.asnumpy(), expected, atol=1e-5, rtol=1e-5)
Example #2
0
def test_concatenate_conv2d():
    """Check ``concat + conv2d(concat)`` against a NumPy/topi reference.

    The input is concatenated with itself along axis 1, passed through a
    bias-free 1x1 convolution, and added back to the concatenation. The test
    also asserts that the compiled graph has exactly four nodes.
    """
    ch = 3
    size = 8
    dtype = "float32"
    dshape = (1, ch, size, size)
    kshape = (ch * 2, ch * 2, 1, 1)
    oshape = (1, ch * 2, size, size)
    shape_dict = {"data": dshape}

    data_sym = sym.Variable(name="data")
    concat_sym = sym.concatenate(data_sym, data_sym, axis=1)
    conv_sym = sym.conv2d(data=concat_sym, kernel_size=(1, 1), channels=ch * 2,
                          use_bias=False, name="conv")
    net = sym.elemwise_add(concat_sym, conv_sym)

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
        # data, conv weight, conv op, concat
        assert graph.index.num_nodes == 4

        data_nd = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        kernel_nd = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
        runner = graph_runtime.create(graph, lib, ctx)
        runner.run(data=data_nd, conv_weight=kernel_nd)
        result = runner.get_output(0, tvm.nd.empty(oshape, dtype))

        # NumPy reference of the same computation.
        concat_np = np.concatenate((data_nd.asnumpy(), data_nd.asnumpy()), axis=1)
        conv_np = topi.testing.conv2d_nchw_python(
            concat_np, kernel_nd.asnumpy(), (1, 1), 'SAME')
        expected = concat_np + conv_np
        tvm.testing.assert_allclose(result.asnumpy(), expected, rtol=1e-5)
Example #3
0
def test_multibox_transform_loc():
    """Run multibox_transform_loc followed by non_max_suppression on a hand-made case.

    The inputs and the expected output are fixed arrays; output 0 of the
    compiled graph must match the expected array within 1e-5.
    """
    batch_size = 1
    num_anchors = 3
    num_classes = 3

    cls_prob_sym = sym.Variable("cls_prob")
    loc_preds_sym = sym.Variable("loc_preds")
    anchors_sym = sym.Variable("anchors")
    transformed, valid = sym.multibox_transform_loc(
        cls_prob=cls_prob_sym, loc_pred=loc_preds_sym, anchor=anchors_sym)
    net = sym.non_max_suppression(data=transformed, valid_count=valid, return_indices=False)

    # Manually create test case
    np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]])
    np_loc_preds = np.array([[0.1, -0.2, 0.3, 0.2, 0.2, 0.4, 0.5, -0.3, 0.7, -0.2, -0.4, -0.8]])
    np_anchors = np.array([[[-0.1, -0.1, 0.1, 0.1], [-0.2, -0.2, 0.2, 0.2], [1.2, 1.2, 1.5, 1.5]]])

    expected_np_out = np.array([[[1, 0.69999999, 0, 0, 0.10818365, 0.10008108],
                                 [0, 0.44999999, 1, 1, 1, 1],
                                 [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]])

    dtype = "float32"
    for target, ctx in ctx_list():
        input_shapes = {
            "cls_prob": (batch_size, num_anchors, num_classes),
            "loc_preds": (batch_size, num_anchors * 4),
            "anchors": (1, num_anchors, 4),
        }
        graph, lib, _ = nnvm.compiler.build(net, target, input_shapes)
        runner = graph_runtime.create(graph, lib, ctx)
        feed = {
            "cls_prob": np_cls_prob.astype(dtype),
            "loc_preds": np_loc_preds.astype(dtype),
            "anchors": np_anchors.astype(dtype),
        }
        runner.set_input(**feed)
        runner.run()
        tvm_out = runner.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype))
        tvm.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5)
Example #4
0
File: onnx.py Project: bddppq/tvm
    def _impl_v1(cls, inputs, attr, params):
        """Convert a Reshape node into ``_op.reshape``.

        The target shape is taken, in order of preference, from:
        1. ``attr['shape']`` when that attribute is present;
        2. the entry of ``params`` for the second input, when
           ``get_name(inputs[1])`` is a key of ``params``;
        3. the result of compiling and running the second input's expression
           on the "llvm" target at ``opt_level=0``.

        In case 3 the second element is removed from ``inputs`` in place.
        """
        if 'shape' in attr:
            return _op.reshape(inputs[0], attr['shape'])

        if get_name(inputs[1]) in params:
            static_shape = tuple(params[inputs[1].name_hint].asnumpy())
            return _op.reshape(inputs[0], static_shape)

        # Try to infer shape by precompute prune if possible.
        # TODO: good to check inputs to be in params.
        #       to be enhanced when relay support list_input_names API of NNVM
        logging.warning("Infering Reshape argument by precompute")
        shape_func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1])
        with tvm.relay.build_config(opt_level=0):
            graph, lib, built_params = tvm.relay.build(shape_func, target="llvm", params=params)
        ctx = tvm.context("llvm", 0)
        from tvm.contrib import graph_runtime
        executor = graph_runtime.create(graph, lib, ctx)
        executor.set_input(**built_params)
        executor.run()
        computed = executor.get_output(0)
        # Drop the shape input from the caller's list before emitting the op.
        inputs.pop(1)
        new_shape = tuple(computed.asnumpy().astype('int32').flatten())
        return _op.reshape(inputs[0], new_shape)
Example #5
0
def test_avg_pool2d_no_count_pad():
    """Check ``avg_pool2d`` with ``count_include_pad=False`` against NumPy.

    The reference zero-pads the input and divides each window sum by the
    number of strictly positive entries in that window. Inputs are drawn with
    ``low=0.001``, so every real element is positive and only padding is
    left out of the count.
    """
    kh, kw = (4, 4)
    sh, sw = (2, 2)
    ph, pw = (2, 2)

    x = sym.Variable("x")
    # Strides follow the same (height, width) order as pool_size and padding;
    # the reference loop below steps rows by sh and columns by sw.
    y = sym.avg_pool2d(x, pool_size=(kh, kw), strides=(sh, sw), padding=(ph, pw),
                       name="y", count_include_pad=False)
    dtype = "float32"
    n = 1
    (ic, ih, iw) = (3, 28, 28)
    (oc, oh, ow) = (3, 15, 15)

    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
    # Copy the input into the interior of a zero-padded buffer.
    pad_np = np.zeros(shape=(n, ic, ih+2*ph, iw+2*pw)).astype(dtype)
    no_zero = (range(n), range(ic), (range(ph, ih+ph)), (range(pw, iw+pw)))
    pad_np[np.ix_(*no_zero)] = a_np
    b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype)

    for i in range(oh):
        for j in range(ow):
            window = pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw]
            # Count only non-padding entries; guard against an all-padding window.
            pad_count = np.sum(window > 0, axis=(2, 3))
            b_np[:, :, i, j] = np.sum(window, axis=(2, 3)) / np.maximum(pad_count, 1)
    b_np = np.maximum(b_np, 0.0)
    shape_dict = {"x": (n, ic, ih, iw)}
    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(y, target, shape_dict)
        m = graph_runtime.create(graph, lib, ctx)
        data = tvm.nd.array(a_np)
        m.run(x=data)
        out = m.get_output(0, tvm.nd.empty((n, oc, oh, ow), dtype))
        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
Example #6
0
def test_forward_minimum():
    """Compare MXNet's ``_minimum`` with the NNVM graph converted from it.

    Two random (10, 20) float32 arrays are fed to both MXNet and the compiled
    graph on every target from ``ctx_list()``.
    """
    dshape = (10, 20)
    dtype = 'float32'
    lhs = mx.sym.var('a')
    rhs = mx.sym.var('b')
    mx_sym = mx.sym._internal._minimum(lhs, rhs)

    np_a = np.random.uniform(size=dshape).astype(dtype)
    np_b = np.random.uniform(size=dshape).astype(dtype)
    mx_a = mx.nd.array(np_a)
    mx_b = mx.nd.array(np_b)

    # Bind a module only to obtain the (arg, aux) parameters for conversion.
    mod = mx.mod.Module(mx_sym, label_names=None, data_names=['a', 'b'])
    mod.bind(data_shapes=[('a', dshape), ('b', dshape)], for_training=False)
    mod.init_params()
    args, auxs = mod.get_params()

    mx_out = mx.nd._internal._minimum(mx_a, mx_b).asnumpy()
    out_shape = dshape
    new_sym, params = frontend.from_mxnet(mx_sym, args, auxs)
    shape_dict = {'a': dshape, 'b': dshape}
    for target, ctx in ctx_list():
        with nnvm.compiler.build_config(opt_level=3):
            # params is rebound here, so the next target is built from this result.
            graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params)
        runner = graph_runtime.create(graph, lib, ctx)
        runner.set_input("a", tvm.nd.array(np_a))
        runner.set_input("b", tvm.nd.array(np_b))
        runner.set_input(**params)
        runner.run()
        out_buf = tvm.nd.empty(out_shape, dtype)
        tvm_out = runner.get_output(0, out_buf).asnumpy()
        tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
Example #7
0
def test_conv_ewise_injective():
    """Check grouped conv2d followed by ``+ 1``, flatten and ``+ 1`` against NumPy.

    The convolution uses ``groups=32`` with 32 channels; the reference is
    computed with topi's depthwise conv2d helper. The test also asserts that
    the compiled graph has exactly five nodes.
    """
    dtype = "float32"
    dshape = (1, 32, 18, 18)
    kshape = (32, 1, 3, 3)
    oshape = (1, 32 * 18 * 18)
    shape_dict = {"x": dshape}

    x = sym.Variable("x")
    conv = sym.conv2d(x, channels=32, kernel_size=(3, 3), groups=32,
                      name="y", padding=(1, 1))
    net = sym.flatten(conv + 1) + 1

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
        runner = graph_runtime.create(graph, lib, ctx)
        # print(graph.ir(join_entry_attrs=["shape"]))
        assert graph.index.num_nodes == 5

        data_nd = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        kernel_nd = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
        bias_nd = tvm.nd.array(np.random.uniform(size=kshape[0]).astype(dtype))
        runner.run(x=data_nd, y_weight=kernel_nd, y_bias=bias_nd)
        result = runner.get_output(0, tvm.nd.empty(oshape, dtype))

        # Reference: depthwise conv + bias + 1, flattened per sample, + 1.
        ref = topi.testing.depthwise_conv2d_python_nchw(
            data_nd.asnumpy(), kernel_nd.asnumpy(), (1, 1), 'SAME')
        ref = ref + bias_nd.asnumpy().reshape(kshape[0], 1, 1) + 1
        ref = ref.reshape(ref.shape[0], np.prod(ref.shape[1:])) + 1
        np.testing.assert_allclose(result.asnumpy(), ref, rtol=1e-5)
Example #8
0
def test_non_max_suppression():
    """Run ``non_max_suppression`` on a fixed (1, 5, 6) input and check the result.

    Uses ``iou_threshold=0.7``, ``force_suppress=True``, ``top_k=2`` and a
    valid count of 4; the expected output is a hard-coded array.
    """
    dshape = (1, 5, 6)
    iou_threshold = 0.7
    force_suppress = True
    top_k = 2

    data = sym.Variable("data")
    valid_count = sym.Variable("valid_count", dtype="int32")
    net = sym.non_max_suppression(data=data, valid_count=valid_count, return_indices=False,
                                  iou_threshold=iou_threshold, force_suppress=force_suppress,
                                  top_k=top_k)

    np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80],
                         [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79],
                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
    np_valid_count = np.array([4]).astype("int32")
    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
                           [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1],
                           [-1, -1, -1, -1, -1, -1]]])

    for target, ctx in ctx_list():
        input_shapes = {"data": dshape, "valid_count": (dshape[0],)}
        input_dtypes = {"data": "float32", "valid_count": "int32"}
        graph, lib, _ = nnvm.compiler.build(net, target, input_shapes, dtype=input_dtypes)
        runner = graph_runtime.create(graph, lib, ctx)
        runner.set_input(data=np_data, valid_count=np_valid_count)
        runner.run()
        tvm_out = runner.get_output(0, tvm.nd.empty(np_result.shape, "float32"))
        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
Example #9
0
def test_injective_conv2d():
    """Check ``global_avg_pool(data) * data + conv2d(data)`` against NumPy.

    The pooled tensor is reshaped to (1, channels, 1, 1) and used as a
    per-channel weight. The test also asserts that the compiled graph has
    exactly five nodes.
    """
    channels = 16
    size = 56
    dtype = "float32"
    dshape = (1, channels, size, size)
    kshape = (channels, channels, 3, 3)
    oshape = dshape
    shape_dict = {"data": dshape}

    data_sym = sym.Variable(name="data")
    pooled = sym.global_avg_pool2d(data=data_sym)
    scale_sym = sym.reshape(pooled, shape=[1, channels, 1, 1])
    conv_sym = sym.conv2d(data=data_sym, kernel_size=(3, 3), channels=channels, padding=(1, 1),
                          layout="NCHW", kernel_layout="OIHW", use_bias=False, name="conv")
    net = scale_sym * data_sym + conv_sym

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
        # data, global_avg_pool, conv weight, conv op, fused elemwise add
        assert graph.index.num_nodes == 5

        data_nd = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        kernel_nd = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
        runner = graph_runtime.create(graph, lib, ctx)
        runner.run(data=data_nd, conv_weight=kernel_nd)
        result = runner.get_output(0, tvm.nd.empty(oshape, dtype))

        # NumPy reference of the same expression.
        conv_np = topi.testing.conv2d_nchw_python(
            data_nd.asnumpy(), kernel_nd.asnumpy(), (1, 1), 'SAME')
        scale_np = np.mean(data_nd.asnumpy(), axis=(2, 3))
        expected = scale_np[:, :, np.newaxis, np.newaxis] * data_nd.asnumpy() + conv_np
        tvm.testing.assert_allclose(result.asnumpy(), expected, rtol=1e-5)
 def check_verify():
     """Run the graph from the enclosing scope and check ``a + b - c + d``.

     Relies on ``graph``, ``mhost``, ``ctx``, ``params``, ``shape`` and the
     four ``tensor_*`` arrays from the enclosing scope.
     """
     executor = graph_runtime.create(graph, mhost, ctx)
     executor.set_input(**params)
     executor.run()
     result = executor.get_output(0, tvm.nd.empty(shape)).asnumpy()
     expected = tensor_a + tensor_b - tensor_c + tensor_d
     np.testing.assert_equal(result, expected)
Example #11
0
def tune_and_evaluate(tuning_opt):
    """Tune the network's conv2d tasks, rebuild with the best records, and time it.

    Parameters
    ----------
    tuning_opt : dict
        Forwarded as keyword arguments to ``tune_kernels``.

    Reads ``model_name``, ``batch_size``, ``target``, ``dtype`` and
    ``log_file`` from the enclosing module. Prints progress messages and the
    mean and standard deviation of the measured run time in milliseconds.
    """
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, data_shape, out_shape = get_network(model_name, batch_size)
    input_shapes = {'data': data_shape}
    tasks = autotvm.task.extract_from_graph(net, target=target,
                                            shape=input_shapes, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={'data': data_shape}, params=params, dtype=dtype)

        # upload parameters to device
        ctx = tvm.cpu()
        random_input = np.random.uniform(size=data_shape).astype(dtype)
        data_tvm = tvm.nd.array(random_input)
        module = runtime.create(graph, lib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        timings_ms = np.array(ftimer().results) * 1000  # convert to millisecond
        mean_ms = np.mean(timings_ms)
        std_ms = np.std(timings_ms)
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (mean_ms, std_ms))
Example #12
0
def run(args):
    """Compile an ONNX test model with NNVM, run it on GPU and verify its outputs.

    Parameters
    ----------
    args
        Parsed command-line namespace. The attributes read here are
        ``test_dir``, ``target``, ``opt_level``, ``autotvm_log``,
        ``dump_nnvm``, ``debug`` and ``iterations``.

    Raises
    ------
    AssertionError
        If any output differs from the stored expected output beyond
        ``rtol=1e-3`` / ``atol=1e-4``; the message carries the output name.
    """
    onnx_model = onnx.load_model(os.path.join(args.test_dir, 'model.onnx'))
    symbol, params = nnvm.frontend.from_onnx(onnx_model)
    input_names = symbol.list_input_names()
    output_names = symbol.list_output_names()

    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = load_test_data(test_data_dir, input_names, output_names)
    inputs = dict(inputs)

    # assert len(input_names) == len(inputs) + len(params)
    # assert len(output_names) == len(outputs)

    # NOTE: `compile` is the project helper defined elsewhere in this file,
    # not the Python builtin.
    graph, lib, params = compile(
        symbol, args.target, input_names, inputs, params,
        args.opt_level, args.autotvm_log)

    if args.dump_nnvm:
        print(graph.ir())
        print(graph.json())

    ctx = tvm.gpu()

    # Prepare inputs.
    tvm_inputs = {}
    for name, value in inputs.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)
    for name, value in params.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)

    graph_module = None
    if args.debug:
        try:
            graph_module = debug_runtime.create(graph, lib, ctx)
        except Exception:
            # Best effort: fall back to the plain runtime below. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print('debug_runtime is disabled. '
                  'Set USE_GRAPH_RUNTIME_DEBUG=ON and rebuild TVM')
    if graph_module is None:
        graph_module = graph_runtime.create(graph, lib, ctx)

    graph_module.set_input(**tvm_inputs)

    graph_module.run()

    for i, (name, expected) in enumerate(outputs):
        tvm_output = tvm.nd.empty(expected.shape, expected.dtype, ctx=ctx)
        actual = graph_module.get_output(i, tvm_output).asnumpy()
        # Pass the output name as err_msg so a failure says which output
        # mismatched. Argument order is kept as in the original call.
        np.testing.assert_allclose(expected, actual,
                                   rtol=1e-3, atol=1e-4, err_msg=name)
        print('%s: OK' % name)
    print('ALL OK')

    if args.iterations > 1:
        # The verification run above counts as the first iteration.
        num_iterations = args.iterations - 1
        start = time.time()
        for _ in range(num_iterations):
            graph_module.run()
            cupy.cuda.device.Device().synchronize()
        elapsed = time.time() - start
        print('Elapsed: %.3f msec' % (elapsed * 1000 / num_iterations))
Example #13
0
def test_nms():
    """Run ``sym.nms`` on a fixed (1, 5, 6) input with the llvm target and check the result.

    Uses ``nms_threshold=0.7``, ``force_suppress=True``, ``nms_topk=2`` and a
    valid count of 4; the expected output is a hard-coded array.
    """
    dshape = (1, 5, 6)
    nms_threshold = 0.7
    force_suppress = True
    nms_topk = 2

    data = sym.Variable("data")
    valid_count = sym.Variable("valid_count", dtype="int32")
    net = sym.nms(data=data, valid_count=valid_count, nms_threshold=nms_threshold,
                  force_suppress=force_suppress, nms_topk=nms_topk)

    np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80],
                         [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79],
                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
    np_valid_count = np.array([4]).astype("int32")
    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
                           [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79],
                           [-1, -1, -1, -1, -1, -1]]])

    target = "llvm"
    ctx = tvm.cpu()
    input_shapes = {"data": dshape, "valid_count": (dshape[0],)}
    input_dtypes = {"data": "float32", "valid_count": "int32"}
    graph, lib, _ = nnvm.compiler.build(net, target, input_shapes, dtype=input_dtypes)
    runner = graph_runtime.create(graph, lib, ctx)
    runner.set_input(data=np_data, valid_count=np_valid_count)
    runner.run()
    result = runner.get_output(0, tvm.nd.empty(np_result.shape, "float32"))
    tvm.testing.assert_allclose(result.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
Example #14
0
def test_gru_like():
    """Check a dense -> 3-way split -> ``sigmoid + tanh * exp`` Relay function against NumPy."""

    def unit(rnn_dim):
        # Relay function taking X (1, rnn_dim) and a weight of shape (3 * rnn_dim, rnn_dim).
        X = relay.var("X", shape=(1, rnn_dim))
        W = relay.var("y", shape=(3 * rnn_dim, rnn_dim))
        projected = relay.nn.dense(X, W)
        parts = relay.split(projected, indices_or_sections=3, axis=1)
        gate = relay.sigmoid(parts[0])
        candidate = relay.tanh(parts[1])
        scale = relay.exp(parts[2])
        return relay.Function([X, W], gate + candidate * scale)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def unit_numpy(X, W):
        # NumPy counterpart of unit().
        projected = np.dot(X, W.transpose())
        first, second, third = np.split(projected, indices_or_sections=3, axis=1)
        return sigmoid(first) + np.tanh(second) * np.exp(third)

    dtype = "float32"
    rnn_dim = 1000
    x = np.random.rand(1, rnn_dim).astype(dtype)
    # Weights are drawn from [-0.005, 0.005).
    y = np.random.rand(3*rnn_dim, rnn_dim).astype(dtype) * 0.01 - 0.005
    out_shape = (1, rnn_dim)
    func = unit(rnn_dim)

    for target, ctx in ctx_list():
        with relay.build_config(opt_level=2):
            graph, lib, params = relay.build(func, target)
            runner = graph_runtime.create(graph, lib, ctx)
            runner.set_input("X", tvm.nd.array(x.astype(dtype)))
            runner.set_input("y", tvm.nd.array(y.astype(dtype)))
            runner.set_input(**params)
            runner.run()
            out_buf = tvm.nd.empty(out_shape, dtype)
            result = runner.get_output(0, out_buf).asnumpy()
            expected = unit_numpy(x, y)
            tvm.testing.assert_allclose(result, expected, rtol=1e-5, atol=1e-5)
Example #15
0
def test_mixed_precision():
    """Check an int8-input conv2d with int32 output against the topi reference.

    Data and kernel are random values drawn from [-127, 127) and cast to
    int8; the reference casts both to int32 before convolving.
    """
    dtype = "int8"
    out_dtype = "int32"
    dshape = (1, 3, 18, 18)
    kshape = (10, 3, 3, 3)
    oshape = (1, 10, 18, 18)
    shape_dict = {"x": dshape}
    dtype_dict = {"x": dtype}

    x = sym.Variable("x")
    net = sym.conv2d(x,
                     channels=10,
                     kernel_size=(3, 3),
                     name="y",
                     padding=(1, 1),
                     use_bias=False,
                     out_dtype=out_dtype)

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict, dtype_dict)
        runner = graph_runtime.create(graph, lib, ctx)
        data_nd = tvm.nd.array(np.random.uniform(-127, 127, size=dshape).astype(dtype))
        kernel_nd = tvm.nd.array(np.random.uniform(-127, 127, size=kshape).astype(dtype))
        runner.run(x=data_nd, y_weight=kernel_nd)
        result = runner.get_output(0, tvm.nd.empty(oshape, out_dtype))
        data_i32 = data_nd.asnumpy().astype(out_dtype)
        kernel_i32 = kernel_nd.asnumpy().astype(out_dtype)
        expected = topi.testing.conv2d_nchw_python(data_i32, kernel_i32, 1, 1)
        tvm.testing.assert_allclose(result.asnumpy(), expected, rtol=1e-5)
Example #16
0
def test_forward_where():
    """Compare MXNet's ``where`` with the NNVM graph converted from it.

    The condition is a fixed 2x2 array containing zero, positive and negative
    entries; ``x`` and ``y`` are random 2x2 float32 arrays.
    """
    dshape = (2, 2)
    dtype = 'float32'
    cond_sym = mx.sym.var('cond')
    x_sym = mx.sym.var('x')
    y_sym = mx.sym.var('y')
    mx_sym = mx.sym.where(cond_sym, x_sym, y_sym)

    np_cond = np.array([[0, 1], [-1, 0]]).astype(dtype)
    np_x = np.random.uniform(size=dshape).astype(dtype)
    np_y = np.random.uniform(size=dshape).astype(dtype)
    mx_cond = mx.nd.array(np_cond)
    mx_x = mx.nd.array(np_x)
    mx_y = mx.nd.array(np_y)

    # Bind a module only to obtain the (arg, aux) parameters for conversion.
    mod = mx.mod.Module(mx_sym, label_names=None, data_names=['cond', 'x', 'y'])
    mod.bind(data_shapes=[('cond', dshape), ('x', dshape), ('y', dshape)], for_training=False)
    mod.init_params()
    args, auxs = mod.get_params()

    mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy()
    out_shape = dshape
    new_sym, params = frontend.from_mxnet(mx_sym, args, auxs)
    shape_dict = {'cond': dshape, 'x': dshape, 'y': dshape}
    for target, ctx in ctx_list():
        with nnvm.compiler.build_config(opt_level=3):
            # params is rebound here, so the next target is built from this result.
            graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params)
        runner = graph_runtime.create(graph, lib, ctx)
        runner.set_input("cond", tvm.nd.array(np_cond))
        runner.set_input("x", tvm.nd.array(np_x))
        runner.set_input("y", tvm.nd.array(np_y))
        runner.set_input(**params)
        runner.run()
        out_buf = tvm.nd.empty(out_shape, dtype)
        tvm_out = runner.get_output(0, out_buf).asnumpy()
        tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
Example #17
0
def main():
    """Benchmark ResNet or MobileNet on a remote device reached over RPC.

    Parses ``--model``, ``--host``, ``--port``, ``--opt-level`` and
    ``--num-iter``. The network is built with ``tvm.target.rasp()``, the
    object file is uploaded to the remote, one warm-up inference is run, and
    the timing result is printed three times with a 45-second pause between
    rounds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True, choices=['resnet', 'mobilenet'],
        help="The model type.")
    parser.add_argument('--host', type=str, required=True, help="The host address of your Raspberry Pi.")
    parser.add_argument('--port', type=int, required=True, help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=50, help="Number of iteration during benchmark.")
    args = parser.parse_args()

    opt_level = args.opt_level

    num_iter = args.num_iter
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)
    # Pass the local batch_size so the workload and data_shape stay in sync.
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=batch_size, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=batch_size, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))


    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.rasp(), shape={"data": data_shape}, params=params)

    # Save the compiled library locally, then ship it to the remote device.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.o')
    lib.save(lib_fname)

    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)

    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

    module = runtime.create(graph, rlib, ctx)
    module.set_input('data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32")))
    module.set_input(**rparams)
    # Warm-up run; the output is fetched and converted once before timing starts.
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    num_rounds = 3
    for i in range(num_rounds):
        prof_res = ftimer()
        print(prof_res)
        # sleep for avoiding cpu overheat; pointless after the final round
        if i + 1 != num_rounds:
            time.sleep(45)
Example #18
0
def graph_to_function(graph, target, ctx, shape=None, dtype=None):
    """Convert a graph to a function taking a keyword args and returning a list of results
    (both args and results are numpy arrays).

    Example::

        fun = graph_to_function(graph, llvm, cpu(0))
        [res1, res2] = fun(x=np.zeros((1,2)), y=np.zeros((1,)))

    Parameters
    ----------
    graph : nnvm.graph.Graph
        A graph we want to convert to a function.

    target : str or :any:`tvm.target.Target`
        The build target

    ctx : TVMContext
        The context to deploy the module.

    shape : Dict[str, Tuple[int]], optional
        A dict mapping input variable names to shapes.
        By default shapes will be inferred from variables' attributes.
        Note that this parameter takes precedence over variables' attributes.

    dtype : Dict[str, str] or str, optional
        A dict mapping input variable names to dtypes, or just a single dtype.
        By default dtypes will be inferred from variables' attributes.
        Note that this parameter takes precedence over variables' attributes.

    Returns
    -------
    function : Callable[..., List[numpy.ndarray]]
        Runs the compiled module with the given keyword inputs and returns
        one array per graph output.

    Raises
    ------
    ValueError
        If, after inference, some input variable has no dtype or a falsy shape.
    """
    # Infer missing shapes and dtypes
    graph, shape, dtype, output_shapes, output_dtypes = \
        infer_shapes_dtypes(graph, shape=shape, dtype=dtype)

    if None in dtype.values():
        raise ValueError("Input variables with no type: {}".format(dtype))

    if not all(shape.values()):
        raise ValueError("Input variables with no shape: {}".format(shape))

    compute_graph, lib, params = nnvm.compiler.build(graph, target, shape=shape, dtype=dtype)
    module = graph_runtime.create(compute_graph, lib, ctx)

    if params:
        # The runtime method is `set_input`, as every other call site in this
        # file uses; `set_inputs` is not a runtime method.
        module.set_input(**params)

    def run(**kwargs):
        module.run(**kwargs)
        return [
            module.get_output(i, tvm.nd.empty(o_shape, o_dtype)).asnumpy()
            for i, (o_shape, o_dtype) in enumerate(zip(output_shapes, output_dtypes))
        ]

    return run
Example #19
0
def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
    """Build ``sym`` for ``target``, run it once on ``data``, and return the result.

    Returns
    -------
    (numpy.ndarray, graph)
        Output 0 of the compiled graph and the compiled graph itself.
    """
    input_shapes = {"data": data.shape}
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(sym, target, shape=input_shapes, params=params)
    executor = graph_runtime.create(graph, lib, ctx)
    executor.set_input(**params)
    executor.set_input("data", data)
    executor.run()
    result = executor.get_output(0, tvm.nd.empty(out_shape))
    return result.asnumpy(), graph
Example #20
0
def build_and_run(sym, params, data, out_shape):
    """Build ``sym`` for the "llvm" target, run it on CPU 0, and return output 0 as NumPy."""
    ctx = tvm.cpu(0)
    input_shapes = {"data": data.shape}
    graph, lib, params = nnvm.compiler.build(sym, "llvm", shape=input_shapes, params=params)
    executor = runtime.create(graph, lib, ctx)
    executor.set_input(**params)
    executor.set_input("data", data)
    executor.run()
    result = executor.get_output(0, tvm.nd.empty(out_shape))
    return result.asnumpy()
Example #21
0
def main():
    """Benchmark ResNet or MobileNet on a local cuda/rocm/opencl/metal device.

    Parses ``--model``, ``--target``, ``--opt-level``, ``--num-iter`` and
    ``--repeat``. After one warm-up inference the timing result is printed
    ``--repeat`` times, with a 45-second pause between rounds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--target', type=str, required=True,
                        choices=['cuda', 'rocm', 'opencl', 'metal'],
                        help="Compilation target.")
    parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.")
    parser.add_argument('--repeat', type=int, default=1, help="Number of repeative times.")
    args = parser.parse_args()

    opt_level = args.opt_level
    num_iter = args.num_iter
    ctx = tvm.context(args.target, 0)

    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)
    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)

    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    # cuda gets a larger unroll budget and no explicit unrolling.
    is_cuda = args.target == "cuda"
    unroll = 1400 if is_cuda else 128
    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.build_config(auto_unroll_max_step=unroll,
                              unroll_explicit=(args.target != "cuda")):
            graph, lib, params = nnvm.compiler.build(
                net, args.target, shape={"data": data_shape}, params=params)

    random_input = np.random.uniform(-1, 1, size=data_shape).astype("float32")
    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", random_input)
    # Warm-up run; the output is fetched and converted once before timing starts.
    module.run()
    warmup_out = module.get_output(0, tvm.nd.empty(out_shape))
    warmup_out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    last_round = args.repeat - 1
    for round_idx in range(args.repeat):
        prof_res = ftimer()
        print(prof_res)
        # sleep for avoiding device overheat
        if round_idx != last_round:
            time.sleep(45)
Example #22
0
 def check_verify():
     """Build the "myadd" kernel for llvm and check that the graph returns ``x + 1``.

     Returns early with a message when the llvm backend is not enabled.
     Relies on ``s``, ``A``, ``B``, ``graph`` and ``n`` from the enclosing scope.
     """
     if not tvm.module.enabled("llvm"):
         print("Skip because llvm is not enabled")
         return
     host_lib = tvm.build(s, [A, B], "llvm", name="myadd")
     executor = graph_runtime.create(graph, host_lib, tvm.cpu(0))
     x_np = np.random.uniform(size=(n,)).astype(A.dtype)
     executor.run(x=x_np)
     result = executor.get_output(0, tvm.nd.empty((n,)))
     np.testing.assert_equal(result.asnumpy(), x_np + 1)
Example #23
0
 def verify(graph, lib):
     """Run ``graph`` on CPU 0 with random ``x`` and ``y`` and check it equals ``exp(x + y)``.

     Relies on ``shape`` and ``dtype`` from the enclosing scope.
     """
     executor = graph_runtime.create(graph, lib, tvm.cpu(0))
     # random inputs
     x_nd = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
     y_nd = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
     executor.run(x=x_nd, y=y_nd)
     # compare output 0 with the NumPy reference
     result = executor.get_output(0, tvm.nd.empty(shape, dtype))
     expected = np.exp(x_nd.asnumpy() + y_nd.asnumpy())
     tvm.testing.assert_allclose(result.asnumpy(), expected)
Example #24
0
 def get_tvm_output(xs, target, ctx, dtype='float32'):
     """Convert ``keras_model`` to Relay, run it on ``xs``, and return every output as NumPy.

     ``xs`` is paired positionally with ``keras_model.input_names``; the
     model itself comes from the enclosing scope.
     """
     input_names = keras_model.input_names
     shape_dict = {}
     for name, x in zip(input_names, xs):
         shape_dict[name] = x.shape
     func, params = relay.frontend.from_keras(keras_model, shape_dict)
     with relay.build_module.build_config(opt_level=2):
         graph, lib, params = relay.build(func, target, params=params)
     executor = graph_runtime.create(graph, lib, ctx)
     for name, x in zip(keras_model.input_names, xs):
         executor.set_input(name, tvm.nd.array(x.astype(dtype)))
     executor.set_input(**params)
     executor.run()
     outputs = []
     for idx in range(executor.get_num_outputs()):
         outputs.append(executor.get_output(idx).asnumpy())
     return outputs
Example #25
0
def test_num_outputs():
    """A five-way split along axis 1 must yield a runtime module with five outputs."""
    shape = (10, 10)
    dtype = tvm.float32
    x = sym.Variable('x')
    split = sym.split(x, indices_or_sections=5, axis=1)
    x_nd = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
    # The input is supplied as a parameter at build time rather than at run time.
    params = {"x": x_nd}
    graph, lib, params = nnvm.compiler.build(
        split, "llvm", shape={"x": x_nd.shape}, params=params)
    executor = graph_runtime.create(graph, lib, tvm.cpu(0))
    assert executor.get_num_outputs() == 5
Example #26
0
def get_tvm_output(func, x, params, target, ctx,
                   out_shape=(1, 1000), input_name='image', dtype='float32'):
    """Build a Relay function at ``opt_level=3``, run it on ``x``, and return output 0 as NumPy.

    ``x`` is cast to ``dtype`` and bound to ``input_name``; the output buffer
    is allocated with ``out_shape`` and ``dtype``.
    """
    with relay.build_module.build_config(opt_level=3):
        graph, lib, params = relay.build(func, target, params=params)
    executor = graph_runtime.create(graph, lib, ctx)
    # feed the input and the built parameters
    input_nd = tvm.nd.array(x.astype(dtype))
    executor.set_input(input_name, input_nd)
    executor.set_input(**params)
    executor.run()
    # fetch output 0
    result = executor.get_output(0, tvm.nd.empty(out_shape, dtype))
    return result.asnumpy()
Example #27
0
 def run_test_conv2d(sym, dtype, dshape, kshape, oshape, shape_dict, padding):
     """Build ``sym`` on every target and compare it with the topi conv2d reference plus bias.

     The graph is fed random ``x``, ``y_weight`` and ``y_bias``; the reference
     uses stride 1 and the given ``padding``.
     """
     for target, ctx in ctx_list():
         graph, lib, _ = nnvm.compiler.build(sym, target, shape_dict)
         runner = graph_runtime.create(graph, lib, ctx)
         data_nd = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
         kernel_nd = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
         bias_nd = tvm.nd.array(np.random.uniform(size=kshape[0]).astype(dtype))
         runner.run(x=data_nd, y_weight=kernel_nd, y_bias=bias_nd)
         result = runner.get_output(0, tvm.nd.empty(oshape, dtype))
         expected = topi.testing.conv2d_nchw_python(
             data_nd.asnumpy(), kernel_nd.asnumpy(), 1, padding)
         # One bias value per output channel, broadcast over the spatial axes.
         expected = expected + bias_nd.asnumpy().reshape(kshape[0], 1, 1)
         tvm.testing.assert_allclose(result.asnumpy(), expected, rtol=1e-5)
Example #28
0
def verify_reduce(dshape, fnp, fsym, **kwargs):
    """Check a reduction symbol against its NumPy counterpart.

    The compiled graph computes ``fsym(x + 1, **kwargs)``; the reference is
    ``fnp(data + 1, **kwargs)`` on random float32 ``data`` of shape
    ``dshape``.  Both must agree within 1e-5 (absolute and relative) for
    every target returned by ``ctx_list()``.
    """
    dtype = "float32"
    placeholder = sym.Variable("x")
    reduced = fsym(placeholder + 1, **kwargs)

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(reduced, target, {"x": dshape})
        executor = graph_runtime.create(graph, lib, ctx)

        # Random input and the NumPy reference computed from it.
        data = np.random.uniform(size=dshape).astype(dtype)
        expected = fnp(data + 1, **kwargs)

        executor.run(x=data)
        out_buffer = tvm.nd.empty(expected.shape)
        actual = executor.get_output(0, out_buffer).asnumpy()
        np.testing.assert_allclose(actual, expected, atol=1e-5, rtol=1e-5)
Example #29
0
def get_tvm_output(symbol, x, params, target, ctx,
                   out_shape=(1, 1000), input_name='image', dtype='float32'):
    """Compile an NNVM symbol, run it once and return output 0 as NumPy.

    The input shape is taken from ``x.shape`` and the graph is built at
    ``opt_level=2``.  ``x`` is cast to ``dtype`` before being bound to
    ``input_name``; the parameters returned by the build are bound as well.
    """
    with nnvm.compiler.build_config(opt_level=2):
        graph, lib, built_params = nnvm.compiler.build(
            symbol, target, {input_name: x.shape}, params=params)

    executor = graph_runtime.create(graph, lib, ctx)

    # Bind the input tensor first, then the compiled parameters.
    input_nd = tvm.nd.array(x.astype(dtype))
    executor.set_input(input_name, input_nd)
    executor.set_input(**built_params)
    executor.run()

    out_buffer = tvm.nd.empty(out_shape, dtype)
    return executor.get_output(0, out_buffer).asnumpy()
 def check_load_module():
     """Round-trip the built module through disk and verify the result.

     Exports ``mhost`` as ``deploy.so`` and writes ``graph`` to
     ``deploy.json`` in a temporary directory, loads both back, runs the
     graph runtime with ``params`` and checks output 0 against
     ``tensor_a + tensor_b - tensor_c + tensor_d``.  ``mhost``, ``graph``,
     ``ctx``, ``params``, ``shape`` and the tensors are free variables
     resolved from the surrounding scope.

     Raises:
         AssertionError: if the reloaded module's output differs from the
             expected tensor expression.
     """
     temp = util.tempdir()
     path_lib = temp.relpath("deploy.so")
     path_graph = temp.relpath("deploy.json")

     mhost.export_library(path_lib)
     with open(path_graph, "w") as out_file:
         out_file.write(graph)

     loaded_lib = tvm.module.load(path_lib)
     # Read through a context manager so the handle is closed promptly
     # instead of being left to the garbage collector.
     with open(path_graph) as in_file:
         loaded_graph = in_file.read()

     mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
     mod.set_input(**params)
     mod.run()
     out = mod.get_output(0, tvm.nd.empty(shape))
     np.testing.assert_equal(
         out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
def tune_and_evaluate(tuning_opt):
    """Extract the conv2d tuning tasks of the network and list them.

    The function connects to a remote (or a local RPC session when
    ``env.TARGET == "sim"``), extracts ``nn.conv2d`` tasks from the compiled
    Relay program and prints one summary tuple per task.  It then returns
    early on purpose: the tuning, compilation, upload and timing code that
    follows the bare ``return`` is unreachable until that line is commented
    out.

    Relies on names from the surrounding module: ``env``, ``target``,
    ``network``, ``start_pack``, ``stop_pack``, ``tracker_host``,
    ``tracker_port``, ``log_file`` and ``device``.

    Parameters
    ----------
    tuning_opt : dict
        Expanded as keyword arguments to ``tune_tasks`` (only used by the
        currently unreachable tuning step).
    """

    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(mod,
                                              params=params,
                                              ops=(relay.op.get("nn.conv2d"),),
                                              target=target,
                                              target_host=env.target_host)

    # filter out non-packed conv2d task
    # (keeps only tasks whose first argument's shape has more than 4 entries)
    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        # args[0][1] / args[1][1] are the input and weight shape tuples.
        # NOTE(review): the index arithmetic below presumably undoes a packed
        # 6-D layout (outer dim * inner tile) -- confirm against the VTA docs.
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
            batch, height, width, in_filter, out_filter, hkernel, wkernel,
            hpad, wpad, hstride, wstride))

    # We do not run the tuning in our webpage server since it takes too long.
    # Comment the following line to run it by yourself.
    return

    # ---- Everything below is unreachable while the ``return`` above stays ----

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        # Both branches build with AlterOpLayout disabled; they differ only
        # in which pass/build context wraps ``relay.build``.
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(relay_prog,
                                                target=target,
                                                params=params,
                                                target_host=env.target_host)
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=env.target_host)

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        # From here on ``lib`` refers to the module loaded on the remote.
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Example #32
0
def test_full():
    """Verify the constant-fill operators on every available target.

    Covers ``full_like``, ``ones_like`` and ``zeros_like`` (which take a
    ``data`` input) and ``full``, ``ones`` and ``zeros`` (which take none).
    Each compiled result is compared with ``np.full`` at 1e-5 tolerance.
    """
    shape = (3, 4, 5)
    value = 7
    dtype = "float32"

    def _build_run_check(symbol, target, ctx, expected_fill, takes_data):
        # Build ``symbol``, execute it once and compare output 0 with a
        # constant tensor of ``expected_fill``.  ``takes_data`` selects the
        # variant that declares and feeds the "data" input.
        if takes_data:
            graph, lib, _ = nnvm.compiler.build(symbol, target, {"data": shape})
        else:
            graph, lib, _ = nnvm.compiler.build(symbol, target)
        executor = graph_runtime.create(graph, lib, ctx)
        if takes_data:
            executor.run(data=np.random.uniform(size=shape).astype(dtype))
        else:
            executor.run()
        result = executor.get_output(0, tvm.nd.empty(shape, dtype=dtype))
        reference = np.full(shape, fill_value=expected_fill, dtype=dtype)
        np.testing.assert_allclose(result.asnumpy(), reference,
                                   atol=1e-5, rtol=1e-5)

    for target, ctx in ctx_list():
        data = sym.Variable("data", dtype=dtype)

        # Operators that mirror the shape of an existing tensor.
        _build_run_check(
            sym.full_like(data=data, fill_value=value, name="s"),
            target, ctx, value, True)
        _build_run_check(
            sym.ones_like(data=data, fill_value=value, name="s"),
            target, ctx, 1, True)
        _build_run_check(
            sym.zeros_like(data=data, fill_value=value, name="s"),
            target, ctx, 0, True)

        # Operators that create a tensor from an explicit shape.
        _build_run_check(
            sym.full(shape=shape, dtype=dtype, fill_value=value, name="s"),
            target, ctx, value, False)
        _build_run_check(
            sym.ones(shape=shape, dtype=dtype, name="s"),
            target, ctx, 1, False)
        _build_run_check(
            sym.zeros(shape=shape, dtype=dtype, name="s"),
            target, ctx, 0, False)
Example #33
0
# Build a Relay module, run it on the CPU graph runtime and report timing.
# ``mod``, ``params``, ``tvm_targets``, ``target_host``, ``input_tensor``,
# ``image_data``, ``model_name`` and ``device`` come from earlier in the script.

# NOTE(review): ``cpudevice`` is not referenced again in this snippet.
cpudevice = tvm.runtime.cpu()
ctx = tvm.runtime.context("cpu")

with tvm.transform.PassContext(opt_level=3):
    graph_mod = relay.build(mod,
                            tvm_targets,
                            params=params,
                            target_host=target_host)

# Pull the three build artifacts out of the returned module object.
lib = graph_mod.get_lib()
params = graph_mod.get_params()
graph = graph_mod.get_json()

# Create a runtime executor module
module = graph_runtime.create(graph, lib, tvm.cpu())

# Feed input data
module.set_input(input_tensor, tvm.nd.array(image_data))

# Feed related params
module.set_input(**params)

# Time the "run" function: 10 repeats of 1 run each.
ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
prof_res = np.array(
    ftimer().results) * 1000  # multiply 1000 for converting to millisecond
# Columns: model name, device, mean time, (standard deviation).
print("%-20s %-7s %-19s (%s)" %
      (model_name, device, "%.2f ms" % np.mean(prof_res),
       "%.2f ms" % np.std(prof_res)))
# NOTE(review): the build above uses ``tvm_targets`` (plural) while this
# prints ``tvm_target`` -- confirm both names exist earlier in the script.
print(tvm_target)
def main():
    """Classify a test image with ResNet-18 compiled by Relay and run via RPC.

    Fetches a pretrained ``resnet18_v1`` model, a cat image and the ImageNet
    label table, appends a softmax to the converted Relay function, builds it
    for the selected target, uploads the library to an RPC session, runs one
    inference and prints the top-1 label.

    The input shape handed to the frontend is taken from the transformed
    image itself (``x.shape``).
    """
    # one line to get the model
    block = get_model('resnet18_v1', pretrained=True)
    # test model
    img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
    img_name = 'cat.png'
    img_path = download_testdata(img_url, img_name, module='data')
    image = Image.open(img_path).resize((224, 224))

    x = transform_image(image)

    # label number to word dict prepped with synset
    synset_url = ''.join([
        'https://gist.githubusercontent.com/zhreshold/',
        '4d0b62f3d01426887599d4f7ede23ee5/raw/',
        '596b27d23537e5a1b5751d2b0481ef172f58b539/',
        'imagenet1000_clsid_to_human.txt'
    ])
    synset_name = 'imagenet1000_clsid_to_human.txt'
    synset_path = download_testdata(synset_url, synset_name, module='data')
    with open(synset_path) as f:
        # SECURITY NOTE(review): eval() executes whatever the downloaded file
        # contains.  If the file is a plain dict literal, ast.literal_eval
        # would be a safe replacement -- confirm the format before switching.
        synset = eval(f.read())

    # Port Gluon model to portable computational graph
    shape_dict = {'data': x.shape}
    mod, params = relay.frontend.from_mxnet(block, shape_dict)
    # we want a probability so add a softmax operator
    func = mod["main"]
    func = relay.Function(func.params, relay.nn.softmax(func.body), None,
                          func.type_params, func.attrs)

    # compile the graph to run on RaspPi modelB
    # Flip to True to build for and run on the local machine instead.
    local_demo = False

    if local_demo:
        target = tvm.target.create('llvm')
    else:
        target = tvm.target.arm_cpu('rasp3b')

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(func, target, params=params)

    # Save the library at local temporary directory.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    # RPC server is running on the Rasp Pi.
    # Get the IP address of the Rasp Pi and connect to the machine to run the
    # net compiled here with Relay.

    # obtain an RPC session from remote device.
    if local_demo:
        remote = rpc.LocalSession()
    else:
        # The following is my environment, change this to the IP address of
        # your target device
        host = '192.168.0.10'
        port = 9090
        remote = rpc.connect(host, port)

    # upload the library to remote device and load it
    remote.upload(lib_fname)
    rlib = remote.load_module('net.tar')

    # create the remote runtime module
    ctx = remote.cpu(0)
    module = runtime.create(graph, rlib, ctx)
    # set parameter (upload params to the remote device. This may take a while)
    module.set_input(**params)
    # set input data
    module.set_input('data', tvm.nd.array(x.astype('float32')))
    # run
    module.run()
    # get output
    out = module.get_output(0)
    # get top1 result
    top1 = np.argmax(out.asnumpy())
    print('TVM prediction top-1: {}'.format(synset[top1]))
def deploy_rpc():
    """Deploy the demo model to a JavaScript RPC server and run it remotely.

    The network is compiled for an OpenGL device with an Emscripten host,
    saved into a temporary directory, uploaded through the RPC proxy at
    ``localhost:9090`` (key ``"js"``), executed once on random input, and the
    first ten values of the first output row are printed.
    """
    from tvm import rpc
    from tvm.contrib import util, emscripten

    # Start from the resnet18 model, as in the other demos.
    net, params, data_shape, out_shape = load_mxnet_resnet()

    # The targets differ from a local build: the host library has to be
    # translated into JavaScript through Emscripten.
    graph, lib, params = compile_net(
        net,
        target_host="llvm -target=asmjs-unknown-emscripten -system-lib",
        target="opengl",
        data_shape=data_shape,
        params=params)

    # Deployment goes through RPC, so first we need to prepare the module
    # files locally.
    print("Saving the compiled module...")

    workdir = util.tempdir()
    host_bitcode = workdir.relpath("deploy.bc")   # host LLVM part
    host_js = workdir.relpath("deploy.js")        # host JavaScript part
    device_glsl = workdir.relpath("deploy.gl")    # device GLSL part
    meta_json = workdir.relpath("deploy.tvm_meta.json")

    lib.save(host_bitcode)
    emscripten.create_js(host_js, host_bitcode, side_module=True)
    lib.imported_modules[0].save(device_glsl)

    print("- Saved files:", workdir.listdir())

    # Open the RPC connection.
    print("Connecting to RPC server...")
    remote = rpc.connect('localhost', 9090, key="js")
    print("- Connected to RPC server!")

    # Push the artifacts; the JavaScript host part is renamed on upload.
    print("Uploading module to RPC server...")
    remote.upload(host_js, "deploy.dso")
    for local_path in (device_glsl, meta_json):
        remote.upload(local_path)
    print("- Upload completed!")

    # Load the device part, then the host part, and link them together.
    print("Loading remote library...")
    device_module = remote.load_module("deploy.gl")
    rlib = remote.load_module("deploy.dso")
    rlib.import_module(device_module)
    print("- Remote library loaded!")

    ctx = remote.opengl(0)

    # Copy every parameter onto the remote device.
    print("Uploading parameters...")
    rparams = {}
    for name, value in params.items():
        rparams[name] = tvm.nd.array(value, ctx)
    print("- Parameters uploaded!")

    # Instantiate the graph runtime on the remote side.
    print("Running remote module...")
    from tvm.contrib import graph_runtime
    module = graph_runtime.create(graph, rlib, ctx)

    # Bind parameters, then a random float32 input.
    module.set_input(**rparams)
    input_data = np.random.uniform(size=data_shape)
    module.set_input('data', tvm.nd.array(input_data.astype('float32')))

    # Execute once.
    module.run()
    print("- Remote module execution completed!")

    result = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx))
    # Print first 10 elements of output.
    print(result.asnumpy()[0][0:10])
Example #36
0
    # Benchmark the compiled graph on CUDA device 0.  ``output``,
    # ``data_shape`` and ``generate_random_parameters`` are defined outside
    # this fragment.
    # Runs per timing measurement (passed as ``number`` below).
    trials = 50

    compute_graph = nnvm.graph.create(output)
    ctx = tvm.device("cuda", 0)
    # Random parameters; ``with_input=True`` means the "data" entry is
    # generated too, which is why it can be read back right below.
    params = generate_random_parameters(compute_graph,
                                        "data",
                                        data_shape,
                                        with_input=True,
                                        context=ctx)
    input_data = params["data"]
    deploy_graph, lib, params = nnvm.compiler.build(compute_graph,
                                                    target="cuda",
                                                    shape={"data": data_shape},
                                                    params=params)
    # print(deploy_graph.ir())
    module = graph_runtime.create(deploy_graph, lib, ctx)

    # warm-up
    module.run(data=input_data)
    # NOTE: this rebinds ``output`` (previously the graph's output symbol)
    # to the runtime's first output.
    output = module.get_output(0, None)
    # print(output.asnumpy())

    # 10 repeats of ``trials`` runs each.
    time_evaluator = module.module.time_evaluator("run",
                                                  ctx,
                                                  number=trials,
                                                  repeat=10)

    # ``mean`` is scaled by 1e3 and reported with an "ms" label below.
    time_cost = time_evaluator().mean * 1e3

    print("time_cost=", time_cost, "ms")
Example #37
0
File: resnet.py Project: zhiics/tvm
######################################################################
# Build the ResNet Runtime
# ------------------------
# Build the ResNet graph runtime, and configure the parameters.
# ``remote``, ``data_dir``, ``graph_fn``, ``params_fn`` and
# ``generate_graph`` are defined earlier in the tutorial.

# Set ``device=vtacpu`` to run inference on the CPU
# or ``device=vta`` to run inference on the FPGA.
device = "vta"

# Device context
# (the remote's external device 0 for "vta", otherwise the remote's CPU 0)
ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

# Build the graph runtime
graph, lib, params = generate_graph(os.path.join(data_dir, graph_fn),
                                    os.path.join(data_dir, params_fn), device)
m = graph_runtime.create(graph, lib, ctx)

# Set the parameters
m.set_input(**params)

######################################################################
# Run ResNet-18 inference on a sample image
# -----------------------------------------
# Perform image classification on test image.
# You can change the test image URL to any image of your choosing.

# URL of the test image
image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg'
# Download the image and resize it to 224x224
response = requests.get(image_url)
image = Image.open(BytesIO(response.content)).resize((224, 224))
Example #38
0
def test_one_time(one_time_length=1000,
                  Test_sparse=True,
                  image_shape=(3, 32, 32)):
    """Time repeated remote runs of a single convolution layer.

    Builds either ``sym.conv2d_sparse`` (12 channels) or ``sym.conv2d``
    (10 channels) on a batch of one image, compiles it with NNVM for the
    Mali target, uploads the library to the RPC server at
    ``59.78.6.204:9090`` and runs it on the remote OpenCL device 0.

    Parameters
    ----------
    one_time_length : int
        Number of consecutive ``module.run()`` calls that are timed.
    Test_sparse : bool
        Select the sparse convolution (True) or the dense one (False).
    image_shape : tuple
        Shape of one input image without the batch dimension.

    Returns
    -------
    float
        Wall-clock seconds spent in the timed loop (also printed).
    """
    # Hyper-parameter define
    batch_size = 1
    data_shape = (batch_size, ) + image_shape
    sparse_kernel_shape = (batch_size, 12)
    dtype = "float32"

    data = sym.Variable("data")
    sparse_kernel = sym.Variable("sparse_kernel",
                                 init=np.random.randint(
                                     0, 2, sparse_kernel_shape).astype(dtype))
    if Test_sparse:
        y1 = sym.conv2d_sparse(data=data,
                               sparsity=sparse_kernel,
                               channels=12,
                               kernel_size=(3, 3),
                               padding=(0, 0),
                               use_bias=False,
                               out_layout='NCHW')
    else:
        y1 = sym.conv2d(data=data,
                        channels=10,
                        kernel_size=(3, 3),
                        padding=(0, 0),
                        use_bias=False,
                        out_layout='NCHW')
    # A classifier head is intentionally left out for now:
    # y = sym.flatten(y1)
    # y = sym.dense(y, units=10, use_bias=False)
    # y = sym.softmax(y)
    out = y1

    # Create workload
    net, params = create_sparse_workload(out, batch_size, image_shape, dtype)

    # Test Forward
    # NNVM-compiler build
    opt_level = 0
    target = tvm.target.mali()
    target_host = "llvm -target=aarch64-linux-gnu"
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(net,
                                                 target=target,
                                                 shape={"data": data_shape},
                                                 params=params,
                                                 target_host=target_host)

    # Ship the compiled library to the remote device.
    tmp = util.tempdir()
    lib_fname = tmp.relpath("net.tar")
    lib.export_library(lib_fname)
    remote = rpc.connect('59.78.6.204', 9090)
    remote.upload(lib_fname)
    rlib = remote.load_module("net.tar")

    ctx = remote.cl(0)

    # create random input
    real_data = np.random.uniform(-1, 1, size=data_shape).astype(dtype)
    # Fixed alternating mask; a random alternative would be
    # np.random.randint(0, 2, sparse_kernel_shape).astype(dtype)
    real_sparse_kernel = np.array(([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
                                     1]])).astype(dtype)

    # create module
    module = graph_runtime.create(graph, rlib, ctx)
    # set input and parameters
    module.set_input("data", real_data)
    if Test_sparse:
        module.set_input("sparse_kernel", real_sparse_kernel)
    # The compiled parameters belong to both variants of the graph, so bind
    # them unconditionally; previously the dense path ran without them.
    module.set_input(**params)

    # run
    starttime = time.time()
    for _ in range(one_time_length):
        module.run()
    elapsed = time.time() - starttime
    print(elapsed)

    # get output
    out = module.get_output(0)
    # convert to numpy (result intentionally unused)
    out.asnumpy()

    return elapsed
Example #39
0
def main():
    """Run PoseNet over a directory of images and report per-stage FPS.

    All configuration comes from the module-level ``args``.  For every
    ``.png``/``.jpg`` file in ``args.image_dir`` the image is read, passed
    through either the PyTorch model or a TVM graph runtime
    (``args.use_tvm``), decoded with the "multi" or "single" decoder, and
    optionally drawn/saved to ``args.output_dir``.  Average pre-processing,
    inference and post-processing rates are printed at the end.

    Raises:
        NotImplementedError: if ``args.decoder`` is neither "multi" nor
            "single".
    """
    model = posenet.load_model(args.model)
    model = model.to(DEVICE).eval()
    output_stride = model.output_stride

    if args.output_dir:
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

    # Regular files directly inside image_dir with a .png or .jpg suffix.
    filenames = [
        f.path
        for f in os.scandir(args.image_dir)
        if f.is_file() and f.path.endswith((".png", ".jpg"))
    ]

    if args.use_tvm:
        # Imported lazily so TVM is only required when it is actually used.
        import tvm
        from tvm.contrib import graph_runtime

        # Graph JSON (text), compiled library and serialized params (binary).
        with open(args.tvm_graph) as f:
            tvm_graph = f.read()
        tvm_lib = tvm.runtime.load_module(args.tvm_lib)
        with open(args.tvm_params, "rb") as f:
            tvm_params = bytearray(f.read())
        ctx = tvm.cpu()
        module = graph_runtime.create(tvm_graph, tvm_lib, ctx)
        module.load_params(tvm_params)

    # Per-image durations of the three stages, as measured with ``now()``.
    preprocessing_time = []
    inference_time = []
    processing_time = []

    for filename in tqdm(filenames, desc="Processed", unit="files"):
        start = now()
        input_image, draw_image, output_scale = posenet.read_imgfile(
            filename,
            scale_factor=args.scale_factor,
            output_stride=output_stride,
            resize=(args.processing_height, args.processing_width)
            if args.resize
            else None,
        )
        preprocessing_time.append(now() - start)

        start = now()
        with torch.no_grad():
            if args.use_tvm:
                input_data = tvm.nd.array(input_image)
                module.run(**{args.input_name: input_data})
                # Convert every runtime output to a torch tensor on DEVICE,
                # dropping the leading dimension.
                out = []
                for idx in range(module.get_num_outputs()):
                    res = (
                        torch.Tensor(module.get_output(idx).asnumpy())
                        .squeeze(0)
                        .to(DEVICE)
                    )
                    out.append(res)

            else:
                input_image = torch.Tensor(input_image).to(DEVICE)

                out = []
                for idx, res in enumerate(model(input_image)):
                    out.append(res.squeeze(0))

            inference_time.append(now() - start)

            # The model is expected to yield exactly four outputs.
            (
                heatmaps_result,
                offsets_result,
                displacement_fwd_result,
                displacement_bwd_result,
            ) = out

            start = now()
            if args.decoder == "multi":
                (
                    pose_scores,
                    keypoint_scores,
                    keypoint_coords,
                ) = posenet.decode_multiple_poses(
                    heatmaps_result,
                    offsets_result,
                    displacement_fwd_result,
                    displacement_bwd_result,
                    output_stride,
                    max_pose_detections=10,
                    min_pose_score=0.25,
                )
            elif args.decoder == "single":
                (keypoints, pose_score, keypoint_scores) = posenet.decode_single_pose(
                    heatmaps_result, offsets_result, output_stride
                )
                # Wrap the single result in length-1 arrays so the code below
                # can treat both decoders alike.
                pose_scores = np.asarray([pose_score])
                keypoint_scores = np.asarray([keypoint_scores])
                keypoint_coords = np.asarray([keypoints])

            else:
                raise NotImplementedError(
                    "The decoder {} is not implemented.".format(args.decoder)
                )
            processing_time.append(now() - start)

        # Scale coordinates by the factor returned from read_imgfile.
        keypoint_coords *= output_scale

        if args.output_dir:
            draw_image = posenet.draw_skel_and_kp(
                draw_image,
                pose_scores,
                keypoint_scores,
                keypoint_coords,
                min_pose_score=0.25,
                min_part_score=0.25,
            )

            # Output keeps the file's path relative to image_dir.
            cv2.imwrite(
                os.path.join(
                    args.output_dir, os.path.relpath(filename, args.image_dir)
                ),
                draw_image,
            )
            if args.save_keypoints:
                with open(
                    os.path.join(
                        args.output_dir,
                        os.path.relpath(filename, args.image_dir) + ".npy",
                    ),
                    "wb",
                ) as outfile:
                    np.save(
                        outfile,
                        list(zip(pose_scores, keypoint_scores, keypoint_coords)),
                    )

        if args.verbose:
            print("Results for image: %s" % filename)
            for point_idx in range(len(pose_scores)):
                # Stop at the first pose with a zero score.
                if pose_scores[point_idx] == 0.0:
                    break
                print("Pose #%d, score = %f" % (point_idx, pose_scores[point_idx]))
                for keypoint_idx, (score, coord) in enumerate(
                    zip(keypoint_scores[point_idx, :], keypoint_coords[point_idx, :, :])
                ):
                    print(
                        "Keypoint %s, score = %f, coord = %s"
                        % (posenet.PART_NAMES[keypoint_idx], score, coord)
                    )

    # NOTE(review): with no matching images these lists are empty and the
    # means are NaN -- confirm whether that case needs handling.
    avg_preprocessing_time = np.mean(preprocessing_time)
    avg_postprocessing_time = np.mean(processing_time)
    avg_inference_time = np.mean(inference_time)
    print("=" * 80)
    print(
        "Decoder: {}, TVM Runtime: {}, Resize to {}x{} HxW: {}".format(
            args.decoder,
            "enabled" if args.use_tvm else "disabled",
            args.processing_height,
            args.processing_width,
            "enabled" if args.resize else "disabled",
        )
    )
    print("-" * 80)

    # FPS is the reciprocal of the average per-image duration.
    print("Average pre-processing FPS: {:.2f}".format(1 / avg_preprocessing_time))
    print("Average inference FPS: {:.2f}".format(1 / avg_inference_time))
    print("Average post-processing FPS: {:.2f}".format(1 / avg_postprocessing_time))
    print(
        "Average FPS: {:.2f}".format(
            1 / (avg_postprocessing_time + avg_inference_time + avg_preprocessing_time)
        )
    )
Example #40
0
def main():
    """Benchmark ResNet or MobileNet on a Raspberry Pi through TVM RPC.

    Parses ``--model``, ``--host``, ``--port``, ``--opt-level`` and
    ``--num-iter`` from the command line, builds the chosen NNVM workload
    for an ARMv7/Cortex-A53 target, uploads the object file to the RPC
    server, runs it once, and then prints three timing measurements with a
    45 second pause between them.

    Raises:
        ValueError: if the model name is not one of the supported workloads
            (defensive; argparse already restricts the choices).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--host',
                        type=str,
                        required=True,
                        help="The host address of your Raspberry Pi.")
    parser.add_argument('--port',
                        type=int,
                        required=True,
                        help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level',
                        type=int,
                        default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter',
                        type=int,
                        default=50,
                        help="Number of iteration during benchmark.")
    args = parser.parse_args()

    opt_level = args.opt_level
    target = "llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon"

    num_iter = args.num_iter
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)

    data_shape = (batch_size, ) + image_shape
    out_shape = (batch_size, num_classes)
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(batch_size=1,
                                                       image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.target.rasp():
            graph, lib, params = nnvm.compiler.build(
                net, target, shape={"data": data_shape}, params=params)

    # Save the object file locally and push it to the device.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.o')
    lib.save(lib_fname)

    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)

    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    # Copy the parameters onto the remote CPU before binding them.
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

    module = runtime.create(graph, rlib, ctx)
    module.set_input(
        'data',
        tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32")))
    module.set_input(**rparams)
    # One untimed run whose output is fetched and copied to the host.
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    num_measurements = 3
    for measurement in range(num_measurements):
        prof_res = ftimer()
        print(prof_res)
        # sleep for avoiding cpu overheat -- only needed between
        # measurements, so skip the pointless wait after the last one
        if measurement + 1 < num_measurements:
            time.sleep(45)
Example #41
0
def tracer(module, info, is_before):
    """Pass-trace callback that currently does nothing.

    It accepts the ``(module, info, is_before)`` arguments of a trace hook
    and ignores them.  The disabled timing variant is kept below for
    reference.
    """
    # Disabled per-pass timing:
    #global timing
    #if bool(is_before):
    #    timing = time.time()
    #else:
    #    print('Executes: ', info.name, (time.time() - timing) * 1000)
    return None


# Build with a custom lowering pass, run once on the GPU and print throughput.
# ``tensorizer``, ``runtime``, ``module`` (the Relay module) and the shape
# variables n, c, h, w, oc, ic, kh, kw are defined earlier in the script.
# NOTE(review): the leading 1 in the tuple is presumably the lowering phase
# at which ``tensorizer.rewrite`` is inserted -- confirm against the TVM docs.
passes = [(1, tensorizer.rewrite)]
with tvm.transform.PassContext(opt_level=4,
                               trace=tracer,
                               config={'tir.add_lower_pass': passes}):
    #with tvm.transform.PassContext(opt_level=4, trace=tracer):
    #graph, lib, params = tvm.relay.build(module, target='cuda -libs=cublas,cudnn')
    graph, lib, params = tvm.relay.build(module, target='nvptx')
    # From here on ``module`` is the graph runtime, not the Relay module.
    module = runtime.create(graph, lib, tvm.gpu())

    # Random float32 input scaled by 128; only 'x' is bound, the built
    # ``params`` are not set on the runtime in this snippet.
    x_ = (np.random.randn(n, c, h, w) * 128).astype('float32')
    module.set_input('x', x_)

    # A single timed run (number=1, repeat=1).
    timer = module.module.time_evaluator('run',
                                         ctx=tvm.gpu(),
                                         number=1,
                                         repeat=1)
    timed = timer()

    # NOTE(review): presumably multiply-accumulates of a stride-1, unpadded
    # convolution divided by the mean time, in units of 1e9 per second --
    # confirm the layer configuration matches.
    print((n * oc * (h - kh + 1) * (w - kw + 1)) * (kh * kw * ic) /
          timed.mean / 1e9)
    print('%d us' % int(timed.mean * 1e6))
Example #42
0
def test_vgg():
    """Build an NHWC VGG-style network with nnvm, compile it and time one run.

    The network is assembled from three local helpers (feature extractor,
    classifier, full symbol), compiled for the ``llvm -device=nnpu`` target
    and executed once through the graph runtime's time evaluator.  The
    output tensor, both graph IRs and the scaled mean run time are printed.
    """
    def get_feature(internel_layer, layers, filters, batch_norm=False):
        """Get VGG feature body as stacks of convolutions.

        layers  : number of conv layers per stage, e.g. [1, 1, 2, 2, 2]
        filters : output channels per stage, e.g. [64, 128, 256, 512, 512]
        """
        for i, num in enumerate(layers):
            # One stage: ``num`` times (pad -> 3x3 conv -> [bn] -> relu),
            # followed by a 2x2 max pooling.
            for j in range(num):
                internel_layer = sym.pad(data=internel_layer,
                                         pad_width=((0, 0), (1, 1), (1, 1),
                                                    (0, 0)))
                internel_layer = sym.conv2d(data=internel_layer,
                                            kernel_size=(3, 3),
                                            channels=filters[i],
                                            layout='NHWC',
                                            kernel_layout='HWOI',
                                            name="conv%s_%s" % (i + 1, j + 1))
                if batch_norm:
                    internel_layer = sym.batch_norm(data=internel_layer,
                                                    axis=3,
                                                    name="bn%s_%s" %
                                                    (i + 1, j + 1))
                internel_layer = sym.relu(data=internel_layer,
                                          name="relu%s_%s" % (i + 1, j + 1))

            internel_layer = sym.max_pool2d(data=internel_layer,
                                            pool_size=(2, 2),
                                            strides=(2, 2),
                                            layout="NHWC",
                                            name="pool%s" % (i + 1))
        # Return only after every stage has been stacked.  The return used to
        # sit inside the loop above, which silently dropped all stages but
        # the first one.
        return internel_layer

    def get_classifier(input_data, num_classes):
        """Get VGG classifier layers as fc layers."""
        flatten = sym.flatten(data=input_data, name="flatten")
        fc1 = sym.dense(data=flatten, units=32, name="fc1")
        relu1 = sym.relu(data=fc1, name="relu1")
        drop1 = sym.dropout(data=relu1, rate=0.5, name="drop1")
        fc2 = sym.dense(data=drop1, units=32, name="fc2")
        relu2 = sym.relu(data=fc2, name="relu2")
        drop2 = sym.dropout(data=relu2, rate=0.5, name="drop2")
        fc3 = sym.dense(data=drop2, units=num_classes, name="fc3")
        return fc3

    def get_symbol(datas, num_classes, num_layers=11, batch_norm=False):
        """Assemble the full network: features -> classifier -> softmax.

        Parameters
        ----------
        datas : symbol
            Input symbol fed to the first convolution stage.

        num_classes : int
            Number of classification classes.

        num_layers : int
            Number of layers for the variant of vgg. Options are 11, 13, 16, 19.

        batch_norm : bool, default False
            Use batch normalization.

        Raises
        ------
        ValueError
            If ``num_layers`` is not one of the supported variants.
        """
        vgg_spec = {
            11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])
        }

        if num_layers not in vgg_spec:
            raise ValueError(
                "Invalide num_layers {}. Choices are 11, 13, 16, 19.".format(
                    num_layers))
        layers, filters = vgg_spec[num_layers]
        feature = get_feature(datas, layers, filters, batch_norm)
        classifier = get_classifier(feature, num_classes)
        symbol = sym.softmax(data=classifier, name="softmax")
        return symbol

    input_shape = (1, 224, 224, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 1
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"data": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            # The NNPU path needs the device configured and its own
            # schedule / build contexts around the regular build call.
            nnpu.set_device(nnpu.get_env(), type='SC')
            with ScheduleProcHelper():
                with nnpu.build_config():
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"data": input_shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.uniform(size=input_shape, low=-32,
                                 high=32).astype(np.float32)
        print(a_np)
        module.set_input(data=a_np)
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=num_runs,
                                              repeat=1)
        # Execute the graph (through the timer) *before* fetching the
        # output; reading the output of a graph that never ran yields
        # meaningless data.
        prof_res = ftimer()
        out = module.get_output(0, out=tvm.nd.empty((1, 16)))
        # asnumpy must be called; printing the bare attribute only shows the
        # bound method object.
        print(out.asnumpy())
        print(deploy_graph.ir())
        # NOTE(review): the factor 10 is kept from the original code; its
        # intended unit is unclear -- confirm.
        print(prof_res.mean * 10)
Example #43
0
def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None):
    """Check the compiled ONNX LRN operator against a NumPy reference.

    Parameters
    ----------
    shape : tuple of int
        4-D input shape; the reference iterates it as (n, c, h, w).

    nsize : int
        LRN window size (ONNX ``size`` attribute).

    dtype : str
        Element type of the random input and of the fetched output.

    alpha, beta, bias : float, optional
        LRN attributes.  Attributes left as ``None`` are not written into
        the ONNX node, and the reference then uses 0.0001, 0.75 and 1.0
        respectively.

    Raises
    ------
    AssertionError
        If the compiled output differs from the reference beyond
        rtol=1e-5 / atol=1e-5 on any target.
    """
    in_array = np.random.uniform(size=shape).astype(dtype)

    # Only explicitly supplied attributes go into the node, so that the
    # frontend's handling of omitted attributes is exercised as before.
    explicit_attrs = {
        name: value
        for name, value in (("alpha", alpha), ("beta", beta), ("bias", bias))
        if value is not None
    }
    node = onnx.helper.make_node('LRN',
                                 inputs=['in'],
                                 outputs=['out'],
                                 size=nsize,
                                 **explicit_attrs)

    # Values used by the reference for attributes that were omitted.
    alpha = 0.0001 if alpha is None else alpha
    beta = 0.75 if beta is None else beta
    bias = 1.0 if bias is None else bias

    graph = helper.make_graph(
        [node],
        "lrn_test",
        inputs=[
            helper.make_tensor_value_info("in", TensorProto.FLOAT, list(shape))
        ],
        outputs=[
            helper.make_tensor_value_info("out", TensorProto.FLOAT,
                                          list(shape))
        ])
    model = helper.make_model(graph, producer_name='lrn_test')

    def _get_python_lrn():
        """Reference LRN: normalise by the squared sum over a channel window."""
        num_channels = in_array.shape[1]
        # Window extends floor((size-1)/2) channels below and
        # ceil((size-1)/2) channels above the current one.
        below = int(math.floor((nsize - 1) / 2.0))
        above = int(math.ceil((nsize - 1) / 2.0))
        square_sum = np.zeros(shape).astype(dtype)
        for n, c, h, w in np.ndindex(in_array.shape):
            first = max(0, c - below)
            # Clip at the real channel count (was a hard-coded 5, which is
            # only right for 5-channel inputs).
            last = min(num_channels, c + above + 1)
            square_sum[n, c, h, w] = np.sum(in_array[n, first:last, h, w]**2)
        py_out = in_array / ((bias + (alpha / nsize) * square_sum)**beta)
        return py_out

    # The reference does not depend on the target; compute it once.
    py_out = _get_python_lrn()

    for target, ctx in ctx_list():
        new_sym, params = nnvm.frontend.from_onnx(model)

        input_name = model.graph.input[0].name
        shape_dict = {input_name: in_array.shape}
        dtype_dict = {input_name: dtype}
        graph, lib, params = nnvm.compiler.build(new_sym,
                                                 target,
                                                 shape_dict,
                                                 dtype_dict,
                                                 params=params)
        m = graph_runtime.create(graph, lib, ctx)
        # set inputs
        m.set_input(input_name, tvm.nd.array(in_array.astype(dtype)))
        m.set_input(**params)
        m.run()
        # get outputs
        tvm_out = m.get_output(0, tvm.nd.empty(shape, dtype))
        tvm.testing.assert_allclose(py_out,
                                    tvm_out.asnumpy(),
                                    rtol=1e-5,
                                    atol=1e-5)
Example #44
0
def verify_model(model_name,
                 input_data=[],
                 custom_convert_map={},
                 ctx_list=ctx_list()):
    """Compare a Relay-compiled PyTorch model against the PyTorch baseline.

    ``model_name`` is either a string (resolved through ``load_model``) or
    the model object itself, in which case ``input_data`` supplies the
    inputs as a list or as a single tensor.  The model is traced, converted
    with ``relay.frontend.from_pytorch``, built for every ``(target, ctx)``
    pair in ``ctx_list`` and each output is checked for matching shape and
    values (rtol=1e-3, atol=1e-3).
    """
    if isinstance(model_name, str):
        baseline_model, baseline_input = load_model(model_name)
    elif isinstance(input_data, list):
        baseline_model, baseline_input = model_name, input_data
    elif isinstance(input_data, torch.Tensor) or len(input_data.shape) == 0:
        baseline_model, baseline_input = model_name, [input_data]
    else:
        assert False, "Unexpected input format"

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        baseline_model = baseline_model.cuda()
        baseline_input = [tensor.cuda() for tensor in baseline_input]

    # Reference forward pass, without gradient tracking.
    with torch.no_grad():
        baseline_outputs = baseline_model(*baseline_input)

    # Normalise the reference result to a tuple of numpy arrays.
    if isinstance(baseline_outputs, tuple):
        baseline_outputs = tuple(
            result.cpu().numpy() for result in baseline_outputs)
    else:
        baseline_outputs = (baseline_outputs.float().cpu().numpy(), )

    trace = torch.jit.trace(baseline_model, baseline_input).float().eval()
    trace = trace.cuda() if use_cuda else trace.cpu()

    # Inputs are named input0, input1, ... in positional order.
    input_names = [
        "input{}".format(position)
        for position, _ in enumerate(baseline_input)
    ]
    input_shapes = [(name, tensor.shape)
                    for name, tensor in zip(input_names, baseline_input)]
    mod, params = relay.frontend.from_pytorch(trace, input_shapes,
                                              custom_convert_map)
    compiled_input = {
        name: tensor.cpu().numpy()
        for name, tensor in zip(input_names, baseline_input)
    }

    with relay.build_config(opt_level=3):
        for target, ctx in ctx_list:
            relay_graph, relay_lib, relay_params = relay.build(mod,
                                                               target=target,
                                                               params=params)
            relay_model = graph_runtime.create(relay_graph, relay_lib, ctx)
            relay_model.set_input(**relay_params)
            for name, array in compiled_input.items():
                relay_model.set_input(name, array)
            relay_model.run()

            for index, expected in enumerate(baseline_outputs):
                actual = relay_model.get_output(index).asnumpy()

                assert_shapes_match(expected, actual)
                tvm.testing.assert_allclose(expected,
                                            actual,
                                            rtol=1e-3,
                                            atol=1e-3)

    # Drop the model references, then release cached CUDA memory.
    del model_name
    del baseline_model
    torch.cuda.empty_cache()
Example #45
0
def run_tvm(data, symbol_file, num_inference_images, sym, devs, label_name):
    """Run a pre-compiled TVM model next to its MXNet original and print accuracy.

    The compiled artifacts are read from ``./compiled/<model>_deploy_*``
    where ``<model>`` is the basename of ``symbol_file`` without ``.json``.
    For every batch of ``data`` both runtimes are executed and top-1 / top-5
    hit counts are accumulated; evaluation stops after
    ``num_inference_images`` batches.  Two tab-separated summary lines
    ("Mxnet" and "Tvm") are printed at the end.

    Parameters
    ----------
    data : iterable of batches
        Batches exposing ``.data`` / ``.label`` plus ``provide_data`` and
        ``provide_label`` on the iterator itself.
    symbol_file : str
        Path of the model's symbol JSON file; only its basename is used.
    num_inference_images : int
        Number of batches to evaluate before stopping.
    sym, devs, label_name
        Forwarded to ``mx.mod.Module`` as symbol, context and label name.
    """
    debug = False
    import tvm
    from tvm.contrib import graph_runtime
    from tvm.contrib.debugger import debug_runtime as debug_runtime

    model_name = symbol_file.split('/')[-1].replace('.json', '')
    base = './compiled/' + model_name

    path_lib = base + '_deploy_lib.tar'
    path_graph = base + '_deploy_graph.json'
    path_params = base + '_deploy_params.params'

    # Context managers make sure the artifact files are closed again.
    with open(path_graph) as graph_file:
        graph = graph_file.read()
    lib = tvm.runtime.load_module(path_lib)
    with open(path_params, 'rb') as params_file:
        params = bytearray(params_file.read())

    if debug:
        rt_mod = debug_runtime.create(graph, lib, ctx=tvm.cpu(0))
        mod = mx.mod.Module(symbol=sym, context=devs)
        mod.bind(for_training=False, data_shapes=data.provide_data)
    else:
        rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
        mod = mx.mod.Module(symbol=sym,
                            context=devs,
                            label_names=[
                                label_name,
                            ])
        mod.bind(for_training=False,
                 data_shapes=data.provide_data,
                 label_shapes=data.provide_label)

    rt_mod.load_params(params)
    # NOTE(review): arg_params / aux_params are not defined in this
    # function; they have to exist at module scope -- confirm.
    mod.set_params(arg_params, aux_params)

    counter = 0
    top_1_raw = 0
    top_5_raw = 0
    top_1_raw_mxnet = 0
    top_5_raw_mxnet = 0
    if debug:
        data = advance_data_iter(data, 0)
    for batch in data:
        # Get the original label.
        correct_label = int(batch.label[0].asnumpy()[0])

        rt_mod.set_input('data', batch.data[0].asnumpy())
        rt_mod.run()
        if debug:
            # Debug mode dumps every intermediate tensor of the first batch
            # and stops; the comparison block further down is therefore not
            # reached while this early return is in place.
            np.set_printoptions(suppress=False)
            for i in rt_mod.debug_datum.get_output_tensors().keys():
                print(i, rt_mod.debug_get_output(i))
            return
        tvm_res = rt_mod.get_output(0).asnumpy()

        mod.forward(batch, is_train=False)
        mxnet_res = mod.get_outputs()[0].asnumpy()

        if debug:
            print("######## MxNet ###########")
            print(mxnet_res[0][0])
            print("######## TVM ###########")
            print(tvm_res[0][0])
            for _ in range(9):
                print("############################")
            print("######## MxNet ###########")
            print(mxnet_res)
            print("######## TVM ###########")
            print(tvm_res)
            np.testing.assert_allclose(mxnet_res.astype('int32'),
                                       tvm_res.astype('int32'),
                                       atol=0,
                                       verbose=True)
            try:
                np.testing.assert_allclose(mxnet_res.astype('int32'),
                                           tvm_res.astype('int32'),
                                           atol=0,
                                           verbose=True)
            except AssertionError:
                # Only a failed comparison triggers the relaxed retry; any
                # other error (e.g. KeyboardInterrupt) propagates.
                np.testing.assert_allclose(mxnet_res.astype('int32'),
                                           tvm_res.astype('int32'),
                                           atol=1,
                                           verbose=True)
        else:
            # Indices of the five highest scores, best first.
            tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1]
            mxnet_pred = np.squeeze(mxnet_res).argsort()[-5:][::-1]

            if correct_label == tvm_pred[0]:
                top_1_raw += 1
                top_5_raw += 1
            elif correct_label in tvm_pred:
                top_5_raw += 1

            if correct_label == mxnet_pred[0]:
                top_1_raw_mxnet += 1
                top_5_raw_mxnet += 1
            elif correct_label in mxnet_pred:
                top_5_raw_mxnet += 1

        counter += 1
        if counter == num_inference_images:
            break

    # Avoid ZeroDivisionError when the iterator yielded no batch at all; the
    # hit counters are zero in that case, so both accuracies print as 0.0.
    total = float(counter) if counter else 1.0

    top_1 = float(top_1_raw_mxnet) / total
    top_5 = float(top_5_raw_mxnet) / total
    print("Mxnet", model_name, top_1, top_5, sep='\t')

    top_1 = float(top_1_raw) / total
    top_5 = float(top_5_raw) / total
    print("Tvm", model_name, top_1, top_5, sep='\t')
Example #46
0
def test_tflite_anistropic_strides():
    """Check qnn.conv2d with strides (1, 3) against hand-written golden values.

    A 1x1x3x6 uint8 input and a 1x1x2x2 uint8 kernel (zero points 127,
    scales 1.0) are compiled for llvm while ``legalize_qnn_conv2d`` is
    installed as the ``FTVMQnnLegalize`` attribute of ``qnn.conv2d``; the
    result must equal the expected 1x1x2x2 output exactly.
    """
    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):

        # uint8 input
        data_shape, data_dtype = (1, 1, 3, 6), "uint8"
        kernel_shape, kernel_dtype = (1, 1, 2, 2), "uint8"
        conv_config = dict(
            data_shape=data_shape,
            data_dtype=data_dtype,
            kernel_shape=kernel_shape,
            kernel_dtype=kernel_dtype,
            input_zero_point=127,
            kernel_zero_point=127,
            input_scale=1.0,
            kernel_scale=1.0,
            kernel_size=(2, 2),
            padding=(0, 0),
            strides=(1, 3),
            dilation=(1, 1),
            data_layout="NCHW",
            kernel_layout="OIHW",
            out_dtype="int32",
        )
        ref_func, qnn_func = get_funcs(**conv_config)

        # Input values, one row of six per line (3 rows in total).
        input_values = [
            133, 131, 129, 125, 123, 121,
            135, 133, 131, 123, 121, 119,
            137, 135, 133, 121, 119, 117,
        ]
        golden_data = np.array(input_values).reshape(data_shape).astype(
            "uint8")
        golden_weight = np.array([129, 131, 133, 135]).reshape(
            kernel_shape).astype("uint8")

        with tvm.transform.PassContext(opt_level=2):
            graph, lib, params = relay.build(qnn_func,
                                             "llvm",
                                             params={"kernel": golden_weight})
            executor = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
            executor.set_input("data", golden_data)
            executor.set_input(**params)
            executor.run()
            qnn_output = executor.get_output(0).asnumpy()
        expected = np.array([124, -92, 164, -132]).reshape(1, 1, 2, 2)
        np.testing.assert_equal(qnn_output, expected)
Example #47
0
    # The following is my environment, change this to the IP address of your target device
    host = '127.0.0.1'
    port = 9090
    remote = rpc.connect(host, port)

# Upload the compiled library to the remote device and load it there.
path = "deploy_lib.tar"
remote.upload(path)
remote_lib = remote.load_module(path)

ctx = remote.gpu()

# load the module back.
# Context managers close the artifact files as soon as they are read.
with open("deploy_graph.json") as graph_file:
    loaded_graph = graph_file.read()
with open("deploy_param.params", "rb") as params_file:
    loaded_params = bytearray(params_file.read())

module = runtime.create(loaded_graph, remote_lib, ctx)

# set parameter (upload params to the remote device. This may take a while)
input_name = 'input_1'
input_data = tvm.nd.array(x.astype(dtype))

# module.set_input(**loaded_params)
# module.set_input(input_name, tvm.nd.array(x.astype(dtype)))
# module.run()

module.load_params(loaded_params)
# key = input_name, value = array; reuse the array built above instead of
# converting ``x`` a second time.
module.set_input(input_name, input_data)
module.run()

# get output
def tune_and_evaluate(tuning_opt):
    """Extract conv2d tuning tasks, tune them, then build and time the model.

    ``tuning_opt`` is forwarded as keyword arguments to ``tune_tasks``.
    After tuning, the network is compiled with the best records from
    ``log_file``, uploaded through the RPC session and executed ten times;
    the mean and standard deviation of the run time are printed in
    milliseconds.
    """
    if env.TARGET == "sim":
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()
    else:
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_model()
    tasks = autotvm.task.extract_from_program(func=relay_prog,
                                              params=params,
                                              ops=(tvm.relay.op.nn.conv2d, ),
                                              target=target,
                                              target_host=env.target_host)

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for task in tasks:
        print("\t{}".format(task))

    # Tuning is always executed here (there is no early return).
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            build_kwargs = dict(target=target,
                                params=params,
                                target_host=env.target_host)
            if target.device_name == "vta":
                # VTA additionally needs its own build configuration.
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     **build_kwargs)
            else:
                graph, lib, params = relay.build(relay_prog, **build_kwargs)

        # Export library
        print("Upload...")
        workdir = util.tempdir()
        object_path = workdir.relpath("graphlib.o")
        lib.save(object_path)
        remote.upload(object_path)
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        if device == "vta":
            ctx = remote.ext_dev(0)
        else:
            ctx = remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        random_image = np.random.uniform(size=(1, 3, 224, 224))
        image = tvm.nd.array(random_image.astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        mean_ms = np.mean(prof_res)
        std_ms = np.std(prof_res)
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (mean_ms, std_ms))
    batchsize = 1
    total_time_ms = 0
    global_step = 0

    config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=28,
                                      inter_op_parallelism_threads=1)

    # load the module back.
    path_lib = "./export/deploy_lib.tar"
    loaded_json = open("./export/deploy_graph.json").read()
    loaded_lib = tvm.runtime.load_module(path_lib)
    loaded_params = bytearray(
        open("./export/deploy_param.params", "rb").read())

    ctx = tvm.cpu()
    module = graph_runtime.create(loaded_json, loaded_lib, ctx)
    module.load_params(loaded_params)

    with tf.compat.v1.Session(config=config) as sess:
        # saver = tf.train.import_meta_graph(os.path.join(base_path,"train_data/checkPoint/trainModel.meta"))
        # saver.restore(sess, tf.train.latest_checkpoint(os.path.join(base_path,"train_data/checkPoint")))

        with gfile.FastGFile(
                os.path.join(base_path, "pb_models") + '/freeze_fp32.pb',
                'rb') as f:
            graph_def = tf.compat.v1.GraphDef()
            graph_def.ParseFromString(f.read())
            for node in graph_def.node:
                print("node name is: {} \t node op is: {}".format(
                    node.name, node.op))
            sess.graph.as_default()
Example #50
0
def graph_to_function(graph, target, ctx, shape=None, dtype=None):
    """Convert a graph to a function taking a keyword args and returning a list of results
    (both args and results are numpy arrays).

    Example::

        fun = graph_to_function(graph, llvm, cpu(0))
        [res1, res2] = fun(x=np.zeros((1,2)), y=np.zeros((1,)))

    Parameters
    ----------
    graph : nnvm.graph.Graph
        A graph we want to convert to a function.

    target : str or :any:`tvm.target.Target`
        The build target

    ctx : TVMContext
        The context to deploy the module.

    shape : Dict[str, Tuple[int]], optional
        A dict mapping input variable names to shapes.
        By default shapes will be inferred from variables' attributes.
        Note that this parameter takes precedence over variables' attributes.

    dtype : Dict[str, str] or str, optional
        A dict mapping input variable names to dtypes, or just a single dtype.
        By default dtypes will be inferred from variables' attributes.
        Note that this parameter takes precedence over variables' attributes.

    Returns
    -------
    function : Callable[..., List[numpy.ndarray]]

    Raises
    ------
    ValueError
        If, after inference, some input variable still has no dtype or no
        shape.
    """
    # Infer missing shapes and dtypes
    graph, shape, dtype, output_shapes, output_dtypes = \
        infer_shapes_dtypes(graph, shape=shape, dtype=dtype)

    if None in dtype.values():
        raise ValueError("Input variables with no type: {}".format(dtype))

    if not all(shape.values()):
        raise ValueError("Input variables with no shape: {}".format(shape))

    compute_graph, lib, params = nnvm.compiler.build(graph,
                                                     target,
                                                     shape=shape,
                                                     dtype=dtype)
    module = graph_runtime.create(compute_graph, lib, ctx)

    if params:
        # The graph runtime method is ``set_input``; the previous
        # ``set_inputs`` spelling raised AttributeError whenever the build
        # produced parameters.
        module.set_input(**params)

    def run(**kwargs):
        """Execute the compiled graph with ``kwargs`` as named inputs.

        Returns one numpy array per graph output, in output order.
        """
        module.run(**kwargs)
        res = []
        for i, (o_shape,
                o_dtype) in enumerate(zip(output_shapes, output_dtypes)):
            res.append(
                module.get_output(i, tvm.nd.empty(o_shape, o_dtype)).asnumpy())
        return res

    return run
# With RPC, you can deploy the model remotely from your host machine
# to the remote device.

# Obtain an RPC session: a local in-process session for the demo, otherwise
# a TCP connection to the RPC server running on the device.
if local_demo:
    remote = rpc.LocalSession()
else:
    # The following is my environment, change this to the IP address of your target device
    host = '10.77.1.162'
    port = 9090
    remote = rpc.connect(host, port)

# upload the library to remote device and load it
# NOTE(review): load_module uses the literal name 'net.tar'; presumably
# lib_fname points at a file with that basename -- confirm where lib_fname
# is assigned.
remote.upload(lib_fname)
rlib = remote.load_module('net.tar')

# create the remote runtime module
ctx = remote.cpu(0)
module = runtime.create(graph, rlib, ctx)
# set parameter (upload params to the remote device. This may take a while)
module.set_input(**params)
# set input data (converted to float32 first)
module.set_input('data', tvm.nd.array(x.astype('float32')))
# run
module.run()
# get output
out = module.get_output(0)
# get top1 result: index of the largest output value, mapped to its label
# through ``synset``
top1 = np.argmax(out.asnumpy())
print('TVM prediction top-1: {}'.format(synset[top1]))
Example #52
0
        lib_name = "main.so"
    elif platform.system() == "Windows":
        lib_name = "main.dll"
    else:
        raise Exception("unknown system " + platform.system())

    print("export_library main lib")
    lib.export_library(lib_name)

    # or save object file for deploy usage
    # lib.save(os.path.join(work_root, binary_dir, 'model.o'))

    print("load main lib")
    sysLib = tvm.runtime.load_module(lib_name)

    ctx = tvm.cpu(0)

    input_data = np.random.random(dshape).astype(np.float32)

    for fk in ret_mods:
        mg = ret_mods[fk].get_json()
        mp = ret_mods[fk].get_params()
        print("test " + fk + "   ------------------------------------")
        module = graph_runtime.create(mg, sysLib, ctx)
        module.load_params(relay.save_param_dict(mp))
        module.set_input("data", tvm.nd.array(input_data))
        module.run()
        num_output = module.get_num_outputs()
        for idx in range(num_output):
            print(module.get_output(idx).shape)
Example #53
0
    fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())

######################################################################
# Deploy locally to Nvidia GPU
# ------------------------------
# Now we can load the module back.

import numpy as np
from tvm.contrib import graph_runtime

loaded_lib = tvm.module.load(path_lib)
# Read the serialized graph and parameters; the context managers close the
# files as soon as their contents are in memory.
with open(temp.relpath("deploy_graph.json")) as graph_file:
    loaded_json = graph_file.read()
with open(temp.relpath("deploy_param.params"), "rb") as params_file:
    loaded_params = bytearray(params_file.read())
module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
module.load_params(loaded_params)

# Run once on random float32 input of the model's input shape.
input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
module.run(data=input_data)
out = module.get_output(0, out=tvm.nd.empty(out_shape))
# Print first 10 elements of output
print(out.asnumpy()[0][0:10])

######################################################################
# Compile and Deploy the Model to Raspberry Pi Remotely with RPC
# --------------------------------------------------------------
# Following the steps above, we can also compile the model for Raspberry Pi.
# TVM provides rpc module to help with remote deploying.
#
# For demonstration, we simply start an RPC server on the same machine,
Example #54
0
def test_tensorrt_image_classification_models():
    """Compare TensorRT-accelerated builds with plain CUDA builds.

    For every workload in ``workload_dict`` the network is built twice for
    the CUDA target: once as a baseline and once with
    ``ext_accel='tensorrt'`` forwarded to ``nnvm.compiler.build_config``.
    Both are run on the same random input and their outputs must agree to
    5 decimals.
    """
    def compile_model(graph,
                      params,
                      data_shapes,
                      subgraph_backend=None,
                      op_names=None,
                      **kwargs):
        """Build ``graph`` for CUDA and return (graph, lib, params, out_shape).

        When both ``subgraph_backend`` and ``op_names`` are given the graph
        is partitioned explicitly and ``kwargs`` are not passed on to
        ``build_config``; otherwise ``kwargs`` become build_config flags.
        """
        _, output_shapes = nnvm.compiler.graph_util.infer_shape(
            graph, **data_shapes)
        assert len(output_shapes) == 1
        flags = kwargs
        if subgraph_backend is not None and op_names is not None:
            graph = nnvm.subgraph._partition(graph, subgraph_backend, op_names)
            flags = {}
        target = tvm.target.cuda()
        with nnvm.compiler.build_config(opt_level=3, **flags):
            graph, lib, params = nnvm.compiler.build(graph,
                                                     target,
                                                     shape=data_shapes,
                                                     params=params)
        return graph, lib, params, output_shapes[0]

    def get_output(module, data, params, output_shape):
        """Run ``module`` on ``data`` / ``params`` and return output 0 as numpy.

        ``output_shape`` is accepted for call compatibility but not needed:
        the runtime allocates the output array itself.  (Two unreachable
        statements that followed the return were removed.)
        """
        module.set_input("data", data)
        module.set_input(**params)
        module.run()
        return module.get_output(0).asnumpy()

    def copy_params(params):
        """Return a new dict with every value wrapped in a fresh tvm.nd.array."""
        new_params = {}
        for k, v in params.items():
            new_params[k] = tvm.nd.array(v)
        return new_params

    def check_trt_model(baseline_module,
                        baseline_params,
                        graph,
                        params,
                        data_shape,
                        subgraph_backend=None,
                        op_names=None,
                        **kwargs):
        """Build the TensorRT variant and assert it matches the baseline."""
        trt_graph, trt_lib, trt_params, output_shape = compile_model(
            graph, params, {'data': data_shape}, subgraph_backend, op_names,
            **kwargs)
        data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
        baseline_out = get_output(baseline_module, data, baseline_params,
                                  output_shape)
        trt_module = graph_runtime.create(trt_graph, trt_lib, tvm.gpu())
        trt_out = get_output(trt_module, data, trt_params, output_shape)
        np.testing.assert_almost_equal(baseline_out, trt_out, decimal=5)

    workload_dict = {
        'resnet': nnvm.testing.resnet.get_workload,
        'inception_v3': nnvm.testing.inception_v3.get_workload,
        'mobilenet': nnvm.testing.mobilenet.get_workload,
        'mobilenet_v2': nnvm.testing.mobilenet_v2.get_workload,
        'squeezenet': nnvm.testing.squeezenet.get_workload,
        'vgg': nnvm.testing.vgg.get_workload,
        'densenet': nnvm.testing.densenet.get_workload
    }
    for model_name, get_workload in workload_dict.items():
        # Let logging do the formatting lazily; the message is unchanged.
        logging.info('Testing TensorRT for model %s', model_name)
        flags = {
            'batch_size': 1,
            'image_shape': (3, 224, 224),
            'num_classes': 100
        }
        if model_name == 'inception_v3':
            flags['image_shape'] = (3, 299, 299)
        if model_name.startswith('resnet'):
            flags['num_layers'] = 18
        # data_shape is derived before image_shape is possibly dropped below.
        data_shape = (flags['batch_size'], ) + flags['image_shape']
        if model_name == 'mobilenet_v2' or model_name == 'densenet':
            # image_shape is not passed to these two workload getters.
            flags.pop('image_shape')
        net, params = get_workload(**flags)
        graph_json_str = nnvm.graph.create(net).json()
        with nnvm.compiler.build_config(opt_level=3):
            baseline_graph, baseline_lib, baseline_params = nnvm.compiler.build(
                nnvm.graph.load_json(graph_json_str),
                tvm.target.cuda(),
                shape={'data': data_shape},
                params=copy_params(params))
        baseline_module = graph_runtime.create(baseline_graph, baseline_lib,
                                               tvm.gpu())

        # test whole graph run using tensorrt, nnvm.compiler.build_config has graph partitioning turned on
        check_trt_model(baseline_module,
                        baseline_params,
                        nnvm.graph.load_json(graph_json_str),
                        copy_params(params),
                        data_shape,
                        ext_accel='tensorrt')
Example #55
0
def run_e2e(graph):
    """Run the end-to-end classification example over every entry of ``files``.

    A graph runtime is created for ``graph`` and each image is classified
    with it.  Top-1/Top-5 predictions are printed and, when ground-truth
    labels are available in ``val``, running accuracy is tracked.  After each
    image the ``timers`` dict is dumped to ``tmp-ck-timer.json`` and
    ``aggregate-ck-timer.json``.

    Besides its argument the function reads these names from the enclosing
    scope: ``debug_fpga_only``, ``lib``, ``ctx``, ``remote``, ``params``,
    ``dtype``, ``files``, ``val``, ``synset``, ``timers`` and ``STAT_REPEAT``.
    ``files`` is shuffled in place and ``timers`` is updated in place.

    Parameters
    ----------
    graph
        The compiled graph handed to ``graph_runtime.create``.  When
        ``debug_fpga_only`` is truthy it is first passed through ``mark_nop``.
    """
    import json
    import random

    if debug_fpga_only:
        graph = mark_nop(graph, skip_conv_layer=(0, ))
    dt = time.time()
    m = graph_runtime.create(graph, lib, ctx)
    timers['execution_time_create_run_time_graph'] = (time.time() - dt)

    total_images = 0
    correct_images_top1 = 0
    correct_images_top5 = 0

    # Shuffle the files and pre-read the aggregated accuracy counters so that
    # a restarted run (e.g. after the FPGA board hangs) keeps aggregating
    # over random images instead of starting from scratch.
    random.shuffle(files)

    if len(files) > 1 and os.path.isfile('aggregate-ck-timer.json'):
        # Context manager so the handle is closed right after parsing.
        with open('aggregate-ck-timer.json') as fprevious:
            previous = json.load(fprevious)

        total_images = previous.get('total_images', total_images)
        correct_images_top1 = previous.get('correct_images_top1',
                                           correct_images_top1)
        correct_images_top5 = previous.get('correct_images_top5',
                                           correct_images_top5)

    dt1 = time.time()
    for f in files:
        total_images += 1

        print(
            '==============================================================================='
        )
        print('Image ' + str(total_images) + ' of ' + str(len(files)) + ' : ' +
              f)

        image = Image.open(os.path.join(f)).resize((224, 224))
        if image.mode != 'RGB':
            image = image.convert('RGB')
        img = transform_image(image)

        # set inputs
        m.set_input('data', tvm.nd.array(img.astype("float32")))
        m.set_input(**params)

        # execute
        print('')
        print("run (" + str(STAT_REPEAT) + " statistical repetitions)")
        dt = time.time()
        timer = m.module.time_evaluator("run", ctx, number=STAT_REPEAT)
        tcost = timer()
        timers['execution_time_classify'] = (time.time() - dt) / STAT_REPEAT

        # get outputs; convert to numpy once and reuse the array below
        tvm_output = m.get_output(0,
                                  tvm.nd.empty((1000, ), dtype, remote.cpu(0)))
        scores = tvm_output.asnumpy()

        top1 = np.argmax(scores)
        atop5 = get_top5(scores)

        print('')
        print('TVM prediction Top1:', top1, synset[top1])

        print('')
        print('TVM prediction Top5:')
        top5 = []
        for entry in atop5:
            # The class index is read from position 1 of each entry.
            label = entry[1]
            top5.append(label)
            print(label, synset[label])

        print('')
        print("Internal T-cost: %g" % tcost.mean)

        # Check correctness if ground-truth labels are available
        if val:
            top = val[os.path.basename(f)]

            correct_top1 = (top == top1)
            if correct_top1:
                correct_images_top1 += 1

            print('')
            if correct_top1:
                print('Current prediction Top1: CORRECT')
            else:
                print('Current prediction Top1: INCORRECT +(' + str(top) + ')')

            accuracy_top1 = float(correct_images_top1) / float(total_images)
            print('Current accuracy Top1:   ' + ('%.5f' % accuracy_top1))

            correct_top5 = top in top5
            if correct_top5:
                correct_images_top5 += 1

            print('')
            if correct_top5:
                print('Current prediction Top5: CORRECT')
            else:
                print('Current prediction Top5: INCORRECT +(' + str(top) + ')')

            accuracy_top5 = float(correct_images_top5) / float(total_images)
            print('Current accuracy Top5:   ' + ('%.5f' % accuracy_top5))

            print('')
            print('Total elapsed time: ' + ('%.1f' % (time.time() - dt1)) +
                  ' sec.')

            timers['total_images'] = total_images
            timers['correct_images_top1'] = correct_images_top1
            timers['accuracy_top1'] = accuracy_top1
            timers['correct_images_top5'] = correct_images_top5
            timers['accuracy_top5'] = accuracy_top5

        timers['execution_time_classify_internal'] = tcost.mean
        timers['execution_time'] = tcost.mean

        # Persist after every image so progress survives an aborted run.
        with open('tmp-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        with open('aggregate-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        sys.stdout.flush()
Example #56
0
def run_case(dtype, image, target):
    """Compile an MXNet model-zoo network with NNVM and classify images.

    The model named by the ``MXNET_MODEL_ID`` environment variable is loaded
    (weights rooted at ``CK_ENV_MODEL_MXNET``), a softmax is appended, and the
    network is built for ``target``.  Each input image is then classified;
    Top-1/Top-5 predictions are printed, and accuracy is tracked when labels
    are available.  After every image the collected timers are written to
    ``tmp-ck-timer.json`` and ``aggregate-ck-timer.json``.

    Parameters
    ----------
    dtype : str
        Data type used for parameters, inputs and outputs.  Anything other
        than ``'float32'`` triggers a parameter conversion and a lower
        optimisation level.
    image : str or None
        Path to a single image.  When ``None`` or empty, every file whose
        lower-cased name starts with ``ilsvrc2012_val_`` in the directory
        named by ``CK_ENV_DATASET_IMAGENET_VAL`` is used, with labels read
        from the file named by ``CK_CAFFE_IMAGENET_VAL_TXT``.
    target : str or None
        ``None`` or ``'cpu'`` builds for ``llvm`` and runs on ``tvm.cpu(0)``;
        ``'cuda'`` builds for ``cuda`` and runs on ``tvm.gpu(0)``.

    Raises
    ------
    ValueError
        If ``target`` is not one of the supported values.
    SystemExit
        If the ImageNet directory is unset or missing (exit status 1).

    Notes
    -----
    ``data_shape`` and the helpers ``transform_image`` / ``get_top5`` are
    read from the enclosing module.
    """
    import os
    import json
    import random
    import sys
    import time

    # Unset and empty both fall back to the default of 10 repetitions.
    STAT_REPEAT = int(os.environ.get('STAT_REPEAT') or 10)

    # FGG: set model files via CK env
    CATEG_FILE = '../synset.txt'
    # NOTE(review): eval() executes whatever the synset file contains; this is
    # only acceptable for a trusted file. If the file is a plain literal,
    # ast.literal_eval would be a safer choice -- confirm before switching.
    with open(os.path.join(CATEG_FILE)) as fsynset:
        synset = eval(fsynset.read())

    files = []
    val = {}

    if image is not None and image != '':
        files = [image]
    else:
        ipath = os.environ.get('CK_ENV_DATASET_IMAGENET_VAL', '')
        if ipath == '':
            print('Error: path to ImageNet dataset is not set!')
            sys.exit(1)
        if not os.path.isdir(ipath):
            print('Error: path to ImageNet dataset was not found!')
            sys.exit(1)

        # get all validation files, in sorted order
        files = sorted(
            os.path.join(ipath, name) for name in os.listdir(ipath)
            if name.lower().startswith('ilsvrc2012_val_'))

        # A whole-dataset run uses one timed repetition per image.
        STAT_REPEAT = 1

        # Get correct labels: each non-empty line is "<file name> <label>"
        ival = os.environ.get('CK_CAFFE_IMAGENET_VAL_TXT', '')
        with open(ival) as flabels:
            label_lines = flabels.read().split('\n')

        for line in label_lines:
            line = line.strip()
            if line != '':
                fields = line.split(' ')
                val[fields[0]] = int(fields[1])

    # FGG: set timers
    timers = {}

    # Load and transform the first image once, only to time those two steps.
    dt = time.time()
    image = Image.open(os.path.join(files[0])).resize((224, 224))
    if image.mode != 'RGB':
        image = image.convert('RGB')
    timers['execution_time_load_image'] = time.time() - dt

    dt = time.time()
    img = transform_image(image)
    timers['execution_time_transform_image'] = time.time() - dt

    # load model
    from mxnet.gluon.model_zoo.vision import get_model

    model_path = os.environ['CK_ENV_MODEL_MXNET']
    model_id = os.environ['MXNET_MODEL_ID']
    block = get_model(model_id, pretrained=True, root=model_path)

    # We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon
    net, params = nnvm.frontend.from_mxnet(block)
    # we want a probability so add a softmax operator
    net = nnvm.sym.softmax(net)

    # convert to wanted dtype (https://github.com/merrymercy/tvm-mali/issues/3)
    if dtype != 'float32':
        params = {
            k: tvm.nd.array(v.asnumpy().astype(dtype))
            for k, v in params.items()
        }

    # Resolve build target and execution context together, and fail with a
    # clear error instead of leaving them unbound for an unknown target.
    if target is None or target == 'cpu':
        xtarget = 'llvm'
        ctx = tvm.cpu(0)
    elif target == 'cuda':
        xtarget = 'cuda'
        ctx = tvm.gpu(0)
    else:
        raise ValueError('unsupported target: {!r}'.format(target))

    # compile
    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(net,
                                                 target=xtarget,
                                                 shape={"data": data_shape},
                                                 params=params,
                                                 dtype=dtype,
                                                 target_host=None)

    # export the compiled library into a temporary directory
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    # create graph runtime
    dt = time.time()
    module = runtime.create(graph, lib, ctx)
    module.set_input(
        'data',
        tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**params)
    timers['execution_time_create_run_time_graph'] = (time.time() - dt)

    total_images = 0
    correct_images_top1 = 0
    correct_images_top5 = 0

    # Shuffle the files and pre-read the aggregated accuracy counters so that
    # a restarted run keeps aggregating over random images.
    random.shuffle(files)

    if len(files) > 1 and os.path.isfile('aggregate-ck-timer.json'):
        with open('aggregate-ck-timer.json') as fprevious:
            previous = json.load(fprevious)

        total_images = previous.get('total_images', total_images)
        correct_images_top1 = previous.get('correct_images_top1',
                                           correct_images_top1)
        correct_images_top5 = previous.get('correct_images_top5',
                                           correct_images_top5)

    dt1 = time.time()
    for f in files:
        total_images += 1

        print(
            '==============================================================================='
        )
        print('Image ' + str(total_images) + ' of ' + str(len(files)) + ' : ' +
              f)

        image = Image.open(os.path.join(f)).resize((224, 224))
        if image.mode != 'RGB':
            image = image.convert('RGB')
        img = transform_image(image)

        # set inputs
        module.set_input('data', tvm.nd.array(img.astype(dtype)))
        module.set_input(**params)

        # perform one warm up run
        warm_up_timer = module.module.time_evaluator("run", ctx, 1)
        warm_up_timer()

        # execute
        print('')
        print("run (" + str(STAT_REPEAT) + " statistical repetitions)")
        dt = time.time()
        timer = module.module.time_evaluator("run", ctx, number=STAT_REPEAT)
        tcost = timer()
        timers['execution_time_classify'] = (time.time() - dt) / STAT_REPEAT

        # get outputs; convert to numpy once and reuse the array below
        tvm_output = module.get_output(0, tvm.nd.empty((1000, ), dtype, ctx))
        scores = tvm_output.asnumpy()

        top1 = np.argmax(scores)
        atop5 = get_top5(scores)

        print('')
        print('TVM prediction Top1:', top1, synset[top1])

        print('')
        print('TVM prediction Top5:')
        top5 = []
        for entry in atop5:
            # The class index is read from position 1 of each entry.
            label = entry[1]
            top5.append(label)
            print(label, synset[label])

        print('')
        print("Internal T-cost: %g" % tcost.mean)

        # Check correctness if ground-truth labels are available
        if val:
            top = val[os.path.basename(f)]

            correct_top1 = (top == top1)
            if correct_top1:
                correct_images_top1 += 1

            print('')
            if correct_top1:
                print('Current prediction Top1: CORRECT')
            else:
                print('Current prediction Top1: INCORRECT +(' + str(top) + ')')

            accuracy_top1 = float(correct_images_top1) / float(total_images)
            print('Current accuracy Top1:   ' + ('%.5f' % accuracy_top1))

            correct_top5 = top in top5
            if correct_top5:
                correct_images_top5 += 1

            print('')
            if correct_top5:
                print('Current prediction Top5: CORRECT')
            else:
                print('Current prediction Top5: INCORRECT +(' + str(top) + ')')

            accuracy_top5 = float(correct_images_top5) / float(total_images)
            print('Current accuracy Top5:   ' + ('%.5f' % accuracy_top5))

            print('')
            print('Total elapsed time: ' + ('%.1f' % (time.time() - dt1)) +
                  ' sec.')

            timers['total_images'] = total_images
            timers['correct_images_top1'] = correct_images_top1
            timers['accuracy_top1'] = accuracy_top1
            timers['correct_images_top5'] = correct_images_top5
            timers['accuracy_top5'] = accuracy_top5

        timers['execution_time_classify_internal'] = tcost.mean
        timers['execution_time'] = tcost.mean

        # Persist after every image so progress survives an aborted run.
        with open('tmp-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        with open('aggregate-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        sys.stdout.flush()
Example #57
0
def run_unpropagatable_graph(dev, tgt):
    R"""Build, rewrite and run a four-input graph with mixed device annotations.

    The network is:

            a     b  c     d
             \   /    \   /
              add      mul
                \      /
                subtract

    ``add`` and ``subtract`` are annotated for the ``dev`` context and ``mul``
    for the CPU context.  The rewritten function is compared against a
    hand-built one that copies the ``mul`` result from CPU to ``dev``, then
    the graph is built with a CPU fallback device, executed, and its output
    compared with a NumPy reference.

    Parameters
    ----------
    dev
        Device name passed to ``tvm.context`` and used as a key of the
        build-target dict.
    tgt
        Build target associated with ``dev``.
    """
    shape = (10, 10)
    a, b, c, d = (relay.var(name, shape=shape) for name in ("a", "b", "c", "d"))

    # Draw the inputs in a, b, c, d order.
    a_data = np.random.rand(*shape).astype('float32')
    b_data = np.random.rand(*shape).astype('float32')
    c_data = np.random.rand(*shape).astype('float32')
    d_data = np.random.rand(*shape).astype('float32')
    # NumPy reference result: (a + b) - (c * d)
    ref_res = np.subtract(a_data + b_data, np.multiply(c_data, d_data))

    cpu_ctx = tvm.context("cpu")
    dev_ctx = tvm.context(dev)
    fallback_device = cpu_ctx
    target = {"cpu": "llvm", dev: tgt}

    def annotated():
        # Annotate each op with its device, then let the pass rewrite them.
        on_device = relay.annotation.on_device
        add_on_dev = on_device(relay.add(a, b), dev_ctx)
        mul_on_cpu = on_device(relay.multiply(c, d), cpu_ctx)
        sub_on_dev = on_device(relay.subtract(add_on_dev, mul_on_cpu), dev_ctx)
        unrewritten = relay.Function([a, b, c, d], sub_on_dev)
        rewrite_pass = transform.RewriteAnnotatedOps(dev_ctx.device_type)
        return run_opt_pass(unrewritten, rewrite_pass)

    def expected():
        # Same computation with an explicit CPU -> dev copy of the product.
        total = relay.add(a, b)
        product = relay.multiply(c, d)
        product_on_dev = relay.device_copy(product, cpu_ctx, dev_ctx)
        difference = relay.subtract(total, product_on_dev)
        return relay.Function([a, b, c, d], difference)

    annotated_func = annotated()
    expected_func = expected()
    expected_index = [2, 2, 2, 1, 1, 1, 2, 2]
    check_annotated_graph(annotated_func, expected_func)

    params = {"a": a_data, "b": b_data, "c": c_data, "d": d_data}
    pass_config = {"relay.fallback_device_type": fallback_device.device_type}
    with tvm.transform.PassContext(opt_level=0, config=pass_config):
        graph, lib, params = relay.build(annotated_func, target, params=params)
        contexts = [tvm.cpu(0), tvm.context(dev)]

        # The per-node device assignment is only checked when it is emitted.
        graph_attrs = json.loads(graph)["attrs"]
        if "device_index" in graph_attrs:
            assert graph_attrs["device_index"][1] == expected_index

        mod = graph_runtime.create(graph, lib, contexts)
        mod.set_input(**params)
        mod.run()
        res = mod.get_output(0).asnumpy()
        tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5)
Example #58
0
# Open the file named by img_name and resize it to 224x224 pixels.
image = Image.open(img_name).resize((224, 224))

def transform_image(image):
    """Normalise an image and lay it out as a single-item channel-first batch.

    The input is converted to an array, the per-channel offsets
    ``[123., 117., 104.]`` are subtracted and the result is divided by the
    per-channel scales ``[58.395, 57.12, 57.375]`` (both broadcast over the
    last axis).  Axes are then reordered with ``(2, 0, 1)`` and a leading
    axis of length one is added.  The input itself is not modified.
    """
    channel_offset = np.array([123., 117., 104.])
    channel_scale = np.array([58.395, 57.12, 57.375])

    normalised = (np.array(image) - channel_offset) / channel_scale
    channel_first = normalised.transpose((2, 0, 1))
    return channel_first[np.newaxis, :]

# Preprocess the loaded image into the array that is fed to the model.
x = transform_image(image)


ctx = tvm.cpu()
# Read the deployment artifacts; the context managers close each file as soon
# as its contents have been read.
with open("deploy_graph.json") as graph_file:
    loaded_graph = graph_file.read()
loaded_lib = tvm.module.load("./net.tar")
with open("deploy_param.params", "rb") as params_file:
    loaded_params = bytearray(params_file.read())
input_data = tvm.nd.array(x.astype('float32'))


# create the runtime module on the CPU context
module = runtime.create(loaded_graph, loaded_lib, ctx)
# load the serialized parameters into the module (this may take a while)
module.load_params(loaded_params)
# run
module.run(data=input_data)
# get output
out = module.get_output(0)
# get top1 result
top1 = np.argmax(out.asnumpy())
print('TVM prediction top-1: {}'.format(synset[top1]))
Example #59
0
def main():
    """Tune, compile, deploy and benchmark a workload on a remote device.

    Steps, in order:

    1. Obtain the model via ``get_workload`` for a ``(1, 3, 224, 224)`` input
       and extract the conv2d/dense tuning tasks from it.
    2. Run ``tune_tasks`` with the module-level ``tuning_option``.
    3. Build the model with the best records from ``log_file`` applied,
       export it as ``net.tar`` and upload it to a device requested through
       ``autotvm.measure.request_remote``.
    4. Run the model on random input and print the fastest single result
       followed by the mean and standard deviation in milliseconds.

    ``target``, ``target_host``, ``tuning_option``, ``log_file``,
    ``device_key`` and ``dtype`` are read from the enclosing module.
    """
    # extract workloads from relay program
    input_shape = (1, 3, 224, 224)
    print("Extrack tasks...")
    mod, params = get_workload(image_shape=input_shape[1:],
                               batch_size=input_shape[0])

    tasks = autotvm.task.extract_from_program(mod["main"],
                                              target=target,
                                              target_host=target_host,
                                              params=params,
                                              ops=(
                                                  relay.op.nn.conv2d,
                                                  relay.op.nn.dense,
                                              ))

    # run tuning tasks
    print("Tuning...")

    tune_tasks(tasks, **tuning_option)

    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=0):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params, target_host=target_host)

            tmp = tempdir()
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

            remote = autotvm.measure.request_remote(device_key,
                                                    '0.0.0.0',
                                                    9192,
                                                    timeout=10000)
            remote.upload(tmp.relpath(filename))
            rlib = remote.load_module(filename)

            ctx = remote.context(str(target), 0)
            module = runtime.create(graph, rlib, ctx)
            data_tvm = tvm.nd.array(
                (np.random.uniform(size=input_shape)).astype(dtype))

            print("Run...")
            print("Set_input(\"data\")")
            module.set_input('data', data_tvm)
            print("Set_input(**param)")
            module.set_input(**params)

            # evaluate
            print("Evaluate inference time cost...")
            ftimer = module.module.time_evaluator("run",
                                                  ctx,
                                                  number=1,
                                                  repeat=600)
            # Invoke the evaluator exactly once: every call re-runs all 600
            # repeats, and the minimum and the mean/std should describe the
            # same measurement run.
            results = ftimer().results
            prof_res = np.array(results) * 1000
            # fastest single result, in the unit reported by the evaluator
            print(min(results))
            print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
                  (np.mean(prof_res), np.std(prof_res)))
def run_case(model, dtype):
    """Build one NNVM test workload for the Mali target and print its cost.

    Parameters
    ----------
    model : str
        One of ``'vgg16'``, ``'resnet18'`` or ``'mobilenet'``.
    dtype : str
        Data type passed to the workload and the build; ``'float32'`` is
        built with ``opt_level=2``, anything else with ``opt_level=1``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.

    Notes
    -----
    ``image_shape``, ``data_shape`` and the parsed command-line ``args``
    (``target_host``, ``host``, ``port``) are read from the enclosing module.
    """
    # load model
    if model == 'vgg16':
        net, params = nnvm.testing.vgg.get_workload(num_layers=16,
                                                    batch_size=1,
                                                    image_shape=image_shape,
                                                    dtype=dtype)
    elif model == 'resnet18':
        net, params = nnvm.testing.resnet.get_workload(num_layers=18,
                                                       batch_size=1,
                                                       image_shape=image_shape,
                                                       dtype=dtype)
    elif model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape, dtype=dtype)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(model))

    # compile
    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(net,
                                                 tvm.target.mali(),
                                                 shape={"data": data_shape},
                                                 params=params,
                                                 dtype=dtype,
                                                 target_host=args.target_host)

    # export the compiled library so it can be uploaded to a remote device
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    if args.host is not None:
        # Remote run: upload the library over RPC and copy the parameters
        # into arrays created on the remote OpenCL context.
        remote = rpc.connect(args.host, args.port)
        remote.upload(lib_fname)

        ctx = remote.cl(0)
        rlib = remote.load_module('net.tar')
        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
    else:
        # Local run: use the in-process library and parameters directly.
        ctx = tvm.cl(0)
        rlib = lib
        rparams = params

    # create graph runtime and feed it random input data plus the parameters
    module = runtime.create(graph, rlib, ctx)
    module.set_input(
        'data',
        tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)

    # benchmark
    # print("============================================================")
    # print("model: %s, dtype: %s" % (model, dtype))

    # the num of runs for warm up and test
    num_warmup = 10
    num_test = 60
    if model == 'mobilenet':  # mobilenet is fast, need more runs for stable measurement
        num_warmup *= 5
        num_test *= 5

    # perform some warm up runs (result is discarded)
    # NOTE(review): the third positional argument is presumably the number of
    # runs per measurement -- confirm against the time_evaluator signature.
    # print("warm up..")
    warm_up_timer = module.module.time_evaluator("run", ctx, num_warmup)
    warm_up_timer()

    # test
    # print("test..")
    ftimer = module.module.time_evaluator("run", ctx, num_test)
    prof_res = ftimer()
    # print("cost per image: %.4fs" % prof_res.mean)

    # one tab-separated summary line per (model, dtype) case
    print("backend: TVM-mali\tmodel: %s\tdtype: %s\tcost:%.4f" %
          (model, dtype, prof_res.mean))