# Assumed imports for these NNPU/NNVM tests; the nnpu module paths follow the
# NNPU project layout and may need adjusting for your checkout.
import numpy as np
import tvm
import nnvm
import nnvm.compiler
import nnvm.symbol as sym
import nnpu
from nnpu.utils import ScheduleProcHelper
from tvm.contrib import graph_runtime as runtime


def test_batch_norm():
    # NOTE: despite its name, this test currently builds a single relu op.
    input_shape = (1, 4, 4, 16)
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs1 = nnvm.symbol.Variable("inputs1")
    z1 = nnvm.symbol.relu(inputs1)
    # z2 = nnvm.symbol.relu(z1)
    compute_graph = nnvm.graph.create(z1)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs1": input_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs1": input_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=input_shape, low=-32, high=32).astype(np.float32)
    print(a_np)
    module.set_input(inputs1=a_np)
    module.run()
    out = module.get_output(0, out=tvm.nd.empty(input_shape))
    print(out.asnumpy())
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test_dense():
    shape = (16, 1024)
    weight_shape = (256, 1024)
    bias_shape = (256,)
    inputs = nnvm.symbol.Variable("inputs")
    weights = nnvm.symbol.Variable("weights")
    bias = nnvm.symbol.Variable("bias")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    z = nnvm.symbol.dense(data=inputs, weight=weights, use_bias=0, units=256)
    z1 = nnvm.symbol.relu(z)
    compute_graph = nnvm.graph.create(z1)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target,
                shape={"inputs": shape, "weights": weight_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target,
                        shape={"inputs": shape, "weights": weight_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(size=shape)
    b_np = np.random.random(size=weight_shape)
    m.set_input(**{"inputs": a_np, "weights": b_np})
    m.run()
    # dense computes data * weight^T
    gt = a_np.dot(b_np.transpose())
    out = m.get_output(0, out=tvm.nd.empty((16, 256)))
    np.testing.assert_allclose(out.asnumpy(), gt, rtol=5e-5)
    print("dense test passed")
    print(out)
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test_conv2d():
    input_shape = (1, 16, 10, 64)
    residual_shape = (1, 14, 8, 64)  # output shape of the 3x3 valid convolution
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs = nnvm.symbol.Variable("inputs")
    inputs1 = nnvm.symbol.Variable("inputs1")
    z1 = nnvm.symbol.conv2d(data=inputs, channels=64, kernel_size=(3, 3),
                            padding=(0, 0), use_bias=False,
                            layout='NHWC', kernel_layout='HWOI')
    z2 = nnvm.symbol.sigmoid(z1)
    z = nnvm.symbol.elemwise_add(z2, inputs1)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target,
                shape={"inputs": input_shape, "inputs1": residual_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    # "inputs1" needs a shape here as well, since the graph
                    # adds it to the convolution output.
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target,
                        shape={"inputs": input_shape, "inputs1": residual_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=input_shape, low=-32, high=32).astype(np.float32)
    b_np = np.random.uniform(size=residual_shape, low=-32, high=32).astype(np.float32)
    module.set_input(inputs=a_np, inputs1=b_np)  # both graph inputs need data
    module.run()
    print(deploy_graph.ir())
    out = module.get_output(0, out=tvm.nd.empty(residual_shape))
def test_elemwise_mul():
    env = nnpu.get_env()
    device = "nnpu"
    target_host = "llvm"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs1 = nnvm.symbol.Variable("inputs1")
    inputs2 = nnvm.symbol.Variable("inputs2")
    shape = (16, 6, 16)
    z = nnvm.symbol.elemwise_mul(inputs1, inputs2)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target,
                shape={"inputs1": shape, "inputs2": shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target,
                        shape={"inputs1": shape, "inputs2": shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(shape)
    b_np = np.random.random(shape)
    print("a_np :")
    print(a_np)
    print("b_np :")
    print(b_np)
    m.set_input(**{"inputs1": a_np, "inputs2": b_np})
    gt = (a_np.astype("float32") * b_np.astype("float32")).astype("float32")
    m.run()
    out = m.get_output(0, out=tvm.nd.empty(shape))
    np.testing.assert_allclose(out.asnumpy(), gt)
    print("elemwise_mul test passed")
    print(out)
def test_max_pool2d():
    # NOTE: despite its name, this test builds avg_pool2d and checks it
    # against an average-pooling ground truth.
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    target_host = "llvm"
    inputs = nnvm.symbol.Variable("inputs")
    shape = (1, 224, 224, 16)
    out_shape = (1, 223, 223, 16)  # 2x2 pooling, stride 1, no padding
    z = nnvm.symbol.avg_pool2d(inputs, pool_size=(2, 2), strides=(1, 1),
                               layout="NHWC")
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs": shape}, dtype="float32")
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": shape},
                        dtype="float32")
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(size=shape)
    m.set_input(**{"inputs": a_np})
    m.run()
    out = m.get_output(0, out=tvm.nd.empty(out_shape))
    gt = avg_pooling(shape, out_shape, (2, 2), a_np, (1, 1), "float32")
    np.testing.assert_allclose(out.asnumpy(), gt, rtol=5e-7)
    print("avg_pool2d test passed")
    print(gt)
    print(out)
    print("end")
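# `avg_pooling` above is a ground-truth helper that is not defined in this
# file. The sketch below is an assumed NumPy reference implementation,
# inferred only from the call site's argument order
# (input_shape, output_shape, pool_size, data, strides, dtype) and the NHWC
# layout used throughout these tests; the project's real helper may differ.
def avg_pooling(input_shape, output_shape, pool_size, data, strides, dtype):
    """NumPy reference for NHWC average pooling with no padding (assumed)."""
    _, out_h, out_w, _ = output_shape  # input_shape kept for signature parity
    k_h, k_w = pool_size
    s_h, s_w = strides
    out = np.zeros(output_shape, dtype=dtype)
    for i in range(out_h):
        for j in range(out_w):
            # average over each (k_h, k_w) window, per batch and channel
            window = data[:, i * s_h:i * s_h + k_h, j * s_w:j * s_w + k_w, :]
            out[:, i, j, :] = window.mean(axis=(1, 2))
    return out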
def test_log():
    env = nnpu.get_env()
    shape = (1, 22, 22, 16)
    device = "nnpu"
    target_host = "llvm"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs = nnvm.symbol.Variable("inputs")
    z = nnvm.symbol.log(inputs)
    z1 = nnvm.symbol.exp(z)
    compute_graph = nnvm.graph.create(z1)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs": shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(shape)
    print(a_np)
    m.set_input(**{"inputs": a_np})
    m.run()
    out = m.get_output(0, out=tvm.nd.empty(shape))
    # exp(log(x)) should round-trip back to x up to float32 rounding
    gt = np.exp(np.log(a_np.astype("float32")).astype("float32")).astype("float32")
    print(out)
    np.testing.assert_allclose(out.asnumpy(), gt)
    print("log test passed")
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test_relu():
    shape = (2, 16)
    inputs = nnvm.symbol.Variable("inputs")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    z = nnvm.symbol.relu(inputs)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs": shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    m = runtime.create(deploy_graph, lib, ctx)
    # shift to [-0.5, 0.5) so both sides of the relu are exercised
    a_np = np.random.random(size=shape).astype("float32") - 0.5
    m.set_input(**{'inputs': a_np})
    m.run()
    out = m.get_output(0, out=tvm.nd.empty(shape))
    print(a_np)
    print(out.dtype)
    print(out)
    np.testing.assert_allclose(out.asnumpy(), np.maximum(a_np, 0))
    print("relu test passed")
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test_onemore():
    shape = (1, 32, 32, 16)
    inputs = nnvm.symbol.Variable("inputs")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    z1 = nnvm.symbol.relu(inputs)
    z = nnvm.symbol.sqrt(z1)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"inputs": shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"inputs": shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    m = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(size=shape)
    m.set_input(**{'inputs': a_np})
    m.run()
    out = m.get_output(0, out=tvm.nd.empty(shape))
    # the graph computes sqrt(relu(x)); inputs here are non-negative, so this
    # is simply sqrt(x)
    gt = np.sqrt(np.maximum(a_np, 0)).astype("float32")
    np.testing.assert_allclose(out.asnumpy(), gt, rtol=1e-5)
    print(out)
    print(compute_graph.ir())
    print(deploy_graph.ir())
def test_vgg():
    def get_feature(internel_layer, layers, filters, batch_norm=False):
        """Get the VGG feature body as stacks of convolutions.

        For VGG-11: layers = [1, 1, 2, 2, 2], filters = [64, 128, 256, 512, 512].
        """
        for i, num in enumerate(layers):
            for j in range(num):
                internel_layer = sym.pad(
                    data=internel_layer,
                    pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
                internel_layer = sym.conv2d(
                    data=internel_layer, kernel_size=(3, 3), channels=filters[i],
                    layout='NHWC', kernel_layout='HWOI',
                    name="conv%s_%s" % (i + 1, j + 1))
                if batch_norm:
                    internel_layer = sym.batch_norm(
                        data=internel_layer, axis=3,
                        name="bn%s_%s" % (i + 1, j + 1))
                internel_layer = sym.relu(data=internel_layer,
                                          name="relu%s_%s" % (i + 1, j + 1))
            internel_layer = sym.max_pool2d(data=internel_layer,
                                            pool_size=(2, 2), strides=(2, 2),
                                            layout="NHWC",
                                            name="pool%s" % (i + 1))
        return internel_layer

    def get_classifier(input_data, num_classes):
        """Get the VGG classifier as fully connected layers."""
        flatten = sym.flatten(data=input_data, name="flatten")
        fc1 = sym.dense(data=flatten, units=32, name="fc1")
        relu1 = sym.relu(data=fc1, name="relu1")
        drop1 = sym.dropout(data=relu1, rate=0.5, name="drop1")
        fc2 = sym.dense(data=drop1, units=32, name="fc2")
        relu2 = sym.relu(data=fc2, name="relu2")
        drop2 = sym.dropout(data=relu2, rate=0.5, name="drop2")
        fc3 = sym.dense(data=drop2, units=num_classes, name="fc3")
        return fc3

    def get_symbol(datas, num_classes, num_layers=11, batch_norm=False):
        """Build a VGG symbol.

        Parameters
        ----------
        num_classes : int, default 16
            Number of classification classes.
        num_layers : int
            Number of layers for the VGG variant. Options are 11, 13, 16, 19.
        batch_norm : bool, default False
            Use batch normalization.
        """
        vgg_spec = {
            11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])
        }
        if num_layers not in vgg_spec:
            raise ValueError(
                "Invalid num_layers {}. Choices are 11, 13, 16, 19.".format(
                    num_layers))
        layers, filters = vgg_spec[num_layers]
        feature = get_feature(datas, layers, filters, batch_norm)
        classifier = get_classifier(feature, num_classes)
        symbol = sym.softmax(data=classifier, name="softmax")
        return symbol

    input_shape = (1, 224, 224, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 1
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"data": input_shape},
                dtype="float32", target_host=target_host)
        else:
            nnpu.set_device(nnpu.get_env(), type='SC')
            with ScheduleProcHelper():
                with nnpu.build_config():
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"data": input_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=input_shape, low=-32, high=32).astype(np.float32)
    print(a_np)
    module.set_input(data=a_np)
    ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1)
    # run the timed evaluation (which also executes the graph) before reading
    # the output, since module.run() is not called separately here
    prof_res = ftimer()
    out = module.get_output(0, out=tvm.nd.empty((1, 16)))
    print(out.asnumpy())
    print(deploy_graph.ir())
    print(prof_res.mean * 10)
def test_inception_v3():
    def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0),
             name=None, suffix=''):
        if pad[0] != 0 or pad[1] != 0:
            data = sym.pad(data=data,
                           pad_width=((0, 0), (pad[0], pad[0]),
                                      (pad[1], pad[1]), (0, 0)))
        conv = sym.conv2d(data=data, channels=num_filter, kernel_size=kernel,
                          strides=stride, padding=(0, 0), use_bias=False,
                          layout='NHWC', kernel_layout='HWOI',
                          name="%s%s_conv2d" % (name, suffix))
        bn = sym.batch_norm(data=conv, name="%s%s_batchnorm" % (name, suffix),
                            epsilon=2e-5, axis=3)
        act = sym.relu(data=bn, name="%s%s_relu" % (name, suffix))
        return act

    def Pooling(data, kernel, stride, pad, pool_type, name):
        if pad[0] != 0 or pad[1] != 0:
            data = sym.pad(data=data,
                           pad_width=((0, 0), (pad[0], pad[0]),
                                      (pad[1], pad[1]), (0, 0)))
        if pool_type == 'max':
            return sym.max_pool2d(data=data, pool_size=kernel, strides=stride,
                                  name=name, layout='NHWC')
        if pool_type == 'avg':
            return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride,
                                  name=name, layout='NHWC')
        raise ValueError("Invalid pooling type: " + pool_type)

    def Inception7A(data, num_1x1, num_3x3_red, num_3x3_1, num_3x3_2,
                    num_5x5_red, num_5x5, pool, proj, name):
        # e.g. first call: num_1x1=64, num_3x3_red=64, num_3x3_1=96,
        # num_3x3_2=96, num_5x5_red=48, num_5x5=64
        tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
        tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name),
                         suffix='_conv')
        tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2),
                         name=('%s_tower' % name), suffix='_conv_1')
        tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name),
                         suffix='_conv')
        tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1),
                         name=('%s_tower_1' % name), suffix='_conv_1')
        tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1),
                         name=('%s_tower_1' % name), suffix='_conv_2')
        pooling = Pooling(data, kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                          pool_type=pool,
                          name=('%s_pool_%s_pool' % (pool, name)))
        cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
        concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj],
                                 axis=3, name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7B(data, num_3x3, num_d3x3_red, num_d3x3_1, num_d3x3_2, pool,
                    name):
        tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0),
                         stride=(2, 2), name=('%s_conv' % name))
        tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name),
                          suffix='_conv')
        tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1),
                          stride=(1, 1), name=('%s_tower' % name),
                          suffix='_conv_1')
        tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0),
                          stride=(2, 2), name=('%s_tower' % name),
                          suffix='_conv_2')
        pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0),
                          pool_type="max", name=('max_pool_%s_pool' % name))
        concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling], axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7C(data, num_1x1, num_d7_red, num_d7_1, num_d7_2, num_q7_red,
                    num_q7_1, num_q7_2, num_q7_3, num_q7_4, pool, proj, name):
        tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1),
                         name=('%s_conv' % name))
        tower_d7 = Conv(data=data, num_filter=num_d7_red,
                        name=('%s_tower' % name), suffix='_conv')
        tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7),
                        pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1')
        tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1),
                        pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2')
        tower_q7 = Conv(data=data, num_filter=num_q7_red,
                        name=('%s_tower_1' % name), suffix='_conv')
        tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1),
                        pad=(3, 0), name=('%s_tower_1' % name),
                        suffix='_conv_1')
        tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7),
                        pad=(0, 3), name=('%s_tower_1' % name),
                        suffix='_conv_2')
        tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1),
                        pad=(3, 0), name=('%s_tower_1' % name),
                        suffix='_conv_3')
        tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7),
                        pad=(0, 3), name=('%s_tower_1' % name),
                        suffix='_conv_4')
        pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                          pool_type=pool,
                          name=('%s_pool_%s_pool' % (pool, name)))
        cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
                     name=('%s_tower_2' % name), suffix='_conv')
        concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj],
                                 axis=3, name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7D(data, num_3x3_red, num_3x3, num_d7_3x3_red, num_d7_1,
                    num_d7_2, num_d7_3x3, pool, name):
        tower_3x3 = Conv(data=data, num_filter=num_3x3_red,
                         name=('%s_tower' % name), suffix='_conv')
        tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3),
                         pad=(0, 0), stride=(2, 2), name=('%s_tower' % name),
                         suffix='_conv_1')
        tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red,
                            name=('%s_tower_1' % name), suffix='_conv')
        tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1,
                            kernel=(1, 7), pad=(0, 3),
                            name=('%s_tower_1' % name), suffix='_conv_1')
        tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2,
                            kernel=(7, 1), pad=(3, 0),
                            name=('%s_tower_1' % name), suffix='_conv_2')
        tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3,
                            kernel=(3, 3), stride=(2, 2),
                            name=('%s_tower_1' % name), suffix='_conv_3')
        pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2),
                          pool_type=pool, pad=(0, 0),
                          name=('%s_pool_%s_pool' % (pool, name)))
        concat = sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling], axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7E(data, num_1x1, num_d3_red, num_d3_1, num_d3_2,
                    num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, pool,
                    proj, name):
        tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1),
                         name=('%s_conv' % name))
        tower_d3 = Conv(data=data, num_filter=num_d3_red,
                        name=('%s_tower' % name), suffix='_conv')
        tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3),
                          pad=(0, 1), name=('%s_tower' % name),
                          suffix='_mixed_conv')
        tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1),
                          pad=(1, 0), name=('%s_tower' % name),
                          suffix='_mixed_conv_1')
        tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red,
                            name=('%s_tower_1' % name), suffix='_conv')
        tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3,
                            kernel=(3, 3), pad=(1, 1),
                            name=('%s_tower_1' % name), suffix='_conv_1')
        tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1,
                              kernel=(1, 3), pad=(0, 1),
                              name=('%s_tower_1' % name),
                              suffix='_mixed_conv')
        tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2,
                              kernel=(3, 1), pad=(1, 0),
                              name=('%s_tower_1' % name),
                              suffix='_mixed_conv_1')
        pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                          pool_type=pool,
                          name=('%s_pool_%s_pool' % (pool, name)))
        cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
                     name=('%s_tower_2' % name), suffix='_conv')
        concat = sym.concatenate(
            *[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a,
              tower_3x3_d3_b, cproj],
            axis=3, name='ch_concat_%s_chconcat' % name)
        return concat

    def get_symbol(data, num_classes=16, **kwargs):
        conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
        conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
        conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
        pool = Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2),
                       pool_type="max", pad=(0, 0), name="pool")
        conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
        conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
        pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2),
                        pool_type="max", pad=(0, 0), name="pool1")
        in3a = Inception7A(pool1, 64, 64, 96, 96, 48, 64, "avg", 32, "mixed")
        in3b = Inception7A(in3a, 64, 64, 96, 96, 48, 64, "avg", 64, "mixed_1")
        in3c = Inception7A(in3b, 64, 64, 96, 96, 48, 64, "avg", 64, "mixed_2")
        in3d = Inception7B(in3c, 384, 64, 96, 96, "max", "mixed_3")
        in4a = Inception7C(in3d, 192, 128, 128, 192, 128, 128, 128, 128, 192,
                           "avg", 192, "mixed_4")
        in4b = Inception7C(in4a, 192, 160, 160, 192, 160, 160, 160, 160, 192,
                           "avg", 192, "mixed_5")
        in4c = Inception7C(in4b, 192, 160, 160, 192, 160, 160, 160, 160, 192,
                           "avg", 192, "mixed_6")
        in4d = Inception7C(in4c, 192, 192, 192, 192, 192, 192, 192, 192, 192,
                           "avg", 192, "mixed_7")
        in4e = Inception7D(in4d, 192, 320, 192, 192, 192, 192, "max",
                           "mixed_8")
        in5a = Inception7E(in4e, 320, 384, 384, 384, 448, 384, 384, 384,
                           "avg", 192, "mixed_9")
        in5b = Inception7E(in5a, 320, 384, 384, 384, 448, 384, 384, 384,
                           "max", 192, "mixed_10")
        pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1),
                       pool_type="avg", pad=(0, 0), name="global_pool")
        flatten = sym.flatten(data=pool, name="flatten")
        fc1 = sym.dense(data=flatten, units=num_classes, name="fc1")
        softmax = sym.softmax(data=fc1, name="softmax")
        return softmax

    input_shape = (1, 299, 299, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 3
    z = get_symbol(data=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"data": input_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"data": input_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=input_shape, low=-32,
                             high=32).astype(np.float32)
    print(a_np)
    module.set_input(data=a_np)
    ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1)
    module.run()
    out = module.get_output(0, out=tvm.nd.empty((1, 16)))
    print(out.asnumpy())
    print(deploy_graph.ir())
    print(ftimer().mean * 10)
# We can move the gemm into the acc2buffer copy.
xo, xi = s[out_buf].split(out_buf.op.axis[0], factor=gemm_shape[0])
s[prod_buf].compute_at(s[out_buf], xo)
s[out_buf].pragma(xi, env.copy_acc2buf)

# Split and tensorize VAddV.
nvctr_unit = env.cfg['vector_unit']['size']
xo, xi = s[res_buf].split(res_buf.op.axis[0], factor=nvctr_unit)
s[res_buf].tensorize(xi, env.intrins.get('VAddV', mode='w'))

# ================================== #
# ------ this ends the scheduling ------
# ================================== #

# with nnpu.build_config(dump_pass_ir=True):
with nnpu.build_config():
    print(nnpu.lower(s, [weight, data, bias, res_host], simple_mode=True))
func = nnpu.build(s, [weight, data, bias, res_host], 'nnpu', 'llvm',
                  name='nnpu_func')

print('------------------- device module 1 TVM IR: ')
print(func.imported_modules[0].get_source('ir'))
print('------------------- device module 1 uop: ')
print(func.imported_modules[0].get_source('uop'))

ctx = tvm.nd.TVMContext(13, 0)
a_np = np.random.randint(size=weight_shape, dtype=weight.dtype, low=-32,
                         high=32)
def test_densenet():
    def Conv(datas, kernel_size, filter_nums, stride=(1, 1), pad=(0, 0)):
        if pad[0] != 0 or pad[1] != 0:
            datas = nnvm.symbol.pad(
                data=datas,
                pad_width=((0, 0), (pad[0], pad[0]), (pad[1], pad[1]), (0, 0)))
        conv = nnvm.symbol.conv2d(data=datas, kernel_size=kernel_size,
                                  channels=filter_nums, strides=stride,
                                  layout='NHWC', kernel_layout='HWOI')
        return conv

    def bottleneck_layer(datas, filters):
        bn1 = nnvm.symbol.batch_norm(data=datas, epsilon=2e-5, axis=3)
        relu1 = nnvm.symbol.relu(data=bn1)
        conv1 = Conv(datas=relu1, kernel_size=(1, 1), filter_nums=4 * filters)
        bn2 = nnvm.symbol.batch_norm(data=conv1, epsilon=2e-5, axis=3)
        relu2 = nnvm.symbol.relu(data=bn2)
        conv2 = Conv(datas=relu2, kernel_size=(3, 3), filter_nums=filters,
                     pad=(1, 1))
        return conv2

    def transition_layer(datas, filters):
        conv = Conv(datas=datas, kernel_size=(1, 1), filter_nums=filters)
        pool = nnvm.symbol.avg_pool2d(data=conv, pool_size=(2, 2),
                                      strides=(2, 2), layout='NHWC')
        return pool

    def dense_block(datas, filters, layers):
        layers_concat = [datas]
        x = bottleneck_layer(datas, filters)
        layers_concat.append(x)
        # each layer sees the concatenation of all previous feature maps
        for i in range(layers - 1):
            x = nnvm.symbol.concatenate(*layers_concat, axis=3)
            x = bottleneck_layer(x, filters)
            layers_concat.append(x)
        return x

    def get_symbol(datas, num_classes=16):
        x = Conv(datas, kernel_size=(7, 7), filter_nums=32, stride=(2, 2))
        x = nnvm.symbol.max_pool2d(x, pool_size=(3, 3), strides=(2, 2),
                                   layout='NHWC')
        b1 = dense_block(x, 32, 6)
        l1 = transition_layer(b1, 32)
        b2 = dense_block(l1, 32, 12)
        l2 = transition_layer(b2, 32)
        b3 = dense_block(l2, 32, 48)
        l3 = transition_layer(b3, 32)
        b4 = dense_block(l3, 32, 32)
        x = nnvm.symbol.global_avg_pool2d(data=b4, layout='NHWC')
        x = nnvm.symbol.flatten(data=x)
        fc = nnvm.symbol.dense(data=x, units=num_classes)
        symbol = nnvm.symbol.softmax(data=fc)
        return symbol

    input_shape = (1, 229, 229, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 3
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"data": input_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"data": input_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.random(size=input_shape)
    print(a_np)
    module.set_input(data=a_np)
    ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1)
    module.run()
    out = module.get_output(0, out=tvm.nd.empty((1, 16)))
    print(out.asnumpy())
    print(deploy_graph.ir())
    print(ftimer().mean)
def test_Alexnet():
    def Conv(data, kernel_size, filter_nums, stride=(1, 1), pad=(0, 0)):
        if pad[0] != 0 or pad[1] != 0:
            data = nnvm.symbol.pad(
                data=data,
                pad_width=((0, 0), (pad[0], pad[0]), (pad[1], pad[1]), (0, 0)))
        datas = nnvm.symbol.conv2d(data=data, kernel_size=kernel_size,
                                   channels=filter_nums, strides=stride,
                                   layout='NHWC', kernel_layout='HWOI')
        datas = nnvm.symbol.relu(data=datas)
        return datas

    def get_symbol(datas, num_classes):
        conv1 = Conv(data=datas, kernel_size=(11, 11), filter_nums=96,
                     stride=(4, 4))
        pool1 = nnvm.symbol.max_pool2d(data=conv1, pool_size=(3, 3),
                                       strides=(2, 2), layout='NHWC')
        conv2 = Conv(data=pool1, kernel_size=(5, 5), filter_nums=256,
                     pad=(2, 2))
        pool2 = nnvm.symbol.max_pool2d(data=conv2, pool_size=(3, 3),
                                       strides=(2, 2), layout='NHWC')
        conv3 = Conv(data=pool2, kernel_size=(3, 3), filter_nums=384,
                     pad=(1, 1))
        conv4 = Conv(data=conv3, kernel_size=(3, 3), filter_nums=384,
                     pad=(1, 1))
        conv5 = Conv(data=conv4, kernel_size=(3, 3), filter_nums=256,
                     pad=(1, 1))
        pool3 = nnvm.symbol.max_pool2d(data=conv5, pool_size=(3, 3),
                                       strides=(2, 2), layout='NHWC')
        datas = nnvm.symbol.flatten(data=pool3)
        fc1 = nnvm.symbol.dense(data=datas, units=1024)
        relu1 = nnvm.symbol.relu(data=fc1)
        drop1 = nnvm.symbol.dropout(data=relu1, rate=0.5)
        fc2 = nnvm.symbol.dense(data=drop1, units=1024)
        relu2 = nnvm.symbol.relu(data=fc2)
        drop2 = nnvm.symbol.dropout(data=relu2, rate=0.5)
        fc3 = nnvm.symbol.dense(data=drop2, units=num_classes)
        symbol = nnvm.symbol.softmax(fc3)
        return symbol

    input_shape = (1, 128, 128, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 1
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"data": input_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"data": input_shape},
                        dtype="float32", target_host=target_host)
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    module = runtime.create(deploy_graph, lib, ctx)
    # cast to the graph's float32 dtype; randint alone would yield int64
    a_np = np.random.randint(size=input_shape, low=-32,
                             high=32).astype(np.float32)
    print(a_np)
    module.set_input(data=a_np)
    ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1)
    # run the timed evaluation (which also executes the graph) before reading
    # the output, since module.run() is not called separately here
    prof_res = ftimer()
    out = module.get_output(0, out=tvm.nd.empty((1, 16)))
    print(out.asnumpy())
    print(deploy_graph.ir())
    print(prof_res.mean * 10)
def test_resnets():
    def residual_unit(data, num_filter, stride, dim_match, name,
                      bottle_neck=True):
        """Return a ResNet unit symbol for building a ResNet.

        Parameters
        ----------
        data : Symbol
            Input data
        num_filter : int
            Number of output channels
        stride : tuple
            Stride used in convolution
        dim_match : bool
            True means the channel number of input and output is the same,
            otherwise they differ
        """
        if bottle_neck:
            bn1 = nnvm.symbol.batch_norm(data=data, axis=3, epsilon=2e-5,
                                         name=name + '_bn1')
            act1 = nnvm.symbol.relu(data=bn1, name=name + '_relu1')
            conv1 = nnvm.symbol.conv2d(data=act1,
                                       channels=int(num_filter * 0.25),
                                       kernel_size=(1, 1), strides=stride,
                                       padding=(0, 0), use_bias=False,
                                       layout='NHWC', kernel_layout='HWOI',
                                       name=name + '_conv1')
            bn2 = nnvm.symbol.batch_norm(data=conv1, axis=3, epsilon=2e-5,
                                         name=name + '_bn2')
            act2 = nnvm.symbol.relu(data=bn2, name=name + '_relu2')
            pad = nnvm.symbol.pad(data=act2,
                                  pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
            conv2 = nnvm.symbol.conv2d(data=pad,
                                       channels=int(num_filter * 0.25),
                                       kernel_size=(3, 3), strides=(1, 1),
                                       padding=(0, 0), use_bias=False,
                                       layout='NHWC', kernel_layout='HWOI',
                                       name=name + '_conv2')
            bn3 = nnvm.symbol.batch_norm(data=conv2, axis=3, epsilon=2e-5,
                                         name=name + '_bn3')
            act3 = nnvm.symbol.relu(data=bn3, name=name + '_relu3')
            conv3 = nnvm.symbol.conv2d(data=act3, channels=num_filter,
                                       kernel_size=(1, 1), strides=(1, 1),
                                       padding=(0, 0), use_bias=False,
                                       layout='NHWC', kernel_layout='HWOI',
                                       name=name + '_conv3')
            if dim_match:
                shortcut = data
            else:
                shortcut = nnvm.symbol.conv2d(data=act1, channels=num_filter,
                                              kernel_size=(1, 1),
                                              strides=stride, use_bias=False,
                                              layout='NHWC',
                                              kernel_layout='HWOI',
                                              name=name + '_sc')
            return nnvm.symbol.elemwise_add(conv3, shortcut)
        else:
            # Stage configuration for ResNet-18 (bottle_neck=False):
            # i = 0: filter_list[1] = 64,  stride (1, 1) -> (56, 56, 64)
            # i = 1: filter_list[2] = 128, stride (2, 2) -> (28, 28, 128)
            # i = 2: filter_list[3] = 256, stride (2, 2) -> (14, 14, 256)
            # i = 3: filter_list[4] = 512, stride (2, 2) -> (7, 7, 512)
            # bn1 = nnvm.symbol.batch_norm(data=data, axis=3, epsilon=2e-5,
            #                              name=name + '_bn1')
            act1 = nnvm.symbol.relu(data=data, name=name + '_relu1')
            pad1 = nnvm.symbol.pad(data=act1,
                                   pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
            conv1 = nnvm.symbol.conv2d(data=pad1, channels=num_filter,
                                       kernel_size=(3, 3), strides=stride,
                                       padding=(0, 0), use_bias=False,
                                       layout='NHWC', kernel_layout='HWOI',
                                       name=name + '_conv1')
            # bn2 = nnvm.symbol.batch_norm(data=conv1, axis=3, epsilon=2e-5,
            #                              name=name + '_bn2')
            act2 = nnvm.symbol.relu(data=conv1, name=name + '_relu2')
            pad2 = nnvm.symbol.pad(data=act2,
                                   pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
            conv2 = nnvm.symbol.conv2d(data=pad2, channels=num_filter,
                                       kernel_size=(3, 3), strides=(1, 1),
                                       padding=(0, 0), use_bias=False,
                                       layout='NHWC', kernel_layout='HWOI',
                                       name=name + '_conv2')
            if dim_match:
                shortcut = data
            else:
                shortcut = nnvm.symbol.conv2d(data=act1, channels=num_filter,
                                              kernel_size=(1, 1),
                                              strides=stride, use_bias=False,
                                              layout='NHWC',
                                              kernel_layout='HWOI',
                                              name=name + '_sc')
            return nnvm.symbol.elemwise_add(conv2, shortcut)

    def resnet(datas, units, num_stages, filter_list, num_classes,
               image_shape, bottle_neck=True):
        """Return a ResNet symbol.

        Parameters
        ----------
        units : list
            Number of units in each stage
        num_stages : int
            Number of stages
        filter_list : list
            Channel size of each stage
        num_classes : int
            Output size of the final dense layer
        """
        # For ResNet-18: units = [2, 2, 2, 2], num_stages = 4,
        # filter_list = [64, 64, 128, 256, 512], bottle_neck = False.
        num_unit = len(units)
        assert num_unit == num_stages
        data = nnvm.symbol.batch_norm(data=datas, axis=3, epsilon=2e-5,
                                      scale=False, name="bn_data")
        (_, height, _, _) = image_shape
        if height <= 32:
            pad = nnvm.symbol.pad(data=data,
                                  pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
            body = nnvm.symbol.conv2d(data=pad, channels=filter_list[0],
                                      kernel_size=(3, 3), strides=(1, 1),
                                      padding=(1, 1), use_bias=False,
                                      layout='NHWC', kernel_layout='HWOI',
                                      name="conv0")
        else:
            pad = nnvm.symbol.pad(data=data,
                                  pad_width=((0, 0), (3, 3), (3, 3), (0, 0)))
            body = nnvm.symbol.conv2d(data=pad, channels=filter_list[0],
                                      kernel_size=(7, 7), strides=(2, 2),
                                      padding=(0, 0), use_bias=False,
                                      layout='NHWC', kernel_layout='HWOI',
                                      name="conv0")
        # body shape: (112, 112, 64)
        # body = nnvm.symbol.batch_norm(data=body, axis=3, epsilon=2e-5,
        #                               name="bn0")
        body = nnvm.symbol.relu(data=body, name="relu0")
        # body = nnvm.symbol.pad(data=body,
        #                        pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
        body = nnvm.symbol.max_pool2d(data=body, pool_size=(3, 3),
                                      strides=(2, 2), layout='NHWC')
        # body shape: (56, 56, 64)
        for i in range(num_stages):
            body = residual_unit(body, filter_list[i + 1],
                                 (1 if i == 0 else 2, 1 if i == 0 else 2),
                                 False, name='stage%d_unit%d' % (i + 1, 1),
                                 bottle_neck=bottle_neck)
            for j in range(units[i] - 1):
                body = residual_unit(body, filter_list[i + 1], (1, 1), True,
                                     name="stage%d_unit%d" % (i + 1, j + 2),
                                     bottle_neck=bottle_neck)
        # body shape: (7, 7, 512)
        # bn1 = nnvm.symbol.batch_norm(data=body, axis=3, epsilon=2e-5,
        #                              name="bn1")
        relu1 = nnvm.symbol.relu(data=body, name="relu1")
        pool1 = nnvm.symbol.global_avg_pool2d(data=relu1, layout='NHWC',
                                              name="pool1")  # (1, 1, 512)
        flat = nnvm.symbol.flatten(data=pool1)  # (512,)
        fc1 = nnvm.symbol.dense(data=flat, units=num_classes, name='fc1')
        return nnvm.symbol.softmax(data=fc1, name='softmax')

    def get_symbol(datas, num_classes, num_layers=50,
                   image_shape=(1, 224, 224, 16), **kwargs):
        (_, height, _, _) = image_shape
        if height <= 28:
            num_stages = 3
            if (num_layers - 2) % 9 == 0 and num_layers >= 164:
                per_unit = [(num_layers - 2) // 9]
                filter_list = [16, 64, 128, 256]
                bottle_neck = True
            elif (num_layers - 2) % 6 == 0 and num_layers < 164:
                per_unit = [(num_layers - 2) // 6]
                filter_list = [16, 16, 32, 64]
                bottle_neck = False
            else:
                raise ValueError(
                    "no experiments done on num_layers {}".format(num_layers))
            units = per_unit * num_stages
        else:
            # e.g. height = 224 > 28
            if num_layers >= 50:
                filter_list = [64, 256, 512, 1024, 2048]
                bottle_neck = True
            else:
                # e.g. num_layers = 18 < 50
                filter_list = [64, 64, 128, 256, 512]
                bottle_neck = False
            num_stages = 4
            if num_layers == 18:
                units = [2, 2, 2, 2]
            elif num_layers == 34:
                units = [3, 4, 6, 3]
            elif num_layers == 50:
                units = [3, 4, 6, 3]
            elif num_layers == 101:
                units = [3, 4, 23, 3]
            elif num_layers == 152:
                units = [3, 8, 36, 3]
            elif num_layers == 200:
                units = [3, 24, 36, 3]
            elif num_layers == 269:
                units = [3, 30, 48, 8]
            else:
                raise ValueError(
                    "no experiments done on num_layers {}".format(num_layers))
        return resnet(datas=datas, units=units, num_stages=num_stages,
                      filter_list=filter_list, num_classes=num_classes,
                      image_shape=image_shape, bottle_neck=bottle_neck)

    input_shape = (1, 224, 224, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 3
    z = get_symbol(datas=data, num_classes=16, num_layers=18,
                   image_shape=(1, 224, 224, 16))
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"data": input_shape},
                dtype="float32", target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"data": input_shape},
                        dtype="float32", target_host=target_host)
    print(deploy_graph.ir())
    ctx = tvm.context("nnpu", 0) if device == "nnpu" else tvm.context("llvm", 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=input_shape, low=-32,
                             high=32).astype(np.float32)
    print(a_np)
    module.set_input(data=a_np)
    ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1)
    module.run()
    out = module.get_output(0, out=tvm.nd.empty((1, 16)))
    print(out.asnumpy())
    print(deploy_graph.ir())
    print(ftimer().mean * 10)
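# A minimal driver sketch, assuming these tests are meant to be run as a
# script; the original file does not show its entry point, so the selection
# and order of tests here is illustrative only (the single-op tests are the
# cheapest to run first).
if __name__ == '__main__':
    test_relu()
    test_log()
    test_elemwise_mul()
    test_dense()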