Example #1
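These snippets are shown without their module preamble; they appear to assume imports along the following lines (the exact import path of ScheduleProcHelper is an assumption based on how it is used below):

import numpy as np
import tvm
import nnpu
from nnpu.utils import ScheduleProcHelper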
def test():
    env = nnpu.get_env()
    shape = (8, 16)
    a = tvm.placeholder(shape, env.cfg['dtype_n'], 'a')
    b = tvm.placeholder(shape, env.cfg['dtype_n'], 'b')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    dtype_w = env.cfg['dtype_w']

    k = tvm.reduce_axis((0, 16), 'k')
    dot_buf = tvm.compute((8, ), lambda i: tvm.sum(
        a_buf[i, k].astype(dtype_w) * b_buf[i, k].astype(dtype_w), k),
                          'dot_buf')
    sph.MarkScope(dot_buf)
    dot_host, dot_dram = nnpu.utils.CopyBufToH(dot_buf, 'sum', sph)

    s = tvm.create_schedule(dot_host.op)
    sph.Transform(s)

    s[dot_buf].tensorize(s[dot_buf].op.axis[0],
                         env.intrins.get('MRowDot', shape=shape, mode='inc'))

    print(nnpu.lower(s, [a, b, dot_host], simple_mode=True))
    func = nnpu.build(s, [a, b, dot_host], 'nnpu', 'llvm', name='nnpu_func')

    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))

    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(8, 16), dtype=a.dtype, low=-32, high=32)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=(8, 16), dtype=b.dtype, low=-32, high=32)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros((8, )).astype(dot_host.dtype), ctx)

    func(a_nd, b_nd, c_nd)
    #print('a = ')
    #print(a_np)
    #print('b = ')
    #print(b_np)

    print(c_nd.asnumpy())
    print('ground truth is')
    gt = np.multiply(a_np, b_np, dtype=dot_host.dtype)
    gt = np.sum(gt, axis=1)
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
Example #2
def test():
    env = nnpu.get_env()

    a = tvm.placeholder((4, 16), env.cfg['dtype_w'], 'a')
    
    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute((4, 1), lambda i, j: tvm.sum(a_buf[i,k], axis=k), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    k1 = tvm.reduce_axis((0, 16), 'k1')
    max_buf = tvm.compute((4, 1), lambda i, j: tvm.max(a_buf[i,k1], axis=k1), 'max_buf')
    sph.MarkScope(max_buf)
    max_host, max_dram = nnpu.utils.CopyBufToH(max_buf, 'max', sph)

    k2 = tvm.reduce_axis((0, 16), 'k2')
    min_buf = tvm.compute((4, 1), lambda i, j: tvm.min(a_buf[i,k2], axis=k2), 'min_buf')
    sph.MarkScope(min_buf)
    min_host, min_dram = nnpu.utils.CopyBufToH(min_buf, 'min', sph)

    # create schedule and tensorize
    s = tvm.create_schedule([c_host.op, max_host.op, min_host.op])
    sph.Transform(s)
    s[c_buf].tensorize(s[c_buf].op.axis[1], env.intrins.get('VReduceSum', mode='w'))
    s[max_buf].tensorize(s[max_buf].op.axis[1], env.intrins.get('VReduceMax', mode='w'))
    s[min_buf].tensorize(s[min_buf].op.axis[1], env.intrins.get('VReduceMin', mode='w'))

    # build
    print(nnpu.lower(s, [a, c_host, max_host, min_host], simple_mode=True))
    func = nnpu.build(s, [a, c_host, max_host, min_host], 'nnpu', 'llvm', name='nnpu_func')

    # create data and run

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low = 0, high = 64)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    c_nd = tvm.nd.array(np.zeros((4, 1)).astype(c_host.dtype), ctx)
    max_nd = tvm.nd.array(np.zeros((4, 1)).astype(c_host.dtype), ctx)
    min_nd = tvm.nd.array(np.zeros((4, 1)).astype(c_host.dtype), ctx)

    func(a_nd, c_nd, max_nd, min_nd)

    # check results
    gt = np.sum(a_np, axis=(1,), keepdims=True)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)

    np.testing.assert_allclose(max_nd.asnumpy(), np.max(a_np, axis=(1,), keepdims=True))

    np.testing.assert_allclose(min_nd.asnumpy(), np.min(a_np, axis=(1,), keepdims=True))
    print('test passed')
Example #3
def test():
    env = nnpu.get_env()
    shape = (16, 16)
    flatten_shape = (shape[0] * shape[1],)
    a = tvm.placeholder(flatten_shape, env.cfg['dtype_n'], 'a')
    b = tvm.placeholder(flatten_shape, env.cfg['dtype_n'], 'b')
    
    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)
    
    sum_buf = tvm.compute(flatten_shape, lambda i: a_buf[i] + b_buf[i], 'sum_buf')
    sph.MarkScope(sum_buf)
    sum_host, sum_dram = nnpu.utils.CopyBufToH(sum_buf, 'sum', sph)

    s = tvm.create_schedule([sum_host.op])
    sph.Transform(s)

    xo, xi = s[sum_buf].split(sum_buf.op.axis[0], 16)
    s[sum_buf].tensorize(xo, env.intrins.get('MAddM', shape=shape, mode='n'))

    print(nnpu.lower(s, [a, b, sum_host], simple_mode=True))
    func = nnpu.build(s, [a, b, sum_host], 'nnpu', 'llvm', name='nnpu_exp')

    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))

    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=flatten_shape, dtype=a.dtype, low = 0, high = 23)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=flatten_shape, dtype=b.dtype, low = 0, high = 23)    
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros(flatten_shape).astype(sum_host.dtype), ctx)

    func(a_nd, b_nd, c_nd)
    print('a = ')
    print(a_np)
    print('b = ')
    print(b_np)
    print('a + b = ')
    print(c_nd.asnumpy())
    print("numpy ground truth is")
    print(a_np + b_np)
    np.testing.assert_allclose(c_nd.asnumpy(), a_np + b_np)
Example #4
def test():
    env = nnpu.get_env()
    a = tvm.placeholder((16,16), env.cfg['dtype_n'], 'a')
    sph = ScheduleProcHelper()
    Imm = tvm.const(7, env.cfg['dtype_n'])
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    add_buf = tvm.compute((16,16), lambda i,j: Imm+a_buf[i][j], 'add_buf')
    sph.MarkScope(add_buf)
    add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph)

    dtype_w = env.cfg['dtype_w']
    mul_buf = tvm.compute((16,16), lambda i,j: a_buf[i][j].astype(dtype_w) * Imm.astype(dtype_w), 
                          'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    rsub_buf = tvm.compute((16,16), lambda i,j: Imm-a_buf[i][j], 'rsub_buf')
    sph.MarkScope(rsub_buf)
    rsub_host, rsub_dram = nnpu.utils.CopyBufToH(rsub_buf, 'rsub', sph)

    s = tvm.create_schedule([add_host.op,mul_host.op,rsub_host.op])
    sph.Transform(s)
    s[add_buf].tensorize(s[add_buf].op.axis[0], env.intrins.get('MAddI', 
                            shape=(16,16), imm_value=Imm.value, mode='n'))
    s[mul_buf].tensorize(s[mul_buf].op.axis[0], env.intrins.get('MMulI', 
                            shape=(16,16), imm_value=Imm.value, mode='inc'))
    s[rsub_buf].tensorize(s[rsub_buf].op.axis[0], env.intrins.get('ISubM', 
                            shape=(16,16), imm_value=Imm.value, mode='n'))
    print(nnpu.lower(s, [a,add_host,mul_host,rsub_host], simple_mode=True))
    func = nnpu.build(s, [a,add_host,mul_host,rsub_host], 'nnpu', 'llvm', name='nnpu_vmuli')
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(16,16), dtype=a.dtype, low = 3, high = 100)
    a_nd = tvm.nd.array(a_np, ctx)

    add_nd = tvm.nd.array(np.zeros((16,16)).astype(add_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros((16,16)).astype(mul_host.dtype), ctx)
    rsub_nd = tvm.nd.array(np.zeros((16,16)).astype(rsub_host.dtype), ctx)
    func(a_nd, add_nd,mul_nd,rsub_nd)

    print(a_nd.asnumpy())
    print('add result is: ')
    print(add_nd.asnumpy())
    np.testing.assert_allclose(add_nd.asnumpy(), a_np + Imm.value)
    print('mul result is: ')
    print(mul_nd.asnumpy())
    np.testing.assert_allclose(mul_nd.asnumpy(), a_np.astype(dtype_w) * Imm.value)
    print('rsub result is: ')
    print(rsub_nd.asnumpy())
    np.testing.assert_allclose(rsub_nd.asnumpy(), Imm.value - a_np)
    print('test passed')
Example #5
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (16, )
    bigshape = (4, 64)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']

    sph = ScheduleProcHelper()

    a = tvm.placeholder(bigshape, dtype_n, 'a')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    str_op = 'VAddMerge'
    k = tvm.reduce_axis((0, 4), 'k')
    c_buf = tvm.compute((64, ), lambda i: tvm.sum(a_buf[k, i], axis=k),
                        'c_buf')

    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    s = tvm.create_schedule(c_host.op)
    sph.Transform(s)
    #tensorize
    ko, ki = s[c_buf].split(c_buf.op.reduce_axis[0], factor=1)
    xo, xi = s[c_buf].split(c_buf.op.axis[0], factor=shape[0])
    s[c_buf].reorder(xo, ko, ki, xi)
    #s[c_buf].tensorize(ki, env.intrins.get(str_op,  mode='n'))

    print(nnpu.lower(s, [a, c_host], simple_mode=True))
    exit()
    func = nnpu.build(s, [a, c_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=bigshape, dtype=a.dtype, low=-4, high=4)
    a_nd = tvm.nd.array(a_np, ctx)

    c_nd = tvm.nd.array(np.zeros((64, ), dtype=c_host.dtype), ctx)

    func(a_nd, c_nd)
    print(str_op)
    print(c_nd.asnumpy())
    gt = np.sum(a_np, axis=0, dtype=dtype_w)
    print('ground truth=')
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
Example #6
def test():
    env = nnpu.get_env()
    shape = (16, )
    bigshape = (128, )
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    assert bigshape[0] % shape[0] == 0, 'the big vctr size is wrong'

    n_sheet = bigshape[0] // shape[0]
    sph = ScheduleProcHelper()

    a = tvm.placeholder(bigshape, dtype_n, 'a')
    b = tvm.placeholder(bigshape, dtype_n, 'b')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    strop = 'VAddV'

    c_buf = tvm.compute(bigshape, lambda *i: a_buf(*i) + b_buf(*i), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'sum', sph)

    s = tvm.create_schedule(c_host.op)
    sph.Transform(s)
    #tensorize
    xo, xi = s[c_buf].split(c_buf.op.axis[0], factor=shape[0])
    s[c_buf].reorder(xo, xi)
    s[c_buf].tensorize(xi, env.intrins.get(strop, mode='n'))

    print(nnpu.lower(s, [a, b, c_host], simple_mode=True))

    func = nnpu.build(s, [a, b, c_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=bigshape, dtype=a.dtype, low=-4, high=4)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=bigshape, dtype=b.dtype, low=-4, high=4)
    b_nd = tvm.nd.array(b_np, ctx)

    c_nd = tvm.nd.array(np.zeros(bigshape, dtype=c_host.dtype), ctx)

    func(a_nd, b_nd, c_nd)
    print(strop)
    print(c_nd.asnumpy())
    gt = a_np + b_np
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
Example #7
def test():
    env = nnpu.get_env()

    a = tvm.placeholder((4, 16), 'int16', 'a')
    b = tvm.placeholder((16, ), 'int16', 'b')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)
    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute(
        (4, 1), lambda i, j: tvm.sum(a_buf[i, k] * b_buf[k], axis=k), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    s = tvm.create_schedule(c_host.op)
    sph.Transform(s)
    print(s[c_buf])
    s[c_buf].tensorize(s[c_buf].op.axis[1], env.intrins.get('VDotV', mode='w'))

    print(nnpu.lower(s, [a, b, c_host], simple_mode=True))
    func = nnpu.build(s, [a, b, c_host], 'nnpu', 'llvm', name='nnpu_func')

    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))

    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low=0, high=64)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=(16, ), dtype=b.dtype, low=0, high=64)
    b_nd = tvm.nd.array(b_np, ctx)
    c_nd = tvm.nd.array(np.zeros((4, 1)).astype(c_host.dtype), ctx)

    func(a_nd, b_nd, c_nd)
    print(c_nd.asnumpy())
    print("numpy ground truth is")
    print(np.dot(a_np, b_np))
Example #8
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)

    a = tvm.placeholder((4, 4, 16), 'int16', 'a')
    #b = tvm.placeholder((16, ), 'int16', 'b')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    #b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    k = tvm.reduce_axis((0, 4), 'k0')
    c_buf = tvm.compute((4, 16), lambda i, j: tvm.sum(a_buf[k, i, j], axis=k),
                        'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    s = tvm.create_schedule(c_host.op)
    sph.Transform(s)
    ko, ki = s[c_buf].split(c_buf.op.reduce_axis[0], factor=1)
    s[c_buf].reorder(c_buf.op.axis[0], ko, ki, c_buf.op.axis[1])
    s[c_buf].tensorize(ki, env.intrins.get('VAddMerge', mode='w', nDim=3))

    print(nnpu.lower(s, [a, c_host], simple_mode=True))
    func = nnpu.build(s, [a, c_host], 'nnpu', 'llvm', name='nnpu_exp')

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(4, 4, 16),
                             dtype=a.dtype,
                             low=-4000,
                             high=4000)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    c_nd = tvm.nd.array(np.zeros((4, 16)).astype(c_host.dtype), ctx)

    func(a_nd, c_nd)
    print(c_nd.asnumpy())
    print("numpy ground truth is")
    gt = np.sum(a_np, axis=0)
    print(gt)
Example #9
def test():
    env = nnpu.get_env()
    a = tvm.placeholder((16, ), env.cfg['dtype_w'], 'a')
    sph = ScheduleProcHelper()
    Imm = tvm.const(5, env.cfg['dtype_w'])
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    #c_buf = tvm.compute((16, ), lambda i: tvm.select(a_buf[i]>Imm,a_buf[i],Imm), 'c_buf')
    c_buf = tvm.compute((16, ), lambda i: Imm+a_buf[i], 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    sub_buf = tvm.compute((16, ), lambda i: a_buf[i] - Imm , 'sub_buf')
    sph.MarkScope(sub_buf)
    sub_host, sub_dram = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph)

    mul_buf = tvm.compute((16, ), lambda i: a_buf[i] * Imm, 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    div_buf = tvm.compute((16, ), lambda i: a_buf[i] / Imm, 'rdiv_buf')
    sph.MarkScope(div_buf)
    div_host, div_dram = nnpu.utils.CopyBufToH(div_buf, 'rdiv', sph)

    gtm_buf = tvm.compute((16, ), lambda i: tvm.max(a_buf[i], Imm), 'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    rsub_buf = tvm.compute((16, ), lambda i: Imm-a_buf[i], 'rsub_buf')
    sph.MarkScope(rsub_buf)
    rsub_host, rsub_dram = nnpu.utils.CopyBufToH(rsub_buf, 'rsub', sph)


    s = tvm.create_schedule([c_host.op, sub_host.op, mul_host.op, div_host.op, gtm_host.op,rsub_host.op])
    sph.Transform(s)
    s[c_buf].tensorize(s[c_buf].op.axis[0], env.intrins.get('VAddI', imm_value=Imm.value,mode='w'))
    s[sub_buf].tensorize(s[sub_buf].op.axis[0], env.intrins.get('VSubI', imm_value=Imm.value,mode='w'))
    s[mul_buf].tensorize(s[mul_buf].op.axis[0], env.intrins.get('VMulI', imm_value=Imm.value,mode='w'))
    s[div_buf].tensorize(s[div_buf].op.axis[0], env.intrins.get('VDivI', imm_value=Imm.value,mode='w'))
    s[gtm_buf].tensorize(s[gtm_buf].op.axis[0], env.intrins.get('VGTMI', imm_value=Imm.value,mode='w'))
    s[rsub_buf].tensorize(s[rsub_buf].op.axis[0], env.intrins.get('ISubV', imm_value=Imm.value,mode='w'))
    print(nnpu.lower(s, [a,c_host,sub_host,mul_host,div_host,gtm_host,rsub_host], simple_mode=True))
    func = nnpu.build(s, [a,c_host,sub_host,mul_host,div_host,gtm_host,rsub_host], 'nnpu', 'llvm', name='nnpu_vmuli')

    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))

    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))
    
    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(16, ), dtype=a.dtype, low = 3, high = 122)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    c_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    sub_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    div_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    gtm_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    rsub_nd = tvm.nd.array(np.zeros((16, )).astype(c_host.dtype), ctx)
    func(a_nd, c_nd, sub_nd, mul_nd, div_nd, gtm_nd,rsub_nd)
    print('a = ')
    print(a_nd.asnumpy())
    print('a + {0} = '.format(Imm.value))
    print(c_nd.asnumpy())
    print('numpy ground truth =')
    gt = a_np + Imm.value
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)

    print('a - {0} = '.format(Imm.value))
    print(sub_nd.asnumpy())
    np.testing.assert_allclose(sub_nd.asnumpy(), a_np - Imm.value)

    print('a * {0} = '.format(Imm.value))
    print(mul_nd.asnumpy())
    np.testing.assert_allclose(mul_nd.asnumpy(), a_np * Imm.value)

    print('a / {0} = '.format(Imm.value))
    print(div_nd.asnumpy())
    np.testing.assert_allclose(div_nd.asnumpy(), a_np / Imm.value)

    print('a > {0} ? a : {0} = '.format(Imm.value))
    print(gtm_nd.asnumpy())
    # np.testing.assert_allclose(gtm_nd.asnumpy(), np.maximum(a_np, Imm.value))
    print('{0} - a = '.format(Imm.value))
    print(rsub_nd.asnumpy())
    np.testing.assert_allclose(rsub_nd.asnumpy(), Imm.value-a_np)
    print('test passed')
Example #10
def test():
    env = nnpu.get_env()

    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    shape = (4, 16)
    a = tvm.placeholder(shape, dtype_n, 'a')
    b = tvm.placeholder((16, ), dtype_n, 'b')

    sph = ScheduleProcHelper()

    a_buf, _ = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, _ = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    sum_buf = tvm.compute(shape, lambda i, j: a_buf[i, j] + b_buf[j],
                          'sum_buf')
    sph.MarkScope(sum_buf)
    sum_host, _ = nnpu.utils.CopyBufToH(sum_buf, 'sum', sph)

    sub_buf = tvm.compute(shape, lambda i, j: a_buf[i, j] - b_buf[j],
                          'sub_buf')
    sph.MarkScope(sub_buf)
    sub_host, _ = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph)

    mul_buf = tvm.compute(
        shape,
        lambda i, j: a_buf[i, j].astype(dtype_w) * b_buf[j].astype(dtype_w),
        'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, _ = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    s = tvm.create_schedule([sum_host.op, sub_host.op, mul_host.op])
    sph.Transform(s)
    s[sum_buf].pragma(sum_buf.op.axis[0], 'nnpu.vector',
                      str({
                          'code': 'matrix-vector',
                          'shape': shape
                      }))
    s[sub_buf].pragma(sub_buf.op.axis[0], 'nnpu.vector',
                      str({
                          'code': 'matrix-vector',
                          'shape': shape
                      }))
    s[mul_buf].pragma(mul_buf.op.axis[0], 'nnpu.vector',
                      str({
                          'code': 'matrix-vector',
                          'shape': shape
                      }))

    print(nnpu.lower(s, [a, b, sum_host, sub_host, mul_host],
                     simple_mode=True))
    func = nnpu.build(s, [a, b, sum_host, sub_host, mul_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')

    print('------------------- device module 1 TVM IR: ')
    print(func.imported_modules[0].get_source('ir'))

    print('------------------- device module 1 uop code: ')
    print(func.imported_modules[0].get_source('uop'))

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low=0, high=64)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=(16, ), dtype=b.dtype, low=0, high=64)
    b_nd = tvm.nd.array(b_np, ctx)
    sum_nd = tvm.nd.array(np.zeros(shape).astype(sum_host.dtype), ctx)
    sub_nd = tvm.nd.array(np.zeros(shape).astype(sub_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros(shape).astype(mul_host.dtype), ctx)

    func(a_nd, b_nd, sum_nd, sub_nd, mul_nd)
    gt = a_np + b_np
    np.testing.assert_allclose(sum_nd.asnumpy(), gt)

    gt = a_np - b_np
    np.testing.assert_allclose(sub_nd.asnumpy(), gt)

    gt = a_np.astype(dtype_w) * b_np
    np.testing.assert_allclose(mul_nd.asnumpy(), gt)
    print('test passed')
Example #11
def test():
    env = nnpu.get_env()

    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    shape = (4, 64)
    # nvctr_unit = env.cfg['vector_unit']['size']
    nvctr_unit = 32
    # assert shape[0] % nvctr_unit == 0, 'error'

    a = tvm.placeholder(shape, dtype_n, 'a')
    b = tvm.placeholder(shape, dtype_n, 'b')

    sph = ScheduleProcHelper()

    b_scope = 'buffer0'

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph, dst_scope=b_scope)

    c_buf = tvm.compute(shape, lambda *i: a_buf(*i) + b_buf(*i), 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    mul_buf = tvm.compute(
        shape,
        lambda *i: a_buf(*i).astype(dtype_w) * b_buf(*i).astype(dtype_w),
        'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    gtm_buf = tvm.compute(shape, lambda *i: tvm.max(a_buf(*i), b_buf(*i)),
                          'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    s = tvm.create_schedule([c_host.op, mul_host.op, gtm_host.op])
    sph.Transform(s)

    # x = s[c_buf].fuse(*c_buf.op.axis)
    # xo, xi = s[c_buf].split(x, factor=nvctr_unit)
    params = dict()
    params['code'] = 'binary'
    params['size'] = nvctr_unit
    x = s[c_buf].fuse(*c_buf.op.axis)
    xo, xi = s[c_buf].split(x, factor=nvctr_unit)
    s[c_buf].pragma(xi, 'nnpu.vector', str(params))

    x = s[mul_buf].fuse(*mul_buf.op.axis)
    xo, xi = s[mul_buf].split(x, factor=nvctr_unit)
    s[mul_buf].pragma(xi, 'nnpu.vector', str(params))

    x = s[gtm_buf].fuse(*gtm_buf.op.axis)
    xo, xi = s[gtm_buf].split(x, factor=nvctr_unit)
    s[gtm_buf].pragma(xi, 'nnpu.vector', str(params))

    print(tvm.lower(s, [a, b, c_host, mul_host, gtm_host], simple_mode=True))
    print(nnpu.lower(s, [a, b, c_host, mul_host, gtm_host], simple_mode=True))
    # exit()
    func = nnpu.build(s, [a, b, c_host, mul_host, gtm_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_exp')
    print('------------------- device module 1 IR: ')
    print(func.imported_modules[0].get_source('ir'))

    print('------------------- device module 1 micro code: ')
    print(func.imported_modules[0].get_source('uop'))

    # exit()

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=shape, dtype=a.dtype, low=-64, high=63)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=shape, dtype=b.dtype, low=-64, high=63)
    b_nd = tvm.nd.array(b_np, ctx)

    c_nd = tvm.nd.array(np.zeros(shape).astype(c_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros(shape).astype(mul_host.dtype), ctx)
    gtm_nd = tvm.nd.array(np.zeros(shape).astype(gtm_host.dtype), ctx)

    # print('------------------- device module 1 llvm IR: ')
    # print(func.imported_modules[0].get_source('ll'))

    # print('------------------- device module 1 asm code: ')
    # print(func.imported_modules[0].get_source('asm'))

    func(a_nd, b_nd, c_nd, mul_nd, gtm_nd)

    gt = a_np + b_np
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
    gt = np.multiply(a_np, b_np, dtype=mul_host.dtype)
    np.testing.assert_allclose(mul_nd.asnumpy(), gt)
    gt = np.maximum(a_np, b_np)
    np.testing.assert_allclose(gtm_nd.asnumpy(), gt)
    print('test passed!!')
Example #12
    core_extent = 4
    xh, xw = s[res_host].op.axis
    xwo, xwi = s[res_host].split(xw, nparts=core_extent)
    s[res_host].reorder(xwo, xh, xwi)
    s[res_host].pragma(xh, env.dma_copy_from_buf)
    # compute_at
    s[a_buf].compute_at(s[res_host], xwo)
    s[b_buf].compute_at(s[res_host], xwo)
    s[res_acc].compute_at(s[res_host], xwo)
    s[res_buf].compute_at(s[res_host], xwo)
    # thread bind
    s[res_host].bind(xwo, tvm.thread_axis('coreIdx'))

    print(nnpu.lower(s, [a, b, res_host], simple_mode=True))

    func = nnpu.build(s, [a, b, res_host], 'nnpu', 'llvm', 'nnpu_func')
    # print('------------------- device module 1 asm code: ')
    # print(func.imported_modules[0].get_source('ll'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=shape1_tiled,
                             dtype=a.dtype,
                             low=-16,
                             high=16)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=shape2_tiled,
                             dtype=b.dtype,
                             low=-16,
                             high=16)
    b_nd = tvm.nd.array(b_np, ctx)
Example #13
def test():
    env = nnpu.get_env()

    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder((64, ), dtype_n, 'a')
    b = tvm.placeholder((1, ), dtype_n, 'b')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

    c_buf = tvm.compute((64, ), lambda i: a_buf[i] + b_buf[0], 'c_buf')
    sph.MarkScope(c_buf)
    c_host, _ = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    sub_buf = tvm.compute((64, ), lambda i: a_buf[i] - b_buf[0], 'sub_buf')
    sph.MarkScope(sub_buf)
    sub_host, _ = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph)

    rsub_buf = tvm.compute((64, ), lambda i: b_buf[0] - a_buf[i], 'rsub_buf')
    sph.MarkScope(rsub_buf)
    rsub_host, _ = nnpu.utils.CopyBufToH(rsub_buf, 'rsub', sph)

    mul_buf = tvm.compute(
        (64, ), lambda i: a_buf[i].astype(dtype_w) * b_buf[0].astype(dtype_w),
        'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, _ = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    div_buf = tvm.compute((64, ), lambda i: a_buf[i] / b_buf[0], 'div_buf')
    sph.MarkScope(div_buf)
    div_host, _ = nnpu.utils.CopyBufToH(div_buf, 'div', sph)

    rdiv_buf = tvm.compute((64, ), lambda i: b_buf[0] / a_buf[i], 'rdiv_buf')
    sph.MarkScope(rdiv_buf)
    rdiv_host, _ = nnpu.utils.CopyBufToH(rdiv_buf, 'rdiv', sph)

    gtm_buf = tvm.compute((64, ), lambda i: tvm.max(a_buf[i], b_buf[0]),
                          'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    s = tvm.create_schedule([
        c_host.op, sub_host.op, mul_host.op, rsub_host.op, div_host.op,
        rdiv_host.op, gtm_host.op
    ])
    sph.Transform(s)
    xo, xi = s[c_buf].split(c_buf.op.axis[0], 16)
    s[c_buf].tensorize(xi, env.intrins.get('VAddS', mode='n'))
    xo, xi = s[sub_buf].split(sub_buf.op.axis[0], 16)
    s[sub_buf].tensorize(xi, env.intrins.get('VSubS', mode='n'))
    xo, xi = s[rsub_buf].split(rsub_buf.op.axis[0], 16)
    s[rsub_buf].tensorize(xi, env.intrins.get('SSubV', mode='n'))
    xo, xi = s[mul_buf].split(mul_buf.op.axis[0], 16)
    s[mul_buf].tensorize(xi, env.intrins.get('VMulS', mode='inc'))
    xo, xi = s[div_buf].split(div_buf.op.axis[0], 16)
    s[div_buf].tensorize(xi, env.intrins.get('VDivS', mode='n'))
    xo, xi = s[rdiv_buf].split(rdiv_buf.op.axis[0], 16)
    s[rdiv_buf].tensorize(xi, env.intrins.get('SDivV', mode='n'))
    xo, xi = s[gtm_buf].split(gtm_buf.op.axis[0], 16)
    s[gtm_buf].tensorize(xi, env.intrins.get('VGTMS', mode='n'))

    print(
        nnpu.lower(s, [
            a, b, c_host, sub_host, mul_host, rsub_host, div_host, rdiv_host,
            gtm_host
        ],
                   simple_mode=True))
    func = nnpu.build(s, [
        a, b, c_host, sub_host, mul_host, rsub_host, div_host, rdiv_host,
        gtm_host
    ],
                      'nnpu',
                      'llvm',
                      name='nnpu_exp')

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(64, ), dtype=a.dtype, low=1, high=63)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=(1, ), dtype=b.dtype, low=2, high=31)
    b_nd = tvm.nd.array(b_np, ctx)

    c_nd = tvm.nd.array(np.zeros((64, )).astype(c_host.dtype), ctx)
    sub_nd = tvm.nd.array(np.zeros((64, )).astype(sub_host.dtype), ctx)
    rsub_nd = tvm.nd.array(np.zeros((64, )).astype(rsub_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros((64, )).astype(mul_host.dtype), ctx)
    div_nd = tvm.nd.array(np.zeros((64, )).astype(div_host.dtype), ctx)
    rdiv_nd = tvm.nd.array(np.zeros((64, )).astype(rdiv_host.dtype), ctx)
    gtm_nd = tvm.nd.array(np.zeros((64, )).astype(gtm_host.dtype), ctx)

    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))

    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    func(a_nd, b_nd, c_nd, sub_nd, mul_nd, rsub_nd, div_nd, rdiv_nd, gtm_nd)
    print('a = ')
    print(a_np)
    print('b = ')
    print(b_np)
    print('a + b =')
    print(c_nd.asnumpy())
    print('numpy ground truth =')
    gt = a_np + b_np
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
    print('a - b =')
    print(sub_nd.asnumpy())
    np.testing.assert_allclose(sub_nd.asnumpy(), a_np - b_np)

    print('b - a =')
    print(rsub_nd.asnumpy())
    np.testing.assert_allclose(rsub_nd.asnumpy(), b_np - a_np)

    print('a * b =')
    print(mul_nd.asnumpy())
    np.testing.assert_allclose(mul_nd.asnumpy(), a_np * b_np.astype(dtype_w))

    print('a / b =')
    print(div_nd.asnumpy())
    # NumPy integer division rounds down (floor), while C-style division truncates toward zero, so this check is skipped.
    #np.testing.assert_allclose(div_nd.asnumpy(), a_np / b_np)

    print('b / a =')
    print(rdiv_nd.asnumpy())

    print('max(a, b)=')
    print(gtm_nd.asnumpy())
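The division-rounding caveat noted in the comment above can be reproduced with a small standalone NumPy check (illustration only, not part of the original test):

import numpy as np

# '//' floors toward negative infinity; np.trunc reproduces the
# truncate-toward-zero behaviour of C-style integer division.
x = np.array([7, -7], dtype=np.int16)
d = np.int16(2)
print(x // d)                            # [ 3 -4]
print(np.trunc(x / d).astype(np.int16))  # [ 3 -3]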
Example #14
    co, ci = s[pooling_host].split(c, dim_c)
    s[pooling_host].reorder(x_vt, xo, yo, co, xi, yi, ci)
    s[pooling_buf].compute_at(s[pooling_host], co)
    s[pooling_host].bind(x_vt, tvm.thread_axis('cthread'))

    # pragma
    s[data_buf].pragma(data_buf.op.axis[0], env.dma_copy_to_buf)
    s[pooling_host].pragma(xi, env.dma_copy_from_buf)
    #==================================#
    # ------ this ends the scheduling ------
    #==================================#
    # print(tvm.lower(s, [data, pooling_host], simple_mode=True))
    print(nnpu.lower(s, [data, pooling_host], simple_mode=True))
    # exit()
    func = nnpu.build(s, [data, pooling_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    data_np = np.random.randint(size=in_shape,
                                dtype=data.dtype,
                                low=-128,
                                high=127)
    data_nd = tvm.nd.array(data_np, ctx)

    res_nd = tvm.nd.array(np.zeros(out_shape, dtype=pooling_host.dtype), ctx)

    func(data_nd, res_nd)

    gt = max_pooling(in_shape, out_shape, cell_shape, data_np, data.dtype)
    # print(gt)
Example #15
    xo, xi = s[prod_buf].split(prod_buf.op.axis[0], factor=gemm_shape[0])
    ro, ri = s[prod_buf].split(prod_buf.op.reduce_axis[0], factor=factor)
    #ri = prod_buf.op.reduce_axis[0]
    s[prod_buf].reorder(xo, ro, xi, ri)
    print(tvm.lower(s, [a, b, out_host], simple_mode=True))
    s[prod_buf].tensorize(
        xi,
        env.intrins.get('GEMM',
                        shape=gemm_shape,
                        reduce=True,
                        mode='inc',
                        scope_out='acc'))
    print(nnpu.lower(s, [a, b, out_host], simple_mode=True))

    func = nnpu.build(s, [a, b, out_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=shape, dtype=a.dtype, low=-32, high=32)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=(shape[1], ),
                             dtype=b.dtype,
                             low=-32,
                             high=32)
    b_nd = tvm.nd.array(b_np, ctx)

    out_nd = tvm.nd.array(np.zeros(shape[0], dtype=out_host.dtype), ctx)

    func(a_nd, b_nd, out_nd)

    print(out_nd.asnumpy())
Example #16
    s[pooling_buf].compute_at(s[res_host], pwi)

    # add copy pragma.
    s[feature_buf].pragma(feature_buf.op.axis[-1], env.dma_copy_to_buf)
    s[kernel_buf].pragma(kernel_buf.op.axis[-1], env.dma_copy_to_buf)
    s[res_host].pragma(oci, env.dma_copy_from_buf)
    s[conv_buf].pragma(conv_buf.op.axis[1], env.copy_acc2buf)
    s[conv].pragma(s[conv].leaf_iter_vars[-2], env.scratchpad_copy)
    #==================================#
    # ------ this ends the scheduling ------
    #==================================#

    print(nnpu.lower(s, [feature, kernel, res_host], simple_mode=True))

    # func = tvm.build(s, [feature, kernel, res_host], 'llvm', 'llvm', 'nnpu_conv')
    func = nnpu.build(s, [feature, kernel, res_host], 'nnpu', 'llvm', 'nnpu_conv')
    # print('------------------- device module 1 asm code: ')
    # print(func.imported_modules[0].get_source('asm'))
    print('------------------- device module 1 TVM IR: ')
    print(func.imported_modules[0].get_source('ir'))
    print('------------------- device module 1 uop: ')
    print(func.imported_modules[0].get_source('uop'))
    # exit(0)

    ctx = tvm.nd.TVMContext(13, 0)
    fm_np = np.random.randint(size=shape, dtype=feature.dtype, low = -16, high = 16)
    fm_nd = tvm.nd.array(fm_np, ctx)

    k_np = np.random.randint(size=kshape, dtype=kernel.dtype, low = -16, high = 16)
    k_nd = tvm.nd.array(k_np, ctx)
Example #17
def test():
    with ScheduleProcHelper():
        env = nnpu.get_env()
        # nnpu.set_dump(True)

        dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
        shape = (48, )
        nvctr_unit = env.cfg['vector_unit']['size']
        assert shape[0] % nvctr_unit == 0, 'error'

        a = tvm.placeholder(shape, dtype_n, 'a')
        b = tvm.placeholder(shape, dtype_n, 'b')

        sph = ScheduleProcHelper.current

        a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
        b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)

        c_buf = tvm.compute(shape, lambda i: a_buf[i] + b_buf[i], 'c_buf')
        sph.MarkScope(c_buf)
        c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

        plus2 = tvm.compute(shape, lambda i: c_host[i] + tvm.const(2, 'int8'),
                            'plus2')

        s = tvm.create_schedule([plus2.op])
        sph.Transform(s)

        xo, xi = s[c_buf].split(c_buf.op.axis[0], factor=nvctr_unit)
        s[c_buf].tensorize(xi, env.intrins.get('VAddV', mode='n'))

        print(nnpu.lower(s, [a, b, plus2], simple_mode=True))
        # exit()
        func = nnpu.build(s, [a, b, plus2], 'nnpu', 'llvm', name='nnpu_exp')
        # exit()

        ctx = tvm.nd.TVMContext(13, 0)

        # print('------------------- host module llvm IR: ')
        # print(func.get_source('ll'))
        print('------------------- device module 1 llvm IR: ')
        print(func.imported_modules[0].get_source('ll'))
        print('------------------- device module 1 asm code: ')
        print(func.imported_modules[0].get_source('asm'))

        for i in range(0, 5):
            a_np = np.random.randint(size=shape,
                                     dtype=a.dtype,
                                     low=-64,
                                     high=63)
            #a_np = np.random.random(size=shape).astype(a_host.dtype)
            a_nd = tvm.nd.array(a_np, ctx)

            b_np = np.random.randint(size=shape,
                                     dtype=b.dtype,
                                     low=-64,
                                     high=63)
            b_nd = tvm.nd.array(b_np, ctx)

            # c_nd = tvm.nd.array(np.zeros(shape).astype(c_host.dtype), ctx)
            plus2_nd = tvm.nd.array(np.zeros(shape).astype(plus2.dtype), ctx)

            # exit()
            func(a_nd, b_nd, plus2_nd)

            print('a = ')
            print(a_np)
            print('b = ')
            print(b_np)
            print('a + b + 2 =')
            print(plus2_nd.asnumpy())
            print("numpy ground truth is")
            gt = a_np + b_np + 2
            print(gt)
            np.testing.assert_allclose(plus2_nd.asnumpy(), gt)
        print('test passed!!')
Example #18
    s[out_buf].pragma(xi, env.copy_acc2buf)

    # split and tensorize VAddV.
    nvctr_unit = env.cfg['vector_unit']['size']
    xo, xi = s[res_buf].split(res_buf.op.axis[0], factor=nvctr_unit)
    s[res_buf].tensorize(xi, env.intrins.get('VAddV', mode='w'))
    #==================================#
    # ------ this ends the scheduling ------
    #==================================#

    # with nnpu.build_config(dump_pass_ir=True):
    with nnpu.build_config():
        print(nnpu.lower(s, [weight, data, bias, res_host], simple_mode=True))

    func = nnpu.build(s, [weight, data, bias, res_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')
    print('------------------- device module 1 TVM IR: ')
    print(func.imported_modules[0].get_source('ir'))
    print('------------------- device module 1 uop: ')
    print(func.imported_modules[0].get_source('uop'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=weight_shape,
                             dtype=weight.dtype,
                             low=-32,
                             high=32)
    a_nd = tvm.nd.array(a_np, ctx)
    d_np = np.random.randint(size=data_shape,
                             dtype=data.dtype,
                             low=-32,
Example #19
    s[sum1].reorder(xblock, xrow, xcol)
    s[sum1].tensorize(
        xrow,
        env.intrins.get('MReduceSumRow',
                        shape=(nRow, factor),
                        scope_out='acc',
                        mode='w'))

    s[sum2].tensorize(sum2.op.reduce_axis[0],
                      env.intrins.get('VReduceSum', shape=(8, ), mode='w'))

    xo, xi = s[softmax].split(softmax.op.axis[0], 16)
    s[softmax].tensorize(xi, env.intrins.get('VDivS', mode='w'))

    print(nnpu.lower(s, [a, softmax_host], simple_mode=True))
    func = nnpu.build(s, [a, softmax_host], 'nnpu', 'llvm', 'nnpu_func')
    print('------------------- device module 1 IR: ')
    print(func.imported_modules[0].get_source('ir'))

    print('------------------- device module 1 micro code: ')
    print(func.imported_modules[0].get_source('uop'))
    # exit()

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.random(shape).astype(a.dtype) * 2
    a_nd = tvm.nd.array(a_np, ctx)

    # sigmoid_nd = tvm.nd.array(np.zeros(shape, dtype=sigmoid_host.dtype), ctx)
    softmax_nd = tvm.nd.array(np.zeros(shape, dtype=softmax_host.dtype), ctx)

    func(a_nd, softmax_nd)
Example #20
def test():
    env = nnpu.get_env()
    nnpu.set_dump(False)

    #==================================#
    # ------ first define shapes ------
    #==================================#
    
    # input data layout: HWC
    in_shape = (32, 32, 128)
    # pooling window size, height == width.
    cell_shape = 4
    # in this demo we don't do padding, so the input height and width must be divisible by the pooling window size.
    assert in_shape[0] % cell_shape == 0, 'error'
    assert in_shape[1] % cell_shape == 0, 'error'
    nvctr_unit = env.cfg['vector_unit']['size']
    assert in_shape[2] % nvctr_unit == 0, 'channel not divisible by vector unit size'

    out_shape = (in_shape[0] // cell_shape,in_shape[1] // cell_shape,in_shape[2])
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    
    sph = ScheduleProcHelper()
    str_op = 'VGTMMerge'

    #=================================================================#
    # ------ after all shapes defined, begin compute describing. ------
    #=================================================================#
    a = tvm.placeholder(in_shape, dtype_w, 'a')
    # first copy to scratchpad.
    a_buf, _1 = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    # stage 1, find the maximum pixel in every pooling window.
    # the extent of two reduction axes are sizes of pooling window.
    k1 = tvm.reduce_axis((0,cell_shape), 'k1')
    k2 = tvm.reduce_axis((0,cell_shape), 'k2')
    pooling_buf = tvm.compute(out_shape, 
                        lambda i,j,k: 
                         tvm.max(a_buf[i * cell_shape + k1, j * cell_shape + k2, k],
                                 axis=[k1, k2]),
                       'pooling_buf')
    sph.MarkScope(pooling_buf, 'buffer1')
    
    # copy back to host.    
    step2_host, step2_dram = nnpu.utils.CopyBufToH(pooling_buf, 'pooling',sph)
    # ------ this ends the computation description. ------

    #==================================#
    # ------ begin scheduling ------
    #==================================#
    s = tvm.create_schedule(step2_host.op)    
    sph.Transform(s)

    #tensorize
    i, j, k = pooling_buf.op.axis
    k1, k2 = pooling_buf.op.reduce_axis
    # split the reduce_axis by factor 1, to produce a dummy reduce axis. 
    # this is a trick to enable tensorize, due to limitation of tvm's tensorize pattern matcher.
    ko, ki = s[pooling_buf].split(k2, factor=1)
    xo, xi = s[pooling_buf].split(k, factor=16)
    # reorder axes.
    # put xo right before ki to eliminate the memory dependency between two consecutive VGTMV instructions
    s[pooling_buf].reorder( i, j, k1, ko, xo, ki, xi)
    s[pooling_buf].tensorize(ki, env.intrins.get(str_op, scope_out='buffer1', mode='w'))
    # unroll
    # s[pooling_buf].unroll(ko)
    # s[pooling_buf].unroll(xo)
    #==================================#
    # ------ this ends the scheduling ------
    #==================================#

    print(nnpu.lower(s, [a, step2_host], simple_mode=True))
    # exit()
    func = nnpu.build(s, [a, step2_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=in_shape, dtype=a.dtype, low = -128, high = 127)
    a_nd = tvm.nd.array(a_np, ctx)

    c_nd = tvm.nd.array(np.zeros(out_shape, dtype=step2_host.dtype), ctx)

    func(a_nd, c_nd)
    # print("pooling-max")
    # print(c_nd.asnumpy())
    
    # print("nppooling-max")
    gt = max_pooling(in_shape, out_shape, cell_shape, a_np, a.dtype)
    # print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)
    print('test passed')
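The max_pooling ground-truth helper used here (and in Example #14) is not included in these snippets. A plausible NumPy reference for HWC input and non-overlapping square windows could look like the sketch below; it is an assumption, not the original helper:

import numpy as np

def max_pooling(in_shape, out_shape, cell_shape, data, dtype):
    # HWC layout, per-channel maximum over non-overlapping
    # cell_shape x cell_shape windows.
    out = np.zeros(out_shape, dtype=dtype)
    for i in range(out_shape[0]):
        for j in range(out_shape[1]):
            window = data[i * cell_shape:(i + 1) * cell_shape,
                          j * cell_shape:(j + 1) * cell_shape, :]
            out[i, j, :] = window.max(axis=(0, 1))
    return out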
Example #21
    vector_unit_size = 32
    xo, xi = s[exp_buf].split(exp_buf.op.axis[0], vector_unit_size)
    s[exp_buf].tensorize(xi, env.intrins.get('VExp', mode='w', size=vector_unit_size))

    xo, xi = s[log_buf].split(log_buf.op.axis[0], vector_unit_size)
    s[log_buf].tensorize(xi, env.intrins.get('VLog', mode='w', size=vector_unit_size))

    xo, xi = s[tanh_buf].split(tanh_buf.op.axis[0], vector_unit_size)
    s[tanh_buf].tensorize(xi, env.intrins.get('VTanh', mode='w', size=vector_unit_size))

    xo, xi = s[sigmoid_buf].split(sigmoid_buf.op.axis[0], vector_unit_size)
    s[sigmoid_buf].tensorize(xi, env.intrins.get('VSigmoid', mode='w', size=vector_unit_size))

    print(nnpu.lower(s, [a, exp, log, tanh, sigmoid], simple_mode=True))

    func = nnpu.build(s, [a, exp, log, tanh, sigmoid], 'nnpu', 'llvm', 'nnpu_func')
    print('------------------- device module 1 IR: ')
    print(func.imported_modules[0].get_source('ir'))

    print('------------------- device module 1 micro code: ')
    print(func.imported_modules[0].get_source('uop'))
    # exit()

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.random(shape).astype(a.dtype) * 2
    a_nd = tvm.nd.array(a_np, ctx)

    exp_nd = tvm.nd.array(np.zeros(shape, dtype=exp.dtype), ctx)
    log_nd = tvm.nd.array(np.zeros(shape, dtype=log.dtype), ctx)
    tanh_nd = tvm.nd.array(np.zeros(shape, dtype=tanh.dtype), ctx)
    sigmoid_nd = tvm.nd.array(np.zeros(shape, dtype=sigmoid.dtype), ctx)
Example #22
    s[out_host].pragma(yi, env.dma_copy_from_buf)

    # bind to virtual thread
    # s[out_host].bind(by, tvm.thread_axis("cthread"))

    # compute_at
    s[out_acc].compute_at(s[out_host], xo)
    # s[out_buf].compute_at(s[out_host], xi)
    s[res_buf].compute_at(s[out_host], xi)
    # s[bias_buf].compute_at(s[out_host], by)

    print(tvm.lower(s, [a, b, bias, out_host], simple_mode=True))
    # exit()
    print(nnpu.lower(s, [a, b, bias, out_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, b, bias, out_host], 'nnpu', 'llvm', 'nnpu_func')
    print('------------------- device module 1 TVM IR: ')
    print(func.imported_modules[0].get_source('ir'))
    print('------------------- device module 1 uop: ')
    print(func.imported_modules[0].get_source('uop'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=shape1, dtype=a.dtype, low=-16, high=16)
    a_nd = tvm.nd.array(a_np, ctx)
    b_np = np.random.randint(size=shape2, dtype=b.dtype, low=-16, high=16)
    b_nd = tvm.nd.array(b_np, ctx)
    bias_np = np.random.randint(size=(shape2[0], ),
                                dtype=bias.dtype,
                                low=-128,
                                high=127)
    # bias_np = np.zeros((shape2[1], ), dtype=bias.dtype)
Example #23
    #==================================#
    # ------ begin scheduling ------
    #==================================#
    s = nnpu.create_schedule([tile_host.op])

    # since all operations are scratchpad copy, all we need to do is pragma.
    # this is done by the helper functions, so nothing to do here.
    
    #==================================#
    # ------ this ends the scheduling ------
    #==================================#

    print(tvm.lower(s, [a, tile_host], simple_mode=True))
    print(nnpu.lower(s, [a, tile_host], simple_mode=True))
    func = nnpu.build(s, [a, tile_host], 'nnpu', 'llvm', name='nnpu_func')
    print('------------------- device module 1 TVM IR: ')
    print(func.imported_modules[0].get_source('ir'))
    print('------------------- device module 1 uop: ')
    print(func.imported_modules[0].get_source('uop'))
    # exit()

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(8, 8), dtype=a.dtype, low = -10000, high = 10000)
    a_nd = tvm.nd.array(a_np, ctx)

    #b_np = np.random.randint(size=(4, 32), dtype=b.dtype, low = -10000, high = 10000)
    #b_nd = tvm.nd.array(b_np, ctx)

    re_nd = tvm.nd.array(np.zeros((2, 2, 4, 4), dtype=tile_host.dtype), ctx)
    
Example #24
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 16)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    print('a host ' + str(a_host))
    a = tvm.compute(shape, lambda *i: a_host(*i), name='a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf')
    b_buf = tvm.compute(
        shape,
        lambda i, j: tvm.log(a_buf[i, j].astype(env.cfg['dtype_w'])),
        name='b_buf')
    b = tvm.compute(shape, lambda *i: b_buf(*i), name='b')
    b_host = tvm.compute(shape, lambda *i: b(*i), name='b_host')

    s = tvm.create_schedule(b_host.op)

    # mark variable scopes
    s[a].set_scope(env.dram_scope)
    s[b].set_scope(env.dram_scope)

    s[a_buf].set_scope(env.uni_scratchpad_scope)
    s[b_buf].set_scope(env.uni_scratchpad_scope)

    # print(dir(s[b].op.body))

    # mark compiler pragmas
    s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma)
    s[b_host].pragma(s[b_host].op.axis[0], env.dma_copy_pragma)

    s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls)
    s[b].pragma(s[b].op.axis[0], env.scratchpad_ls)

    s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0])

    # tensorize
    s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VLOG',
                                                            mode='inc'))

    # build
    print(tvm.lower(s, [a_host, b_host], simple_mode=True))
    print(nnpu.lower(s, [a_host, b_host], simple_mode=True))
    #exit()
    func = nnpu.build(s, [a_host, b_host], 'nnpu', 'llvm', name='nnpu_log')

    print('function built: ')
    #print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)  # NNPU device context
    print('device context exists:')
    print(ctx.exist)
    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=1, high=20)
    a_nd = tvm.nd.array(a_np, ctx)
    b_nd = tvm.nd.array(np.zeros(shape).astype(b_host.dtype), ctx)

    # run
    func(a_nd, b_nd)

    print('run finished')

    b_np = b_nd.asnumpy()
    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('ground truth =')
    gt = np.log(a_np, dtype=b_host.dtype)
    print(gt)
    np.testing.assert_allclose(b_np, gt)
Example #25
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 2, 16)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, 'a')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    k = tvm.reduce_axis((0, 2), 'k')
    add_buf = tvm.compute(
        (2, 16), lambda i, j: tvm.sum(a_buf[k, i, j], axis=k), 'add_buf')
    sph.MarkScope(add_buf)
    add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph)

    k1 = tvm.reduce_axis((0, 2), 'k1')
    mul_buf = tvm.compute(
        (2, 16), lambda i, j: tvm.sum(a_buf[k1, i, j], axis=k1), 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    s = tvm.create_schedule([add_host.op, mul_host.op])
    sph.Transform(s)

    ko, ki = s[add_buf].split(add_buf.op.reduce_axis[0], factor=1)
    s[add_buf].reorder(ko, ki, *(s[add_buf].op.axis))
    s[add_buf].tensorize(ki, env.intrins.get('MAddMerge',
                                             shape=shape,
                                             mode='w'))

    ko1, ki1 = s[mul_buf].split(mul_buf.op.reduce_axis[0], factor=1)
    s[mul_buf].reorder(ko1, ki1, *(s[mul_buf].op.axis))
    s[mul_buf].tensorize(ki1,
                         env.intrins.get('MMulMerge', shape=shape, mode='w'))

    print(nnpu.lower(s, [a, add_host, mul_host], simple_mode=True))

    func = nnpu.build(s, [a, add_host, mul_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')
    #exit()
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(2, 2, 16), dtype=a.dtype, low=-16, high=16)
    a_nd = tvm.nd.array(a_np, ctx)

    add_nd = tvm.nd.array(np.zeros((2, 16)).astype(add_host.dtype), ctx)

    mul_nd = tvm.nd.array(np.zeros((2, 16)).astype(mul_host.dtype), ctx)

    func(a_nd, add_nd, mul_nd)

    print('a = ')
    print(a_np)
    print('reduce sum row = ')
    print(add_nd.asnumpy())
    print('ground truth is: ')
    gt = np.sum(a_np, axis=0)
    print(gt)
    np.testing.assert_allclose(add_nd.asnumpy(), gt)

    print('reduce mul row = ')
    print(mul_nd.asnumpy())
    gt = np.multiply.reduce(a_np, axis=0, dtype=a.dtype)
    print(gt)
    np.testing.assert_allclose(mul_nd.asnumpy(), gt)
Example #26
def test():
    env = nnpu.get_env()

    shape = (16, 16)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    a = tvm.compute(shape, lambda *i: a_host(*i), name='a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf')

    vctr_shape = (1, 16)
    b_host = tvm.placeholder(vctr_shape, env.cfg['dtype_n'], 'b_host')
    b = tvm.compute(vctr_shape, lambda *i: b_host(*i), name='b')
    b_buf = tvm.compute(vctr_shape, lambda *i: b(*i), name='b_buf')

    dtype_w = env.cfg['dtype_w']

    mul_shape = (1, 16)
    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute(
        mul_shape, lambda i, j: tvm.sum(
            b_buf[i, k].astype(dtype_w) * a_buf[j, k].astype(dtype_w), axis=k))

    out_shape = (16, )
    bias_host = tvm.placeholder(out_shape, env.cfg['dtype_w'], 'bias_host')
    bias = tvm.compute(out_shape, lambda *i: bias_host(*i), 'bias')
    bias_buf = tvm.compute(out_shape, lambda *i: bias(*i), 'bias_buf')
    #c = tvm.compute(out_shape, lambda *i: c_buf(*i), name='c')
    #c_host = tvm.compute(out_shape, lambda *i: c(*i), name='c_host')

    out_buf = tvm.compute(out_shape, lambda i: c_buf[0, i] + bias_buf[i],
                          'out_buf')
    out = tvm.compute(out_shape, lambda *i: out_buf(*i), 'out')
    out_host = tvm.compute(out_shape, lambda *i: out(*i), 'out_host')

    s = tvm.create_schedule(out_host.op)

    # mark variable scopes
    s[a].set_scope(env.dram_scope)
    s[b].set_scope(env.dram_scope)
    s[bias].set_scope(env.dram_scope)
    s[out].set_scope(env.dram_scope)

    s[a_buf].set_scope(env.uni_scratchpad_scope)
    s[b_buf].set_scope(env.uni_scratchpad_scope)
    s[c_buf].set_scope(env.uni_scratchpad_scope)
    s[bias_buf].set_scope(env.uni_scratchpad_scope)
    s[out_buf].set_scope(env.uni_scratchpad_scope)

    #print(dir(s[b].op.body))

    # mark compiler pragmas
    s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma)
    s[b].pragma(s[b].op.axis[0], env.dma_copy_pragma)
    s[bias].pragma(s[bias].op.axis[0], env.dma_copy_pragma)
    s[out_host].pragma(s[out_host].op.axis[0], env.dma_copy_pragma)

    s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls)
    s[b_buf].pragma(s[b_buf].op.axis[0], env.scratchpad_ls)
    s[bias_buf].pragma(s[bias_buf].op.axis[0], env.scratchpad_ls)
    s[out].pragma(s[out].op.axis[0], env.scratchpad_ls)

    #s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0])

    # tensorize
    #s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VEXP', mode='inc'))
    s[c_buf].tensorize(s[c_buf].op.axis[0],
                       env.intrins.get('GEMM', shape=(1, 16, 16), mode='inc'))
    #outer, inner = out_buf.op.axis
    #s[out_buf].reorder(inner, outer)
    #print(outer)
    #print(tvm.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True))
    s[out_buf].tensorize(s[out_buf].op.axis[0],
                         env.intrins.get('VAddV', mode='w'))

    # build
    print(tvm.lower(s, [a_host, b_host, bias_host, out_host],
                    simple_mode=True))

    print(
        nnpu.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True))
    #exit()
    func = nnpu.build(s, [a_host, b_host, bias_host, out_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_exp')

    print('function built: ')
    #print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=0, high=64)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=vctr_shape,
                             dtype=b_host.dtype,
                             low=0,
                             high=64)
    #b_np = np.random.random(size=vctr_shape).astype(b_host.dtype)
    b_nd = tvm.nd.array(b_np, ctx)

    bias_np = np.random.randint(size=out_shape,
                                dtype=bias_host.dtype,
                                low=0,
                                high=64)
    #bias_np = np.random.random(size=out_shape).astype(bias_host.dtype)
    bias_nd = tvm.nd.array(bias_np, ctx)

    out_nd = tvm.nd.array(np.zeros(out_shape).astype(out_host.dtype), ctx)

    # run
    func(a_nd, b_nd, bias_nd, out_nd)

    print('run finished')

    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('bias=')
    print(bias_np)
    print('out=')
    print(out_nd.asnumpy())

    print('numpy ground truth is: ')
    gt = np.dot(b_np.astype(dtype_w),
                a_np.astype(dtype_w).transpose((1, 0))).reshape(
                    (16, )) + bias_np
    print(gt)

    np.testing.assert_allclose(out_nd.asnumpy(), gt)
Example #27
def test():
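    """Row-wise reductions over a (4, 16) wide-dtype matrix: the sum and the
    max across the 4 rows are computed with the VAddMerge and VGTMMerge
    intrinsics and checked against numpy."""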
    env = nnpu.get_env()
    shape = (4, 16)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, 'a')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)

    k = tvm.reduce_axis((0, 4), 'k')
    add_buf = tvm.compute((16, ), lambda i: tvm.sum(a_buf[k, i], axis=k),
                          'add_buf')
    sph.MarkScope(add_buf)
    add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph)

    # k1 = tvm.reduce_axis((0, 4), 'k1')
    # mul_buf = tvm.compute((16, ), lambda i: tvm.sum(a_buf[k1, i], axis=k1), 'mul_buf')
    # sph.MarkScope(mul_buf)
    # mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    k2 = tvm.reduce_axis((0, 4), 'k2')
    gtm_buf = tvm.compute((16, ), lambda i: tvm.max(a_buf[k2, i], axis=k2),
                          'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    s = tvm.create_schedule([add_host.op, gtm_host.op])
    sph.Transform(s)

    ko, ki = s[add_buf].split(add_buf.op.reduce_axis[0], factor=1)
    s[add_buf].reorder(ko, ki, s[add_buf].op.axis[0])
    s[add_buf].tensorize(ki, env.intrins.get('VAddMerge', mode='w'))
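    # splitting the reduce axis with factor 1 makes each VAddMerge call fold a
    # single row of a_buf into the running (16,) accumulator; the same pattern
    # is used for the max reduction below.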

    # ko1, ki1 = s[mul_buf].split(mul_buf.op.reduce_axis[0], factor=1)
    # s[mul_buf].reorder(ko1, ki1, s[mul_buf].op.axis[0])
    # s[mul_buf].tensorize(ki1, env.intrins.get('VMulMerge', mode='w'))

    ko2, ki2 = s[gtm_buf].split(gtm_buf.op.reduce_axis[0], factor=1)
    s[gtm_buf].reorder(ko2, ki2, s[gtm_buf].op.axis[0])
    s[gtm_buf].tensorize(ki2, env.intrins.get('VGTMMerge', mode='w'))

    print(nnpu.lower(s, [a, add_host, gtm_host], simple_mode=True))

    func = nnpu.build(s, [a, add_host, gtm_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')
    #exit()
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low=-16, high=16)
    a_nd = tvm.nd.array(a_np, ctx)

    add_nd = tvm.nd.array(np.zeros((16, )).astype(add_host.dtype), ctx)

    # mul_nd = tvm.nd.array(np.zeros((16,)).astype(mul_host.dtype), ctx)

    gtm_nd = tvm.nd.array(np.zeros((16, )).astype(gtm_host.dtype), ctx)

    print('------------------- device module 1 IR code: ')
    print(func.imported_modules[0].get_source('ir'))
    func(a_nd, add_nd, gtm_nd)

    print('a = ')
    print(a_np)
    print('reduce sum row = ')
    print(add_nd.asnumpy())
    print('ground truth is: ')
    gt = np.sum(a_np, axis=0)
    print(gt)
    np.testing.assert_allclose(add_nd.asnumpy(), gt)

    # print('reduce mul row = ')
    # print(mul_nd.asnumpy())
    # gt = np.multiply.reduce(a_np ,axis=0,dtype = a.dtype)
    # print(gt)
    # np.testing.assert_allclose(mul_nd.asnumpy(), gt)

    print('reduce max row = ')
    print(gtm_nd.asnumpy())
    gt = np.max(a_np, axis=0)
    print(gt)
    np.testing.assert_allclose(gtm_nd.asnumpy(), gt)
Example #28
def test():
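    """Matrix-vector product whose reduction is wider than one GEMM call: the
    64-element reduce axis of a (16, 64) x (64,) product is split into 4
    chunks of 16, each handled by the (16, 16, 1) GEMM intrinsic, and the
    resulting (4, 16) partial sums are added up on the host."""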
    with ScheduleProcHelper():
        env = nnpu.get_env()

        shape = (16, 64)
        a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
        a_buf, _ = nnpu.utils.CopyHtoBuf(a_host, 'a')

        vctr_shape = (64, )
        b_host = tvm.placeholder(vctr_shape, env.cfg['dtype_n'], 'b_host')
        b_buf, _ = nnpu.utils.CopyHtoBuf(b_host, 'b')

        dtype_w = env.cfg['dtype_w']

        out_shape = (4, 16)
        k = tvm.reduce_axis((0, 16), 'k')
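        # the 64-wide reduction is split into 4 chunks of 16 so each chunk
        # fits one (16, 16, 1) GEMM call; the (4, 16) partial sums are summed
        # on the host after the kernel has run.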
        c_buf = tvm.compute(
            out_shape, lambda j, i: tvm.sum(a_buf[i, j * 16 + k].astype(
                dtype_w) * b_buf[j * 16 + k].astype(dtype_w),
                                            axis=k))
        utils.MarkScope(c_buf)
        c_host, _ = utils.CopyBufToH(c_buf, 'c')

        s = nnpu.create_schedule(c_host.op)

        # mark variable scopes

        # tensorize
        s[c_buf].tensorize(
            s[c_buf].op.axis[1],
            env.intrins.get('GEMM', shape=(16, 16, 1), mode='inc',
                            reduce=True))

        # build
        print(tvm.lower(s, [a_host, b_host, c_host], simple_mode=True))

        print(nnpu.lower(s, [a_host, b_host, c_host], simple_mode=True))
        #exit()
        func = nnpu.build(s, [a_host, b_host, c_host],
                          'nnpu',
                          'llvm',
                          name='nnpu_exp')

        print('function built: ')
        print('------------------- device module 1 asm code: ')
        print(func.imported_modules[0].get_source('asm'))
        #print(func.get_source())

        # prepare data
        ctx = tvm.nd.TVMContext(13, 0)

        a_np = np.random.randint(size=shape,
                                 dtype=a_host.dtype,
                                 low=-32,
                                 high=32)
        # a_np = np.ones(shape).astype(a_host.dtype)
        a_nd = tvm.nd.array(a_np, ctx)

        b_np = np.random.randint(size=vctr_shape,
                                 dtype=b_host.dtype,
                                 low=-16,
                                 high=16)
        # b_np = np.ones(vctr_shape).astype(b_host.dtype)
        b_nd = tvm.nd.array(b_np, ctx)

        out_nd = tvm.nd.array(np.zeros(out_shape).astype(c_host.dtype), ctx)

        # run
        func(a_nd, b_nd, out_nd)

        print('run finished')

        print('a=')
        print(a_np)
        print('b=')
        print(b_np)
        print('out=')
        out_np = out_nd.asnumpy()
        out_np = np.sum(out_np, axis=0)
        print(out_np)

        print('numpy ground truth is: ')
        gt = np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w))
        #gt = np.greater(np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)), bias_np)
        print(gt)

        np.testing.assert_allclose(out_np, gt)
Example #29
def test_ib():
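    """Hand-written device IR via tvm.ir_builder: an extern op emits VAddI
    intrinsic calls that add an immediate value (10) to a 16-element vector
    held in scratchpad."""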
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (16, )
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, name='a')
    w = shape[0]
    e = 16

    def build_nms_ir(ten_in, ten_out):
        ib = tvm.ir_builder.create()
        imm_value = 10
        ib.scope_attr(env.nnpu_axis, "coproc_scope", 0)
        p_in = ib.buffer_ptr(ten_in[0])
        p_out = ib.buffer_ptr(ten_out[0])
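        # emit one VAddI call per 16-element chunk: each call adds the
        # immediate value to env.cfg['vector_unit']['size'] elements, with the
        # read/write access pointers offset by i * dtype_bytes(dtype_w).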
        #with ib.for_range(0,w, name="k") as k:
        with ib.for_range(0, w // e, name="i") as i:
            ib.emit(
                make_intrin_call(
                    "void", 'VAddI', ten_out[0].access_ptr("w", 'uint32') +
                    i * dtype_bytes(dtype_w),
                    ten_in[0].access_ptr("r", 'uint32') +
                    i * dtype_bytes(dtype_w), tvm.const(imm_value, 'float64'),
                    env.cfg['vector_unit']['size'], 3))
        stmt = ib.get()
        return stmt

    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    sph.MarkScope(a_buf)
    out = tvm.extern(a_buf.shape, [a_buf],
                     build_nms_ir,
                     in_buffers=[
                         tvm.decl_buffer(a_buf.shape,
                                         dtype_w,
                                         data_alignment=dtype_bytes(dtype_w),
                                         scope='local.nnpu_scratchpad0')
                     ],
                     out_buffers=[
                         tvm.decl_buffer(a_buf.shape,
                                         dtype_w,
                                         data_alignment=dtype_bytes(dtype_w),
                                         scope='local.nnpu_scratchpad0')
                     ],
                     dtype=dtype_w,
                     name="test_ir")
    sph.MarkScope(out)
    out_host, out_dram = nnpu.utils.CopyBufToH(out, 'out', sph)
    s = tvm.create_schedule([out_host.op])
    sph.Transform(s)
    print(tvm.lower(s, [a, out_host], simple_mode=True))
    print(nnpu.lower(s, [a, out_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, out_host], 'nnpu', 'llvm', name='nnpu_test')
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(16, ), dtype=a.dtype, low=0, high=127)
    a_nd = tvm.nd.array(a_np, ctx)

    b_nd = tvm.nd.array(np.zeros(16, ).astype(out_host.dtype), ctx)

    func(a_nd, b_nd)

    print('a = ')
    print(a_np)
    print('VAddI result = ')
    print(b_nd.asnumpy())
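    # assuming VAddI adds the immediate to every element (a sketch, not part
    # of the original test), the expected output would be a_np + 10:
    # np.testing.assert_allclose(b_nd.asnumpy(), a_np + 10)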
    return
Example #30
def test():
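    """Element-wise matrix/vector ops: a (16,) vector b is broadcast over the
    rows of a (4, 16) matrix a to compute a + b, a - b and the widened a * b,
    tensorized with the MAddV, MSubV and MMulV intrinsics."""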
    env = nnpu.get_env()

    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    shape = (4, 16)
    a = tvm.placeholder(shape, dtype_n, 'a')
    b = tvm.placeholder((16, ), dtype_n, 'b')

    sph = ScheduleProcHelper()

    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b', sph)
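    # three element-wise ops, each broadcasting the 16-element vector b over
    # every row of the (4, 16) matrix a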

    sum_buf = tvm.compute(shape, lambda i, j: a_buf[i, j] + b_buf[j],
                          'sum_buf')
    sph.MarkScope(sum_buf)
    sum_host, sum_dram = nnpu.utils.CopyBufToH(sum_buf, 'sum', sph)

    sub_buf = tvm.compute(shape, lambda i, j: a_buf[i, j] - b_buf[j],
                          'sub_buf')
    sph.MarkScope(sub_buf)
    sub_host, sub_dram = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph)

    mul_buf = tvm.compute(
        shape,
        lambda i, j: a_buf[i, j].astype(dtype_w) * b_buf[j].astype(dtype_w),
        'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    s = tvm.create_schedule([sum_host.op, sub_host.op, mul_host.op])
    sph.Transform(s)
    s[sum_buf].tensorize(s[sum_buf].op.axis[0],
                         env.intrins.get('MAddV', shape=(4, 16), mode='n'))
    s[sub_buf].tensorize(s[sub_buf].op.axis[0],
                         env.intrins.get('MSubV', shape=(4, 16), mode='n'))
    s[mul_buf].tensorize(s[mul_buf].op.axis[0],
                         env.intrins.get('MMulV', shape=(4, 16), mode='inc'))

    print(nnpu.lower(s, [a, b, sum_host, sub_host, mul_host],
                     simple_mode=True))
    func = nnpu.build(s, [a, b, sum_host, sub_host, mul_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')

    print('------------------- device module 1 llvm IR: ')
    print(func.imported_modules[0].get_source('ll'))

    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(4, 16), dtype=a.dtype, low=0, high=64)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=(16, ), dtype=b.dtype, low=0, high=64)
    b_nd = tvm.nd.array(b_np, ctx)
    sum_nd = tvm.nd.array(np.zeros(shape).astype(sum_host.dtype), ctx)
    sub_nd = tvm.nd.array(np.zeros(shape).astype(sub_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros(shape).astype(mul_host.dtype), ctx)

    func(a_nd, b_nd, sum_nd, sub_nd, mul_nd)
    print('a = ')
    print(a_np)
    print('b = ')
    print(b_np)
    print('sum result is ')
    print(sum_nd.asnumpy())
    print("numpy ground truth is")
    gt = a_np + b_np
    print(gt)
    np.testing.assert_allclose(sum_nd.asnumpy(), gt)

    print('sub result is ')
    print(sub_nd.asnumpy())
    np.testing.assert_allclose(sub_nd.asnumpy(), a_np - b_np)

    print('mul result is ')
    print(mul_nd.asnumpy())
    np.testing.assert_allclose(mul_nd.asnumpy(), a_np.astype(dtype_w) * b_np)
    print('test passed')