Example 1
 def check(dim, size, backward=False):
     n = 64
     a = jt.random((n, n, n, n))
     a.sync()
     m = n // size
     arr = []
     for i in range(m):
         arr.append(
             a.getitem((slice(None), ) * dim +
                       (slice(i * size, i * size + size), )))
     b = concat2(arr, dim)
     if backward:
         loss = b * a
         b = jt.grad(loss, a)
     with jt.profile_scope(1, 0) as rep:
         b.sync()
     # print(rep)
     i = rep[0].index("TotalTime")
     stime = 0
     for r in rep[1:]:
         stime += float(r[i])
     bw = 4 * 64**4 * 2 * 2 / stime
     # bytes moved = sizeof(float) * numel * (split and concat) * (read and write); stime is in ns, so bw is in GB/s
     print(f"{dim} {size} {stime/1e6}ms, {bw}GB/s")
     return bw
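Note: every example in this section follows the same pattern: jt.profile_scope(...) yields a report whose first row is a header and whose remaining rows describe the executed ops, with times that the examples treat as nanoseconds. A minimal sketch of that pattern, assuming only the "TotalTime" column used in Example 1 (everything else below is illustrative, not part of the original tests):

import jittor as jt

def profile_total_time():
    # Run one op inside a profile scope and sum the "TotalTime" column,
    # the same way Example 1 computes stime.
    a = jt.random((64, 64))
    a.sync()  # materialize the input outside the profiled region
    with jt.profile_scope() as rep:
        b = (a + 1) * 2
        b.sync()  # force execution so the op is recorded
    ti = rep[0].index("TotalTime")
    total = sum(float(r[ti]) for r in rep[1:])
    print(f"{total / 1e6} ms over {len(rep) - 1} op(s)")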
Example 2
def test_case(box_num, out_size, time_limit):
    boxes = []
    for i in range(box_num):
        t = [
            random.random() * 0.9,
            random.random() * 0.9,
            random.random() * 0.9,
            random.random() * 0.9
        ]
        t2 = [
            min(t[0], t[2]),
            min(t[1], t[3]),
            max(t[0], t[2]) + 0.1,
            max(t[1], t[3]) + 0.1
        ]
        boxes.append(t2)
    img = jt.random([121, 121, 3])
    out = resize_and_crop(img,
                          jt.array(boxes),
                          interpolation='bilinear',
                          out_size=out_size)
    with jt.profile_scope() as rep:
        our_out = out.data
    t = 0
    fused_op_num = 0
    for i in range(1, len(rep)):
        t += float(rep[i][3]) / 1e9
        name = rep[i][0]
        if name.startswith('[') and (not '[graph:]' in name):
            fused_op_num += 1
    assert fused_op_num == 1, fused_op_num
    assert t <= time_limit, t
Example 3
 def check(self, use_int32):
     n = 1024
     a = jt.random((n, n * 8))
     b = jt.random((n * 8, ))
     a.data, b.data  # force evaluation of the inputs before profiling
     with jt.profile_scope(compile_options={
             "compile_shapes": 1,
             "parallel": 1,
             "split1": n,
             "order1": 1
     },
                           try_use_32bit_index=use_int32) as rep:
         c = a - b
         # def func(a, b, c, tid, num):
         #     for i in range(tid*1024, 1024*8, num*1024):
         #         for j in range(n):
         #              for k in range(n):
         #                  c[j*1024*8 + i+k] = a[j*1024*8 + i+k] - b[i+k]
         nc = c.data
     assert len(rep) == 2
     assert (a.data - b.data == nc).all()
     fname = rep[1][1]
     with open(fname) as f:
         src = f.read()
         assert "thread_id" in src
     with open(fname.replace(".cc", ".s")) as f:
         asm = SimpleAsmParser(f.read())
     func_name = "run"
     ca = asm.count_instructions(func_name, "vmova")
     cu = asm.count_instructions(func_name, "vmovu")
     return ca, cu
Example 4
 def check(self, use_int32):
     n = 1024
     a = jt.random((n, n))
     b = jt.random((n, n))
     a.data, b.data  # force evaluation of the inputs before profiling
     with jt.profile_scope(compile_options={
             "compile_shapes": 1,
             "parallel": 2,
             "try_use_32bit_index": use_int32
     },
                           try_use_32bit_index=use_int32) as rep:
         c = a + b
         nc = c.data
     assert len(rep) == 2
     assert (a.data + b.data == nc).all()
     fname = rep[1][1]
     with open(fname) as f:
         src = f.read()
         assert "thread_id" in src
     with open(fname.replace(".cc", ".s")) as f:
         asm = SimpleAsmParser(f.read())
     func_name = "run"
     ca = asm.count_instructions(func_name, "vmova")
     cu = asm.count_instructions(func_name, "vmovu")
     return ca, cu
Example 5
 def reduce_check(self, ndim, depth, tdim, rdim, has_atomic, order=[], split=[], **args):
     shape = [8]*ndim
     a = jt.random(shape)
     a.sync()
     config = {
         "parallel":1, "max_parallel_depth":depth, "merge_loop_var": self.merge_loop_var
     }
     for k in args:
         config[k] = args[k]
     if not isinstance(rdim, list):
         rdim = [rdim]
     rdim = tuple(rdim)
     nshape = [1024, 256, 128][len(rdim)]
     for d in rdim: shape[d] = nshape
     for i,o in enumerate(order):
         config[f"order{i}"] = o
     for i,o in enumerate(split):
         config[f"split{i}"] = o
     with jt.profile_scope(
         compile_options = config,
         enable_tuner = 0
     ) as rep:
         b = a.sum(rdim).data
     assert len(rep) == 2
     fname = rep[1][1]
     with open(fname) as f:
         src = f.read()
         for i in range(tdim):
             assert f"tnum{i}" in src
         assert f"tnum{tdim}" not in src, f"tnum{tdim}"
         src_has_atomic = "atomic_add" in src or "atomicAdd" in src
         assert has_atomic == src_has_atomic
     assert np.allclose(a.data.sum(rdim), b), (b.sum(), a.data.sum())
Example 6
 def test_fuse_transpose5(self):
     with jt.profile_scope() as rep:
         a = jt.rand((10, 11, 6, 7))
         c = jt.rand((10, 11, 6, 7))
         b = (a + c).fuse_transpose((1, 0, 2, 3))
         np.testing.assert_allclose((a.data + c.data).transpose(
             (1, 0, 2, 3)), b.data)
     assert len(rep) == 3
Example 7
 def test_fuse_transpose3(self):
     with jt.profile_scope() as rep:
         a = jt.rand((10, 11, 12))
         c = jt.rand((11, 12, 10))
         b = a.fuse_transpose((1, 2, 0)) + c
         np.testing.assert_allclose(
             a.data.transpose((1, 2, 0)) + c.data, b.data)
     assert len(rep) == 3
Example 8
 def test2(self):
     a = jt.ones([10, 10, 10, 10])
     a.sync()
     with jt.profile_scope() as rep:
         b = a + 1
         b.sync()
     with open(rep[1][1]) as f:
         src = f.read()
         assert "range0123" in src
Example 9
    def test_resnet_train_profile(self):
        with jt.profile_scope(trace_py_var=1):

            resnet18 = resnet.Resnet18()
            opt = jt.optim.SGD(resnet18.parameters(), 0.1)
            x = jt.float32(np.random.rand(2, 3, 224, 224))
            y = resnet18(x)

            opt.step(y**2)
            jt.sync_all()
Example 10
 def test_scalar_fuse_unary(self):
     with jt.profile_scope() as rep:
         a = jt.array([1])
         b = -a
         a = a.clone()
         b = b.clone()
         jt.sync([a, b])
         assert a.data == 1
         assert b.data == -1
     assert len(rep) == 2
Example 11
 def test5(self):
     a = jt.ones([10, 10, 10, 10])
     a.sync()
     with jt.profile_scope() as rep:
         b = a.sum([1])
         b.sync()
     with open(rep[1][1]) as f:
         src = f.read()
         assert "range01" not in src
         assert "range23" in src
Example 12
 def test4(self):
     # don't optimize reindex like op yet
     a = jt.ones([10, 10, 10, 10])
     a.sync()
     with jt.profile_scope() as rep:
         b = a.reindex_reduce("add", [10, 10], ["i0", "i1"])
         b.sync()
     with open(rep[1][1]) as f:
         src = f.read()
         assert "range23" not in src
Example 13
 def test3(self):
     a = jt.ones([10, 10, 10, 10])
     x = jt.ones([1, 10, 1, 1])
     a.sync(), x.sync()
     with jt.profile_scope() as rep:
         b = a + x
         b.sync()
     with open(rep[1][1]) as f:
         src = f.read()
         assert "range23" in src
Example 14
 def setUpClass(self):
     n, m, k = 2, 6, 16
     a = jt.random((n, m, 1))
     b = jt.random((1, m, k))
     jt.fetch_sync([a, b])
     with jt.profile_scope(compile_options={"jtune": 1}) as rep:
         c = (a * b).sum(1)
         c.sync()
     assert len(rep) == 2
     self.fname = rep[1][1]
     self.jtune_path = os.path.join(jt.flags.jittor_path, "utils/jtune.py")
Example 15
 def test(self):
     a = jt.array([1,2,3])
     a.sync()
     assert a.compile_options=={}
     a.compile_options = {"compile_shapes":1}
     assert a.compile_options=={"compile_shapes":1}
     b = a+a
     assert b.compile_options=={}
     with jt.flag_scope(compile_options={"compile_shapes":1}):
         c = a+b
     assert c.compile_options=={"compile_shapes":1}
     with jt.profile_scope() as report:
         c.sync()
     assert len(report)==2 and "compile_shapes:1" in report[1][0]
Example 16
 def check(ndim, depth, tdim):
     a = jt.random([16] * ndim)
     a.sync()
     compile_options = {"parallel": 1}
     if depth is not None:
         compile_options["max_parallel_depth"] = depth
     with jt.profile_scope(compile_options=compile_options) as rep:
         b = (a + a).data
     assert np.allclose(a.data * 2, b)
     assert len(rep) == 2
     fname = rep[1][1]
     with open(fname) as f:
         src = f.read()
         for i in range(tdim):
             assert f"tnum{i}" in src
         assert f"tnum{tdim}" not in src
Example 17
    def test_stop_fuse2(self):
        with jt.profile_scope() as report:
            a = jt.float32(0).stop_fuse()
            c = jt.float32(0).stop_fuse()
            bs = [c]
            for i in range(2000):
                b = jt.float32(i) * 2 * c
                bs.append(b)
                a += b

            a = a * 2

            dbs = jt.grad(a, bs)
            jt.sync(dbs + [a])

        for a in report[1:]:
            assert len(a[0].split("opkey")) < 8
Example 18
    def test_stop_fuse(self):
        with jt.profile_scope() as report:
            a = jt.float32(0).stop_fuse()
            c = jt.float32(0)
            bs = [c]
            for i in range(2000):
                b = jt.float32(i) * 2 * c
                bs.append(b)
                a += b

            a = a * 2

            dbs = jt.grad(a, bs)
            jt.sync(dbs + [a])

        for a in report[1:]:
            # originally 50; after the update-queue change this increased to 102
            assert len(a[0].split("opkey")) < 110, len(a[0].split("opkey"))
Example 19
    def test_reduce_opt(self):
        a = jt.random((16, 512, 38, 38))
        b = jt.random((16, 512, 38, 38))
        jt.sync([a, b])
        with jt.profile_scope(rerun=10, warmup=10) as rep:
            norm = a.sqr().sum(1, keepdims=True).sqrt()
            c = a / norm
            da = jt.grad(c * b, a)
            jt.sync([c, da])
        gpu_c = c.numpy()
        gpu_da = da.numpy()
        with jt.flag_scope(use_cuda=0):
            norm = a.sqr().sum(1, keepdims=True).sqrt()
            c = a / norm
            da = jt.grad(c * b, a)
            assert np.allclose(gpu_c, c.data, 1e-3)
            assert (np.abs(gpu_da - da.data).max() < 1e-6)

        assert float(rep[1][3]) < 15e6, float(rep[1][3])  # limit 15 ms (typically ~8 ms)
Example 20
        def check(n, m, reduce_dim, cache_report_, error_rate_threshold):
            a = jt.random([n, m])
            a.sync()
            with jt.profile_scope(
                    compile_options={
                        "check_cache": 1,
                        "replace_strategy": 1,
                        "page_size": 4 << 10,  #2 << 20
                        "vtop": 0,
                        "tlb_size": 64,
                        "tlb_ways": 4,
                        "tlb_line_size": 1,
                        "L1_size": 32 << 10,
                        "L1_ways": 8,
                        "L1_line_size": 64,
                        "L2_size": 256 << 10,
                        "L2_ways": 8,
                        "L2_line_size": 64,
                        "L3_size": 15 << 20,
                        "L3_ways": 20,
                        "L3_line_size": 64
                    },
                    enable_tuner=0) as report:
                c = a.sum(reduce_dim)
                c.sync()

            check_cache_code(report[1][1])
            cache_report = report[-1][-5:]
            for i in range(len(cache_report)):
                cache_report[i] = int(cache_report[i])
            for i in range(len(cache_report)):
                assert abs(cache_report[i] - cache_report_[i]) <= int(
                    cache_report_[i] *
                    error_rate_threshold), "cache report error: " + report[-2][
                        -(len(cache_report) - i)] + " error, " + str(
                            cache_report[i]) + "!=" + str(cache_report_[i])
Example 21
def test(name, model_name, bs):
    print("hello", name, model_name, bs)
    import numpy as np
    import time
    is_train = False
    _model_name = model_name
    if model_name.startswith("train_"):
        is_train = True
        model_name = model_name[6:]
    if name == "torch":
        import torch
        import torchvision.models as tcmodels
        from torch import optim
        from torch import nn
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
        model = tcmodels.__dict__[model_name]()
        model = model.cuda()
    else:
        import jittor as jt
        from jittor import optim
        from jittor import nn
        jt.flags.use_cuda = 1
        jt.cudnn.set_algorithm_cache_size(10000)
        import jittor.models as jtmodels
        model = jtmodels.__dict__[model_name]()
        if (model_name == "resnet152" or model_name == "resnet101") and bs == 128 and is_train:
            jt.cudnn.set_max_workspace_ratio(0.05)
    if is_train:
        model.train()
    else:
        model.eval()
    img_size = 224
    if model_name == "inception_v3":
        img_size = 300
    test_img = np.random.random((bs, 3, img_size, img_size)).astype("float32")
    if is_train:
        label = (np.random.random((bs,)) * 1000).astype("int32")
    if name == "torch":
        test_img = torch.Tensor(test_img).cuda()
        if is_train:
            label = torch.LongTensor(label).cuda()
            opt = optim.SGD(model.parameters(), 0.001)
        sync = lambda: torch.cuda.synchronize()
        jt = torch
    else:
        test_img = jt.array(test_img).stop_grad()
        if is_train:
            label = jt.array(label).stop_grad()
            opt = optim.SGD(model.parameters(), 0.001)
        sync = lambda: jt.sync_all(True)

    sync()
    use_profiler = os.environ.get("use_profiler", "0") == "1"
    if hasattr(jt, "nograd"):
        ng = jt.no_grad()
        ng.__enter__()
    def iter():
        x = model(test_img)
        if isinstance(x, tuple):
            x = x[0]
        if is_train:
            loss = nn.CrossEntropyLoss()(x, label)
            if name == "jittor":
                opt.step(loss)
            else:
                opt.zero_grad()
                loss.backward()
                opt.step()
        else:
            x.sync()
    sync()
    for i in time_iter():
        iter()
    sync()
    for i in time_iter():
        iter()
    sync()
    if use_profiler:
        if name == "torch":
            prof = torch.autograd.profiler.profile(use_cuda=True)
        else:
            prof = jt.profile_scope()
        prof.__enter__()
    if name == "jittor":
        if hasattr(jt.flags, "use_parallel_op_compiler"):
            jt.flags.use_parallel_op_compiler = 0
    start = time.time()
    for i in time_iter(10):
        iter()
    sync()
    end = time.time()
    if use_profiler:
        prof.__exit__(None,None,None)
        if name == "torch":
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
    total_iter = i+1
    print("duration:", end-start, "FPS:", total_iter*bs/(end-start))
    fpath = f"{home_path}/.cache/jittor/{name}-{_model_name}-{bs}.txt"
    with open(fpath, 'w') as f:
        f.write(f"duration: {end-start} FPS: {total_iter*bs/(end-start)}")
    os.chmod(fpath, 0o666)
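A hypothetical invocation of the benchmark above (the model name and batch size are placeholders; home_path and time_iter are module-level helpers not shown in this excerpt):

# Sketch only: compare Jittor and PyTorch training throughput on the same model.
test("jittor", "train_resnet50", 32)
test("torch", "train_resnet50", 32)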
Example 22
 def test_fuse_transpose2(self):
     with jt.profile_scope() as rep:
         a = jt.rand((10, 11, 12))
         b = (a + 1).fuse_transpose((1, 2, 0))
         np.testing.assert_allclose(a.data.transpose((1, 2, 0)) + 1, b.data)
     assert len(rep) == 3