def check(dim, size, backward=False):
    n = 64
    a = jt.random((n, n, n, n))
    a.sync()
    m = n // size
    arr = []
    for i in range(m):
        arr.append(
            a.getitem((slice(None), ) * dim +
                      (slice(i * size, i * size + size), )))
    b = concat2(arr, dim)
    if backward:
        loss = b * a
        b = jt.grad(loss, a)
    with jt.profile_scope(1, 0) as rep:
        b.sync()
    # print(rep)
    i = rep[0].index("TotalTime")
    stime = 0
    for r in rep[1:]:
        stime += float(r[i])
    # sizeof(float) * numel * (split and concat) * (read and write)
    bw = 4 * 64**4 * 2 * 2 / stime
    print(f"{dim} {size} {stime/1e6}ms, {bw}GB/s")
    return bw
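# A hypothetical driver for the helper above (a sketch; it assumes `jt`,
# `concat2` and the active device are already set up by the surrounding test
# module, and the swept slice widths are illustrative, not the ones the real
# benchmark uses).
for dim in range(4):
    for size in (1, 2, 4, 8, 16, 32):
        check(dim, size)
        check(dim, size, backward=True)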
def test_case(box_num, out_size, time_limit):
    boxes = []
    for i in range(box_num):
        t = [
            random.random() * 0.9,
            random.random() * 0.9,
            random.random() * 0.9,
            random.random() * 0.9
        ]
        t2 = [
            min(t[0], t[2]),
            min(t[1], t[3]),
            max(t[0], t[2]) + 0.1,
            max(t[1], t[3]) + 0.1
        ]
        boxes.append(t2)
    img = jt.random([121, 121, 3])
    out = resize_and_crop(img,
                          jt.array(boxes),
                          interpolation='bilinear',
                          out_size=out_size)
    with jt.profile_scope() as rep:
        our_out = out.data
    t = 0
    fused_op_num = 0
    for i in range(1, len(rep)):
        t += float(rep[i][3]) / 1e9
        name = rep[i][0]
        if name.startswith('[') and (not '[graph:]' in name):
            fused_op_num += 1
    assert fused_op_num == 1, fused_op_num
    assert t <= time_limit, t
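# Hypothetical invocation of the check above; the box count, crop size and
# time budget are illustrative values only, not the thresholds used by the
# real test suite.
test_case(box_num=100, out_size=20, time_limit=0.5)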
def check(self, use_int32):
    n = 1024
    a = jt.random((n, n * 8))
    b = jt.random((n * 8, ))
    # force evaluation of the inputs before profiling
    a.data, b.data
    with jt.profile_scope(compile_options={
                "compile_shapes": 1,
                "parallel": 1,
                "split1": n,
                "order1": 1
            },
            try_use_32bit_index=use_int32) as rep:
        c = a - b
        # def func(a, b, c, tid, num):
        #     for i in range(tid*1024, 1024*8, num*1024):
        #         for j in range(n):
        #             for k in range(n):
        #                 c[j*1024*8 + i+k] = a[j*1024*8 + i+k] - b[i+k]
        nc = c.data
    assert len(rep) == 2
    assert (a.data - b.data == nc).all()
    fname = rep[1][1]
    with open(fname) as f:
        src = f.read()
        assert "thread_id" in src
    with open(fname.replace(".cc", ".s")) as f:
        asm = SimpleAsmParser(f.read())
    func_name = "run"
    ca = asm.count_instructions(func_name, "vmova")
    cu = asm.count_instructions(func_name, "vmovu")
    return ca, cu
def check(self, use_int32):
    n = 1024
    a = jt.random((n, n))
    b = jt.random((n, n))
    # force evaluation of the inputs before profiling
    a.data, b.data
    with jt.profile_scope(compile_options={
                "compile_shapes": 1,
                "parallel": 2,
                "try_use_32bit_index": use_int32
            },
            try_use_32bit_index=use_int32) as rep:
        c = a + b
        nc = c.data
    assert len(rep) == 2
    assert (a.data + b.data == nc).all()
    fname = rep[1][1]
    with open(fname) as f:
        src = f.read()
        assert "thread_id" in src
    with open(fname.replace(".cc", ".s")) as f:
        asm = SimpleAsmParser(f.read())
    func_name = "run"
    ca = asm.count_instructions(func_name, "vmova")
    cu = asm.count_instructions(func_name, "vmovu")
    return ca, cu
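# Both checks above count aligned (vmova*) versus unaligned (vmovu*) vector
# moves in the generated assembly through a SimpleAsmParser helper defined
# elsewhere in the test module. A rough sketch of what such a parser might
# look like; the label matching and mnemonic handling here are assumptions,
# not Jittor's actual implementation.
import re

class SimpleAsmParserSketch:
    def __init__(self, src):
        # Map each label of a GCC-style .s listing to the lines that follow it.
        self.funcs = {}
        cur = None
        for line in src.splitlines():
            m = re.match(r"^([\w.$]+):", line)
            if m:
                cur = m.group(1)
                self.funcs[cur] = []
            elif cur is not None:
                self.funcs[cur].append(line.strip())

    def count_instructions(self, func_name, prefix):
        # Count instructions whose mnemonic starts with `prefix`
        # (e.g. "vmova" matches vmovaps/vmovapd) under matching labels.
        cnt = 0
        for label, lines in self.funcs.items():
            if func_name in label:
                cnt += sum(1 for l in lines
                           if l.split() and l.split()[0].startswith(prefix))
        return cnt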
def reduce_check(self, ndim, depth, tdim, rdim, has_atomic,
                 order=[], split=[], **args):
    shape = [8] * ndim
    config = {
        "parallel": 1,
        "max_parallel_depth": depth,
        "merge_loop_var": self.merge_loop_var
    }
    for k in args:
        config[k] = args[k]
    if not isinstance(rdim, list):
        rdim = [rdim]
    rdim = tuple(rdim)
    nshape = [1024, 256, 128][len(rdim)]
    # enlarge the reduced dimensions before creating the input
    for d in rdim:
        shape[d] = nshape
    a = jt.random(shape)
    a.sync()
    for i, o in enumerate(order):
        config[f"order{i}"] = o
    for i, o in enumerate(split):
        config[f"split{i}"] = o
    with jt.profile_scope(compile_options=config, enable_tuner=0) as rep:
        b = a.sum(rdim).data
    assert len(rep) == 2
    fname = rep[1][1]
    with open(fname) as f:
        src = f.read()
    for i in range(tdim):
        assert f"tnum{i}" in src
    assert f"tnum{tdim}" not in src, f"tnum{tdim}"
    src_has_atomic = "atomic_add" in src or "atomicAdd" in src
    assert has_atomic == src_has_atomic
    assert np.allclose(a.data.sum(rdim), b), (b.sum(), a.data.sum())
def test_fuse_transpose5(self):
    with jt.profile_scope() as rep:
        a = jt.rand((10, 11, 6, 7))
        c = jt.rand((10, 11, 6, 7))
        b = (a + c).fuse_transpose((1, 0, 2, 3))
        np.testing.assert_allclose(
            (a.data + c.data).transpose((1, 0, 2, 3)), b.data)
    assert len(rep) == 3
def test_fuse_transpose3(self):
    with jt.profile_scope() as rep:
        a = jt.rand((10, 11, 12))
        c = jt.rand((11, 12, 10))
        b = a.fuse_transpose((1, 2, 0)) + c
        np.testing.assert_allclose(
            a.data.transpose((1, 2, 0)) + c.data, b.data)
    assert len(rep) == 3
def test2(self):
    a = jt.ones([10, 10, 10, 10])
    a.sync()
    with jt.profile_scope() as rep:
        b = a + 1
        b.sync()
    with open(rep[1][1]) as f:
        src = f.read()
        assert "range0123" in src
def test_resnet_train_profile(self):
    with jt.profile_scope(trace_py_var=1):
        resnet18 = resnet.Resnet18()
        opt = jt.optim.SGD(resnet18.parameters(), 0.1)
        x = jt.float32(np.random.rand(2, 3, 224, 224))
        y = resnet18(x)
        opt.step(y**2)
        jt.sync_all()
def test_scalar_fuse_unary(self):
    with jt.profile_scope() as rep:
        a = jt.array([1])
        b = -a
        a = a.clone()
        b = b.clone()
        jt.sync([a, b])
    assert a.data == 1
    assert b.data == -1
    assert len(rep) == 2
def test5(self):
    a = jt.ones([10, 10, 10, 10])
    a.sync()
    with jt.profile_scope() as rep:
        b = a.sum([1])
        b.sync()
    with open(rep[1][1]) as f:
        src = f.read()
        assert "range01" not in src
        assert "range23" in src
def test4(self):
    # don't optimize reindex like op yet
    a = jt.ones([10, 10, 10, 10])
    a.sync()
    with jt.profile_scope() as rep:
        b = a.reindex_reduce("add", [10, 10], ["i0", "i1"])
        b.sync()
    with open(rep[1][1]) as f:
        src = f.read()
        assert "range23" not in src
def test3(self):
    a = jt.ones([10, 10, 10, 10])
    x = jt.ones([1, 10, 1, 1])
    a.sync(), x.sync()
    with jt.profile_scope() as rep:
        b = a + x
        b.sync()
    with open(rep[1][1]) as f:
        src = f.read()
        assert "range23" in src
def setUpClass(self):
    n, m, k = 2, 6, 16
    a = jt.random((n, m, 1))
    b = jt.random((1, m, k))
    jt.fetch_sync([a, b])
    with jt.profile_scope(compile_options={"jtune": 1}) as rep:
        c = (a * b).sum(1)
        c.sync()
    assert len(rep) == 2
    self.fname = rep[1][1]
    self.jtune_path = os.path.join(jt.flags.jittor_path, "utils/jtune.py")
def test(self):
    a = jt.array([1, 2, 3])
    a.sync()
    assert a.compile_options == {}
    a.compile_options = {"compile_shapes": 1}
    assert a.compile_options == {"compile_shapes": 1}
    b = a + a
    assert b.compile_options == {}
    with jt.flag_scope(compile_options={"compile_shapes": 1}):
        c = a + b
    assert c.compile_options == {"compile_shapes": 1}
    with jt.profile_scope() as report:
        c.sync()
    assert len(report) == 2 and "compile_shapes:1" in report[1][0]
def check(ndim, depth, tdim):
    a = jt.random([16] * ndim)
    a.sync()
    compile_options = {"parallel": 1}
    if depth is not None:
        compile_options["max_parallel_depth"] = depth
    with jt.profile_scope(compile_options=compile_options) as rep:
        b = (a + a).data
    assert np.allclose(a.data * 2, b)
    assert len(rep) == 2
    fname = rep[1][1]
    with open(fname) as f:
        src = f.read()
    for i in range(tdim):
        assert f"tnum{i}" in src
    assert f"tnum{tdim}" not in src
def test_stop_fuse2(self):
    with jt.profile_scope() as report:
        a = jt.float32(0).stop_fuse()
        c = jt.float32(0).stop_fuse()
        bs = [c]
        for i in range(2000):
            b = jt.float32(i) * 2 * c
            bs.append(b)
            a += b
        a = a * 2
        dbs = jt.grad(a, bs)
        jt.sync(dbs + [a])
    for a in report[1:]:
        assert len(a[0].split("opkey")) < 8
def test_stop_fuse(self):
    with jt.profile_scope() as report:
        a = jt.float32(0).stop_fuse()
        c = jt.float32(0)
        bs = [c]
        for i in range(2000):
            b = jt.float32(i) * 2 * c
            bs.append(b)
            a += b
        a = a * 2
        dbs = jt.grad(a, bs)
        jt.sync(dbs + [a])
    for a in report[1:]:
        # origin is 50
        # after update queue, increase to 102
        assert len(a[0].split("opkey")) < 110, len(a[0].split("opkey"))
def test_reduce_opt(self):
    a = jt.random((16, 512, 38, 38))
    b = jt.random((16, 512, 38, 38))
    jt.sync([a, b])
    with jt.profile_scope(rerun=10, warmup=10) as rep:
        norm = a.sqr().sum(1, keepdims=True).sqrt()
        c = a / norm
        da = jt.grad(c * b, a)
        jt.sync([c, da])
    gpu_c = c.numpy()
    gpu_da = da.numpy()
    with jt.flag_scope(use_cuda=0):
        norm = a.sqr().sum(1, keepdims=True).sqrt()
        c = a / norm
        da = jt.grad(c * b, a)
        assert np.allclose(gpu_c, c.data, 1e-3)
        assert (np.abs(gpu_da - da.data).max() < 1e-6)
    assert float(rep[1][3]) < 15e6, float(rep[1][3])  # 15ms(about 8ms)
def check(n, m, reduce_dim, cache_report_, error_rate_threshold):
    a = jt.random([n, m])
    a.sync()
    with jt.profile_scope(
            compile_options={
                "check_cache": 1,
                "replace_strategy": 1,
                "page_size": 4 << 10,  #2 << 20
                "vtop": 0,
                "tlb_size": 64,
                "tlb_ways": 4,
                "tlb_line_size": 1,
                "L1_size": 32 << 10,
                "L1_ways": 8,
                "L1_line_size": 64,
                "L2_size": 256 << 10,
                "L2_ways": 8,
                "L2_line_size": 64,
                "L3_size": 15 << 20,
                "L3_ways": 20,
                "L3_line_size": 64
            },
            enable_tuner=0) as report:
        c = a.sum(reduce_dim)
        c.sync()
    check_cache_code(report[1][1])
    cache_report = report[-1][-5:]
    for i in range(len(cache_report)):
        cache_report[i] = int(cache_report[i])
    for i in range(len(cache_report)):
        assert abs(cache_report[i] - cache_report_[i]) <= int(
            cache_report_[i] * error_rate_threshold), (
                "cache report error: " +
                report[-2][-(len(cache_report) - i)] + " error, " +
                str(cache_report[i]) + "!=" + str(cache_report_[i]))
def test(name, model_name, bs):
    print("hello", name, model_name, bs)
    import numpy as np
    import time
    is_train = False
    _model_name = model_name
    if model_name.startswith("train_"):
        is_train = True
        model_name = model_name[6:]
    if name == "torch":
        import torch
        import torchvision.models as tcmodels
        from torch import optim
        from torch import nn
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
        model = tcmodels.__dict__[model_name]()
        model = model.cuda()
    else:
        import jittor as jt
        from jittor import optim
        from jittor import nn
        jt.flags.use_cuda = 1
        jt.cudnn.set_algorithm_cache_size(10000)
        import jittor.models as jtmodels
        model = jtmodels.__dict__[model_name]()
        # limit the cuDNN workspace for the largest training configurations
        if (model_name == "resnet152" or model_name == "resnet101") \
                and bs == 128 and is_train:
            jt.cudnn.set_max_workspace_ratio(0.05)
    if is_train:
        model.train()
    else:
        model.eval()
    img_size = 224
    if model_name == "inception_v3":
        img_size = 300
    test_img = np.random.random((bs, 3, img_size, img_size)).astype("float32")
    if is_train:
        label = (np.random.random((bs,)) * 1000).astype("int32")
    if name == "torch":
        test_img = torch.Tensor(test_img).cuda()
        if is_train:
            label = torch.LongTensor(label).cuda()
        opt = optim.SGD(model.parameters(), 0.001)
        sync = lambda: torch.cuda.synchronize()
        jt = torch
    else:
        test_img = jt.array(test_img).stop_grad()
        if is_train:
            label = jt.array(label).stop_grad()
        opt = optim.SGD(model.parameters(), 0.001)
        sync = lambda: jt.sync_all(True)
    sync()
    use_profiler = os.environ.get("use_profiler", "0") == "1"
    if hasattr(jt, "nograd"):
        ng = jt.no_grad()
        ng.__enter__()

    def iter():
        x = model(test_img)
        if isinstance(x, tuple):
            x = x[0]
        if is_train:
            loss = nn.CrossEntropyLoss()(x, label)
            if name == "jittor":
                opt.step(loss)
            else:
                opt.zero_grad()
                loss.backward()
                opt.step()
        else:
            x.sync()

    # warmup runs
    sync()
    for i in time_iter():
        iter()
    sync()
    for i in time_iter():
        iter()
    sync()
    if use_profiler:
        if name == "torch":
            prof = torch.autograd.profiler.profile(use_cuda=True)
        else:
            prof = jt.profile_scope()
        prof.__enter__()
    if name == "jittor":
        if hasattr(jt.flags, "use_parallel_op_compiler"):
            jt.flags.use_parallel_op_compiler = 0
    # timed runs
    start = time.time()
    for i in time_iter(10):
        iter()
    sync()
    end = time.time()
    if use_profiler:
        prof.__exit__(None, None, None)
        if name == "torch":
            print(prof.key_averages().table(sort_by="cuda_time_total",
                                            row_limit=30))
    total_iter = i + 1
    print("duration:", end - start, "FPS:", total_iter * bs / (end - start))
    fpath = f"{home_path}/.cache/jittor/{name}-{_model_name}-{bs}.txt"
    with open(fpath, 'w') as f:
        f.write(f"duration: {end-start} FPS: {total_iter*bs/(end-start)}")
    os.chmod(fpath, 0o666)
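# The benchmark above relies on a module-level `time_iter` helper (plus `os`
# and `home_path` defined at module scope). A minimal sketch of what such a
# helper might look like, assuming it yields iteration indices for roughly
# `duration` seconds; this is an illustration, not the benchmark's actual
# helper.
import time

def time_iter(duration=1.0, min_iter=2):
    start = time.time()
    i = 0
    while i < min_iter or time.time() - start < duration:
        yield i
        i += 1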
def test_fuse_transpose2(self):
    with jt.profile_scope() as rep:
        a = jt.rand((10, 11, 12))
        b = (a + 1).fuse_transpose((1, 2, 0))
        np.testing.assert_allclose(a.data.transpose((1, 2, 0)) + 1, b.data)
    assert len(rep) == 3