def deep_hierarchy(scale):
    """Benchmark struct-fors over a deeply nested sparse pointer hierarchy.

    Builds a chain of ``num_levels`` pointer SNodes with branching factor 4
    and a dense leaf, then repeatedly runs a kernel that updates the field
    without modifying its sparsity mask.
    """
    # Local import, consistent with sparse_numpy(); guards against a
    # missing top-level `import math` in this module.
    import math

    branching = 4
    # Deeper trees for larger scales: log_branching(scale) extra levels.
    num_levels = 8 + int(math.log(scale, branching))
    x = ti.field(dtype=ti.f32)
    n = 256 * 1024 * scale
    # The total extent must divide evenly across the pointer hierarchy.
    assert n % (branching**num_levels) == 0
    snode = ti.root
    for i in range(num_levels):
        snode = snode.pointer(ti.i, branching)
    snode.dense(ti.i, n // (branching**num_levels)).place(x)

    @ti.kernel
    def initialize():
        for i in range(n):
            x[i] = 0

    initialize()

    # Not fusible, but no modification to the mask/list of x either
    @ti.kernel
    def jitter():
        for i in x:
            if i % 2 == 0:
                x[i] += x[i + 1]

    def task():
        for i in range(5):
            jitter()

    ti.benchmark(task, repeat=5)
def stencil_reduction():
    """Benchmark a 1D three-point stencil followed by a sum reduction.

    ``a`` and ``b`` are placed under the same dense leaf, so they always
    share the same sparsity pattern.
    """
    a = ti.field(dtype=ti.f32)
    b = ti.field(dtype=ti.f32)
    total = ti.field(dtype=ti.f32, shape=())
    block_count = 1024
    block_size = 1024
    # a, b always share the same sparsity
    ti.root.pointer(ti.i, block_count).dense(ti.i, block_size).place(a, b)

    @ti.kernel
    def initialize():
        # Skips the first block and the tail so the stencil's a[i - 1] /
        # a[i + 1] reads stay inside activated cells.
        # NOTE(review): with block_count == block_size == 1024,
        # (block_size - 1) * block_count == (block_count - 1) * block_size,
        # so which bound was intended is ambiguous — confirm.
        for i in range(block_size, (block_size - 1) * block_count):
            a[i] = i

    @ti.kernel
    def stencil():
        for i in a:
            b[i] = a[i - 1] + a[i] + a[i + 1]

    @ti.kernel
    def reduce():
        for i in a:
            total[None] += b[i]

    def task():
        for i in range(2):
            initialize()
            stencil()
            reduce()

    ti.benchmark(task, repeat=100)
def multires(scale):
    """Benchmark 2x downsampling across a sparse multi-resolution pyramid."""
    num_levels = 4
    x = [ti.field(dtype=ti.f32) for _ in range(num_levels)]
    # TODO: Using 1024 instead of 512 hangs the CUDA case. Need to figure out why.
    n = 512 * 1024 * scale
    block_size = 16
    assert n % block_size**2 == 0
    # Each level halves the extent; every level gets its own independent
    # pointer -> pointer -> dense chain rooted at ti.root.
    for level in range(num_levels):
        ti.root.pointer(ti.i, n // 2**level // block_size**2).pointer(
            ti.i, block_size).dense(ti.i, block_size).place(x[level])

    @ti.kernel
    def initialize():
        for i in range(n):
            x[0][i] = i

    @ti.kernel
    def downsample(l: ti.template()):
        for i in x[l]:
            if i % 2 == 0:
                x[l + 1][i // 2] = x[l][i]

    initialize()

    def task():
        for l in range(num_levels - 1):
            downsample(l)

    ti.benchmark(task, repeat=5)
def sparse_numpy(scale):
    """Benchmark repeated SAXPY passes over two 2D sparse fields."""
    import math
    a = ti.field(dtype=ti.f32)
    b = ti.field(dtype=ti.f32)
    # Grid grows roughly as sqrt(scale), rounded down to a power of two.
    block_count = 2**int((math.log(scale, 2)) // 2) * 64
    block_size = 32
    # a, b always share the same sparsity
    ti.root.pointer(ti.ij, block_count).dense(ti.ij, block_size).place(a, b)

    @ti.kernel
    def initialize():
        # Activate every fourth diagonal band of blocks.
        for i, j in ti.ndrange(block_count * block_size,
                               block_count * block_size):
            if (i // block_size + j // block_size) % 4 == 0:
                a[i, j] = i + j

    @ti.kernel
    def saxpy(src: ti.template(), dst: ti.template(), alpha: ti.f32):
        for i, j in src:
            dst[i, j] = alpha * src[i, j] + dst[i, j]

    def task():
        initialize()
        saxpy(a, b, 2)
        saxpy(b, a, 1.1)
        saxpy(b, a, 1.1)
        saxpy(a, b, 1.1)
        saxpy(a, b, 1.1)
        saxpy(a, b, 1.1)

    ti.benchmark(task, repeat=10)
def autodiff(scale):
    """Benchmark reverse-mode autodiff of a trivial sum-reduction loss."""
    n = 1024**2 * scale
    a = ti.field(dtype=ti.f32, shape=n, needs_grad=True)
    b = ti.field(dtype=ti.f32, shape=n)
    loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)

    @ti.kernel
    def compute_loss():
        for i in a:
            loss[None] += a[i]

    @ti.kernel
    def accumulate_grad():
        for i in a:
            b[i] += a.grad[i]

    def task():
        with ti.Tape(loss=loss):
            # The forward kernel of compute_loss should be completely
            # eliminated (except for the last one)
            compute_loss()
        accumulate_grad()

    ti.benchmark(task, repeat=100)
def fill_scalar():
    """Benchmark storing a constant into a single 0-D scalar field."""
    cell = ti.field(dtype=ti.f32, shape=())

    @ti.kernel
    def fill():
        cell[None] = 1.0

    ti.benchmark(fill, repeat=1000)
def fill_1d():
    """Benchmark filling a large dense 1D field with a constant."""
    buf = ti.field(dtype=ti.f32, shape=100 * 1024**2)

    @ti.kernel
    def fill():
        for idx in buf:
            buf[idx] = 1.0

    ti.benchmark(fill, repeat=100)
def template_fuse_dense_x2y2z(size=1024**3,
                              repeat=10,
                              first_n=100,
                              benchmark=0,
                              benchmark_repeat=50):
    """Measure (or verify) kernel fusion of two chained elementwise kernels.

    With ``benchmark`` truthy, times the fused x->y->z pipeline; otherwise
    times each kernel separately and asserts the combined result on the
    first ``first_n`` elements.
    """
    x = ti.field(ti.i32, shape=(size, ))
    y = ti.field(ti.i32, shape=(size, ))
    z = ti.field(ti.i32, shape=(size, ))
    first_n = min(first_n, size)

    @ti.kernel
    def x_to_y():
        for i in x:
            y[i] = x[i] + 1

    @ti.kernel
    def y_to_z():
        # NOTE(review): iterates x rather than y; for these dense,
        # identically-shaped fields the index set is the same — presumably
        # written this way so both kernels share a loop structure and can
        # fuse. Confirm intent.
        for i in x:
            z[i] = y[i] + 4

    def x_to_y_to_z():
        x_to_y()
        y_to_z()

    for i in range(first_n):
        x[i] = i * 10

    if benchmark:
        ti.benchmark(x_to_y_to_z, repeat=benchmark_repeat)
    else:
        # Simply test
        for _ in range(repeat):
            t = time.time()
            x_to_y()
            ti.sync()
            print('x_to_y', time.time() - t)

        for _ in range(repeat):
            t = time.time()
            y_to_z()
            ti.sync()
            print('y_to_z', time.time() - t)

        for _ in range(repeat):
            t = time.time()
            x_to_y_to_z()
            ti.sync()
            print('fused x->y->z', time.time() - t)

        for i in range(first_n):
            assert x[i] == i * 10
            assert y[i] == x[i] + 1
            assert z[i] == x[i] + 5
def fill_scalar(scale):
    """Benchmark 1000 launches of a kernel writing one 0-D scalar.

    ``scale`` is accepted for interface uniformity with the other
    scaled benchmarks but does not affect the workload.
    """
    cell = ti.field(dtype=ti.f32, shape=())

    @ti.kernel
    def fill():
        cell[None] = 1.0

    def repeated_fill():
        for _ in range(1000):
            fill()

    ti.benchmark(repeated_fill, repeat=5)
def fill_1d(scale):
    """Benchmark 10 repeated fills of a dense 1D field sized by ``scale``."""
    buf = ti.field(dtype=ti.f32, shape=scale * 10 * 1024**2)

    @ti.kernel
    def fill():
        for idx in buf:
            buf[idx] = 1.0

    def repeated_fill():
        for _ in range(10):
            fill()

    ti.benchmark(repeated_fill, repeat=10)
def template_fuse_reduction(size=1024**3,
                            repeat=10,
                            first_n=100,
                            benchmark=0,
                            benchmark_repeat=50):
    """Measure (or verify) fusion of ``repeat`` back-to-back increment kernels.

    With ``benchmark`` truthy, times the repeated-increment loop; otherwise
    times single vs. batched launches and asserts the final values on the
    first ``first_n`` elements.
    """
    x = ti.field(ti.i32, shape=(size, ))
    first_n = min(first_n, size)

    @ti.kernel
    def reset():
        for i in range(first_n):
            x[i] = i * 10

    @ti.kernel
    def inc():
        for i in x:
            x[i] = x[i] + 1

    if benchmark:

        def repeated_inc():
            for _ in range(repeat):
                inc()

        ti.benchmark(repeated_inc, repeat=benchmark_repeat)
    else:
        # Simply test
        reset()
        ti.sync()
        # Time each launch separately (sync per launch).
        for _ in range(repeat):
            t = time.time()
            inc()
            ti.sync()
            print('single inc', time.time() - t)

        reset()
        ti.sync()
        # Time all launches back-to-back (one final sync) so they can fuse.
        t = time.time()
        for _ in range(repeat):
            inc()
        ti.sync()
        duration = time.time() - t
        print(
            f'fused {repeat} inc: total={duration} average={duration / repeat}'
        )

        for i in range(first_n):
            assert x[i] == i * 10 + repeat
def benchmark_fill_scalar():
    """Benchmark a kernel that stores a single scalar constant."""
    cell = ti.var(dt=ti.f32, shape=())

    @ti.kernel
    def fill():
        cell[None] = 1.0

    return ti.benchmark(fill)
def benchmark_memset():
    """Benchmark a constant fill (memset-like) over a 1D field of size N."""
    buf = ti.var(dt=ti.f32, shape=N)

    @ti.kernel
    def memset():
        for idx in buf:
            buf[idx] = 1.0

    return ti.benchmark(memset, repeat=10)
def benchmark_sscal():
    """Benchmark in-place scalar scaling (BLAS sscal-like) of a 1D field."""
    vec = ti.var(dt=ti.f32, shape=N)

    @ti.kernel
    def task():
        for idx in vec:
            vec[idx] = 0.5 * vec[idx]

    return ti.benchmark(task, repeat=10)
def benchmark_memcpy():
    """Benchmark an elementwise copy between two 1D fields of size N."""
    dst = ti.var(dt=ti.f32, shape=N)
    src = ti.var(dt=ti.f32, shape=N)

    @ti.kernel
    def memcpy():
        for idx in dst:
            dst[idx] = src[idx]

    return ti.benchmark(memcpy, repeat=10)
def benchmark_flat_struct():
    """Benchmark a struct-for fill over a flat dense 2D field."""
    N = 4096
    grid = ti.field(dtype=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for i, j in grid:
            grid[i, j] = 2.0

    return ti.benchmark(fill, repeat=500)
def benchmark_flat_range():
    """Benchmark an ndrange-based fill over a flat dense 2D field."""
    N = 4096
    grid = ti.field(dtype=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for i, j in ti.ndrange(N, N):
            grid[i, j] = 2.0

    return ti.benchmark(fill, repeat=700)
def benchmark_flat_range():
    """Benchmark an ndrange-based fill over a flat dense 2D field (legacy ti.var API)."""
    N = 4096
    grid = ti.var(dt=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for i, j in ti.ndrange(N, N):
            grid[i, j] = 2.0

    return ti.benchmark(fill)
def benchmark_flat_struct():
    """Benchmark a struct-for fill over a flat dense 2D field (legacy ti.var API)."""
    N = 4096
    grid = ti.var(dt=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for i, j in grid:
            grid[i, j] = 2.0

    return ti.benchmark(fill)
def benchmark_nested_struct():
    """Benchmark a struct-for fill over a two-level (8x8-blocked) dense layout."""
    grid = ti.field(dtype=ti.f32)
    N = 512
    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for i, j in grid:
            grid[i, j] = 2.0

    return ti.benchmark(fill, repeat=700)
def benchmark_saxpy():
    """Benchmark z = alpha * x + y over 1D fields of size N."""
    x = ti.var(dt=ti.f32, shape=N)
    y = ti.var(dt=ti.f32, shape=N)
    z = ti.var(dt=ti.f32, shape=N)

    @ti.kernel
    def task():
        for i in x:
            coeff = 123
            z[i] = coeff * x[i] + y[i]

    return ti.benchmark(task, repeat=10)
def benchmark_nested_struct_listgen_8x8():
    """Benchmark an 8x8-blocked struct-for with list generation forced on."""
    grid = ti.var(dt=ti.f32)
    # Keep the real struct-for (and its list generation) instead of
    # demoting it to a range-for over the dense domain.
    ti.cfg.demote_dense_struct_fors = False
    N = 512
    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for i, j in grid:
            grid[i, j] = 2.0

    return ti.benchmark(fill, repeat=1000)
def benchmark_root_listgen():
    """Benchmark a struct-for over the parent (block-level) SNode."""
    grid = ti.field(dtype=ti.f32)
    # Keep the real struct-for (and its list generation) instead of
    # demoting it to a range-for over the dense domain.
    ti.cfg.demote_dense_struct_fors = False
    N = 512
    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for i, j in grid.parent():
            grid[i, j] = 2.0

    return ti.benchmark(fill, repeat=800)
def benchmark_nested_struct_listgen_16x16():
    """Benchmark a 16x16-blocked struct-for with list generation forced on."""
    grid = ti.field(dtype=ti.f32)
    # Keep the real struct-for (and its list generation) instead of
    # demoting it to a range-for over the dense domain.
    ti.cfg.demote_dense_struct_fors = False
    N = 256
    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [16, 16]).place(grid)

    @ti.kernel
    def fill():
        for i, j in grid:
            grid[i, j] = 2.0

    return ti.benchmark(fill, repeat=700)
def benchmark_nested_range_blocked():
    """Benchmark a blocked range-for fill matching the 8x8 layout.

    The outer index picks a block, the inner index a cell within the
    block's 8x8 footprint.
    """
    grid = ti.field(dtype=ti.f32)
    N = 512
    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for X in range(N * N):
            for Y in range(64):
                grid[X // N * 8 + Y // 8, X % N * 8 + Y % 8] = 2.0

    return ti.benchmark(fill, repeat=800)
def benchmark_nested_range():
    """Benchmark a plain nested range-for fill over an 8x8-blocked layout."""
    grid = ti.field(dtype=ti.f32)
    N = 512
    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for j in range(N * 8):
            for i in range(N * 8):
                grid[i, j] = 2.0

    return ti.benchmark(fill, repeat=1000)
def benchmark_nested_struct():
    """Benchmark an ndrange fill over a pointer-blocked sparse layout."""
    grid = ti.field(dtype=ti.f32)
    N = 512
    ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for i, j in ti.ndrange(N * 8, N * 8):
            grid[i, j] = 2.0

    # Warm-up launch: activates all blocks before timing starts.
    fill()
    return ti.benchmark(fill)
def benchmark_nested_range():
    """Benchmark a nested range-for fill (legacy ti.var / ti.layout API)."""
    grid = ti.var(dt=ti.f32)
    N = 512

    @ti.layout
    def place():
        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for j in range(N * 8):
            for i in range(N * 8):
                grid[i, j] = 2.0

    return ti.benchmark(fill)
def benchmark_nested_struct():
    """Benchmark a struct-for fill (legacy ti.var / ti.layout API)."""
    grid = ti.var(dt=ti.f32)
    N = 512

    @ti.layout
    def place():
        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for i, j in grid:
            grid[i, j] = 2.0

    # Warm-up launch before timing starts.
    fill()
    return ti.benchmark(fill)
def benchmark_nested_range_blocked():
    """Benchmark a blocked range-for fill (legacy ti.var / ti.layout API).

    The outer index picks an 8x8 block; the inner index picks the cell
    within that block.
    """
    grid = ti.var(dt=ti.f32)
    N = 512

    @ti.layout
    def place():
        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for X in range(N * N):
            for Y in range(64):
                grid[X // N * 8 + Y // 8, X % N * 8 + Y % 8] = 2.0

    # Warm-up launch before timing starts.
    fill()
    return ti.benchmark(fill)