Exemple #1
0
def deep_hierarchy(scale):
    """Benchmark repeated ``jitter`` launches on a deeply nested pointer tree.

    ``x`` is placed under a chain of ``pointer`` SNodes along ``ti.i``
    (fan-out 4 each) that ends in a single ``dense`` leaf level.

    Args:
        scale: Multiplier for the element count (``256 * 1024 * scale``).
            It also adds ``int(log(scale, 4))`` levels on top of the base 8.
    """
    fanout = 4
    depth = 8 + int(math.log(scale, fanout))

    x = ti.field(dtype=ti.f32)

    n = 256 * 1024 * scale
    cells_above_leaf = fanout**depth

    # The dense leaf size below is n // cells_above_leaf, so it must divide.
    assert n % cells_above_leaf == 0

    node = ti.root
    for _ in range(depth):
        node = node.pointer(ti.i, fanout)

    leaf = node.dense(ti.i, n // cells_above_leaf)
    leaf.place(x)

    # Writes every index in [0, n) once before the benchmark starts.
    @ti.kernel
    def initialize():
        for idx in range(n):
            x[idx] = 0

    initialize()

    # Not fusible, but no modification to the mask/list of x either
    @ti.kernel
    def jitter():
        for idx in x:
            if idx % 2 == 0:
                x[idx] += x[idx + 1]

    def task():
        # One benchmark sample is five back-to-back jitter launches.
        for _ in range(5):
            jitter()

    ti.benchmark(task, repeat=5)
Exemple #2
0
def stencil_reduction():
    """Benchmark an initialize -> stencil -> reduce pipeline on 1-D fields.

    ``a`` and ``b`` are placed together under ``pointer(block_count)`` ->
    ``dense(block_size)``; ``total`` is a 0-D accumulator. Each benchmarked
    task runs the three kernels twice in sequence.
    """
    a = ti.field(dtype=ti.f32)
    b = ti.field(dtype=ti.f32)
    total = ti.field(dtype=ti.f32, shape=())

    block_count = 1024
    block_size = 1024
    # a, b always share the same sparsity
    ti.root.pointer(ti.i, block_count).dense(ti.i, block_size).place(a, b)

    @ti.kernel
    def initialize():
        # Write only blocks 1 .. block_count - 2, i.e. skip the first and the
        # last block entirely. Presumably this keeps the a[i - 1] / a[i + 1]
        # reads in `stencil` within [0, block_size * block_count).
        # The bound was previously (block_size - 1) * block_count, which
        # equals this one only while block_size == block_count.
        for i in range(block_size, block_size * (block_count - 1)):
            a[i] = i

    @ti.kernel
    def stencil():
        # Three-point sum of `a` written into `b` at the same index.
        for i in a:
            b[i] = a[i - 1] + a[i] + a[i + 1]

    @ti.kernel
    def reduce():
        # Accumulates b into the scalar; `total` is never reset between runs.
        for i in a:
            total[None] += b[i]

    def task():
        for _ in range(2):
            initialize()
            stencil()
            reduce()

    ti.benchmark(task, repeat=100)
Exemple #3
0
def multires(scale):
    """Benchmark a chain of 2x downsampling kernels over four 1-D fields.

    Field ``k`` is placed under ``pointer -> pointer(16) -> dense(16)`` with
    a top-level extent of ``n // 2**k // 256``.

    Args:
        scale: Multiplier for the finest-level size (``512 * 1024 * scale``).
    """
    num_levels = 4

    x = [ti.field(dtype=ti.f32) for _ in range(num_levels)]

    # TODO: Using 1024 instead of 512 hangs the CUDA case. Need to figure out why.
    n = 512 * 1024 * scale

    block_size = 16
    cells_per_top_block = block_size**2
    assert n % cells_per_top_block == 0

    for level, field in enumerate(x):
        top = ti.root.pointer(ti.i, n // 2**level // cells_per_top_block)
        middle = top.pointer(ti.i, block_size)
        middle.dense(ti.i, block_size).place(field)

    # Fills the finest level with its own index.
    @ti.kernel
    def initialize():
        for idx in range(n):
            x[0][idx] = idx

    # Copies every even-indexed cell of level `lvl` to index // 2 on the
    # next level.
    @ti.kernel
    def downsample(lvl: ti.template()):
        for idx in x[lvl]:
            if idx % 2 == 0:
                x[lvl + 1][idx // 2] = x[lvl][idx]

    initialize()

    def task():
        for lvl in range(num_levels - 1):
            downsample(lvl)

    ti.benchmark(task, repeat=5)
Exemple #4
0
def sparse_numpy(scale):
    """Benchmark a fixed sequence of saxpy kernels on two 2-D fields.

    ``a`` and ``b`` share one ``pointer(block_count) -> dense(32)`` layout
    over ``ti.ij``.

    Args:
        scale: Controls the pointer extent per axis:
            ``2 ** int(log2(scale) // 2) * 64``.
    """
    import math
    a = ti.field(dtype=ti.f32)
    b = ti.field(dtype=ti.f32)

    half_log = int((math.log(scale, 2)) // 2)
    block_count = 2**half_log * 64
    block_size = 32
    # a, b always share the same sparsity
    blocks = ti.root.pointer(ti.ij, block_count)
    blocks.dense(ti.ij, block_size).place(a, b)

    # Writes `a` only in blocks whose block coordinates sum to a multiple of 4.
    @ti.kernel
    def initialize():
        for i, j in ti.ndrange(block_count * block_size,
                               block_count * block_size):
            if (i // block_size + j // block_size) % 4 == 0:
                a[i, j] = i + j

    # y <- alpha * x + y over the cells visited by the struct-for on x.
    @ti.kernel
    def saxpy(x: ti.template(), y: ti.template(), alpha: ti.f32):
        for i, j in x:
            y[i, j] = alpha * x[i, j] + y[i, j]

    # (source, destination, alpha) triples, in launch order.
    schedule = (
        (a, b, 2),
        (b, a, 1.1),
        (b, a, 1.1),
        (a, b, 1.1),
        (a, b, 1.1),
        (a, b, 1.1),
    )

    def task():
        initialize()
        for src, dst, alpha in schedule:
            saxpy(src, dst, alpha)

    ti.benchmark(task, repeat=10)
Exemple #5
0
def autodiff(scale):
    """Benchmark a taped loss computation followed by a gradient read-out.

    Args:
        scale: Multiplier for the field length (``1024**2 * scale``).
    """
    n = 1024**2 * scale

    loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
    a = ti.field(dtype=ti.f32, shape=n, needs_grad=True)
    b = ti.field(dtype=ti.f32, shape=n)

    # loss <- loss + sum(a)
    @ti.kernel
    def compute_loss():
        for idx in a:
            loss[None] += a[idx]

    # b <- b + d(loss)/d(a)
    @ti.kernel
    def accumulate_grad():
        for idx in a:
            b[idx] += a.grad[idx]

    def task():
        with ti.Tape(loss=loss):
            # The forward kernel of compute_loss should be completely eliminated (except for the last one)
            compute_loss()

        accumulate_grad()

    ti.benchmark(task, repeat=100)
Exemple #6
0
def fill_scalar():
    """Benchmark a kernel that stores 1.0 into a 0-D f32 field."""
    scalar = ti.field(dtype=ti.f32, shape=())

    @ti.kernel
    def fill():
        scalar[None] = 1.0

    ti.benchmark(fill, repeat=1000)
Exemple #7
0
def fill_1d():
    """Benchmark filling a 1-D f32 field of ``100 * 1024**2`` cells with 1.0."""
    length = 100 * 1024**2
    buf = ti.field(dtype=ti.f32, shape=length)

    @ti.kernel
    def fill():
        for idx in buf:
            buf[idx] = 1.0

    ti.benchmark(fill, repeat=100)
def template_fuse_dense_x2y2z(size=1024**3,
                              repeat=10,
                              first_n=100,
                              benchmark=0,
                              benchmark_repeat=50):
    """Time or benchmark the kernel pair ``y = x + 1`` and ``z = y + 4``.

    Args:
        size: Length of each of the three i32 fields.
        repeat: Number of timed launches per case in test mode.
        first_n: Number of leading cells seeded with ``i * 10`` and checked
            afterwards (clamped to ``size``).
        benchmark: If truthy, run ``ti.benchmark`` on the two-kernel sequence
            instead of the timed test.
        benchmark_repeat: ``repeat`` argument forwarded to ``ti.benchmark``.
    """
    shape = (size, )
    x = ti.field(ti.i32, shape=shape)
    y = ti.field(ti.i32, shape=shape)
    z = ti.field(ti.i32, shape=shape)
    first_n = min(first_n, size)

    @ti.kernel
    def x_to_y():
        for idx in x:
            y[idx] = x[idx] + 1

    @ti.kernel
    def y_to_z():
        for idx in x:
            z[idx] = y[idx] + 4

    def x_to_y_to_z():
        x_to_y()
        y_to_z()

    # Seed the leading cells from Python scope.
    for idx in range(first_n):
        x[idx] = idx * 10

    if benchmark:
        ti.benchmark(x_to_y_to_z, repeat=benchmark_repeat)
        return

    def report(label, launch):
        # Print the wall-clock time of `repeat` individually synced launches.
        for _ in range(repeat):
            began = time.time()
            launch()
            ti.sync()
            print(label, time.time() - began)

    # Simply test
    report('x_to_y', x_to_y)
    report('y_to_z', y_to_z)
    report('fused x->y->z', x_to_y_to_z)

    for idx in range(first_n):
        assert x[idx] == idx * 10
        assert y[idx] == x[idx] + 1
        assert z[idx] == x[idx] + 5
Exemple #9
0
def fill_scalar(scale):
    """Benchmark 1000 consecutive launches of a scalar-store kernel.

    Args:
        scale: Accepted for a uniform call signature; not used here.
    """
    scalar = ti.field(dtype=ti.f32, shape=())

    @ti.kernel
    def fill():
        scalar[None] = 1.0

    def repeated_fill():
        launches = 1000
        for _ in range(launches):
            fill()

    ti.benchmark(repeated_fill, repeat=5)
Exemple #10
0
def fill_1d(scale):
    """Benchmark 10 consecutive fills of a 1-D f32 field with 1.0.

    Args:
        scale: Multiplier for the field length (``scale * 10 * 1024**2``).
    """
    length = scale * 10 * 1024**2
    buf = ti.field(dtype=ti.f32, shape=length)

    @ti.kernel
    def fill():
        for idx in buf:
            buf[idx] = 1.0

    def repeated_fill():
        launches = 10
        for _ in range(launches):
            fill()

    ti.benchmark(repeated_fill, repeat=10)
def template_fuse_reduction(size=1024**3,
                            repeat=10,
                            first_n=100,
                            benchmark=0,
                            benchmark_repeat=50):
    """Time or benchmark repeated launches of an in-place increment kernel.

    Args:
        size: Length of the i32 field.
        repeat: Number of ``inc`` launches per timed or benchmarked batch.
        first_n: Number of leading cells reset to ``i * 10`` and checked
            afterwards (clamped to ``size``).
        benchmark: If truthy, run ``ti.benchmark`` on a batch of ``repeat``
            increments instead of the timed test.
        benchmark_repeat: ``repeat`` argument forwarded to ``ti.benchmark``.
    """
    x = ti.field(ti.i32, shape=(size, ))
    first_n = min(first_n, size)

    @ti.kernel
    def reset():
        for idx in range(first_n):
            x[idx] = idx * 10

    @ti.kernel
    def inc():
        for idx in x:
            x[idx] = x[idx] + 1

    if benchmark:

        def repeated_inc():
            for _ in range(repeat):
                inc()

        ti.benchmark(repeated_inc, repeat=benchmark_repeat)
        return

    # Simply test
    # Pass 1: sync after every launch and report each one separately.
    reset()
    ti.sync()
    for _ in range(repeat):
        began = time.time()
        inc()
        ti.sync()
        print('single inc', time.time() - began)

    # Pass 2: issue the whole batch, sync once, report total and average.
    reset()
    ti.sync()
    began = time.time()
    for _ in range(repeat):
        inc()
    ti.sync()
    duration = time.time() - began
    print(
        f'fused {repeat} inc: total={duration} average={duration / repeat}'
    )

    # After the second reset each checked cell was incremented `repeat` times.
    for idx in range(first_n):
        assert x[idx] == idx * 10 + repeat
Exemple #12
0
def benchmark_fill_scalar():
    """Benchmark storing 1.0 into a 0-D f32 variable.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    scalar = ti.var(dt=ti.f32, shape=())

    @ti.kernel
    def fill():
        scalar[None] = 1.0

    result = ti.benchmark(fill)
    return result
Exemple #13
0
def benchmark_memset():
    """Benchmark writing 1.0 to every cell of an f32 variable of shape ``N``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    dst = ti.var(dt=ti.f32, shape=N)

    @ti.kernel
    def memset():
        for idx in dst:
            dst[idx] = 1.0

    result = ti.benchmark(memset, repeat=10)
    return result
Exemple #14
0
def benchmark_sscal():
    """Benchmark scaling an f32 variable of shape ``N`` by 0.5 in place.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    vec = ti.var(dt=ti.f32, shape=N)

    @ti.kernel
    def task():
        for idx in vec:
            vec[idx] = 0.5 * vec[idx]

    result = ti.benchmark(task, repeat=10)
    return result
Exemple #15
0
def benchmark_memcpy():
    """Benchmark an element-wise copy between two f32 variables of shape ``N``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    dst = ti.var(dt=ti.f32, shape=N)
    src = ti.var(dt=ti.f32, shape=N)

    @ti.kernel
    def memcpy():
        for idx in dst:
            dst[idx] = src[idx]

    result = ti.benchmark(memcpy, repeat=10)
    return result
Exemple #16
0
def benchmark_flat_struct():
    """Benchmark filling a 4096 x 4096 f32 field through a struct-for.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 4096
    grid = ti.field(dtype=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for row, col in grid:
            grid[row, col] = 2.0

    result = ti.benchmark(fill, repeat=500)
    return result
Exemple #17
0
def benchmark_flat_range():
    """Benchmark filling a 4096 x 4096 f32 field through ``ti.ndrange``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 4096
    grid = ti.field(dtype=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for row, col in ti.ndrange(N, N):
            grid[row, col] = 2.0

    result = ti.benchmark(fill, repeat=700)
    return result
Exemple #18
0
def benchmark_flat_range():
    """Benchmark filling a 4096 x 4096 f32 variable through ``ti.ndrange``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 4096
    grid = ti.var(dt=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for row, col in ti.ndrange(N, N):
            grid[row, col] = 2.0

    result = ti.benchmark(fill)
    return result
Exemple #19
0
def benchmark_flat_struct():
    """Benchmark filling a 4096 x 4096 f32 variable through a struct-for.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 4096
    grid = ti.var(dt=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for row, col in grid:
            grid[row, col] = 2.0

    result = ti.benchmark(fill)
    return result
Exemple #20
0
def benchmark_nested_struct():
    """Benchmark a struct-for fill over a ``dense(512^2) -> dense(8^2)`` field.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.field(dtype=ti.f32)

    outer = ti.root.dense(ti.ij, [N, N])
    inner = outer.dense(ti.ij, [8, 8])
    inner.place(grid)

    @ti.kernel
    def fill():
        for row, col in grid:
            grid[row, col] = 2.0

    result = ti.benchmark(fill, repeat=700)
    return result
Exemple #21
0
def benchmark_saxpy():
    """Benchmark ``z = 123 * x + y`` over three f32 variables of shape ``N``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    x = ti.var(dt=ti.f32, shape=N)
    y = ti.var(dt=ti.f32, shape=N)
    z = ti.var(dt=ti.f32, shape=N)

    @ti.kernel
    def task():
        for idx in x:
            # The multiplier is assigned inside the loop body, as before.
            coeff = 123
            z[idx] = coeff * x[idx] + y[idx]

    result = ti.benchmark(task, repeat=10)
    return result
Exemple #22
0
def benchmark_nested_struct_listgen_8x8():
    """Benchmark a struct-for fill with ``demote_dense_struct_fors`` disabled.

    The variable is laid out as ``dense(512^2) -> dense(8^2)`` over ``ti.ij``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.var(dt=ti.f32)
    ti.cfg.demote_dense_struct_fors = False

    outer = ti.root.dense(ti.ij, [N, N])
    inner = outer.dense(ti.ij, [8, 8])
    inner.place(grid)

    @ti.kernel
    def fill():
        for row, col in grid:
            grid[row, col] = 2.0

    result = ti.benchmark(fill, repeat=1000)
    return result
Exemple #23
0
def benchmark_root_listgen():
    """Benchmark a struct-for over ``a.parent()`` with demotion disabled.

    The field is laid out as ``dense(512^2) -> dense(8^2)`` over ``ti.ij``
    and ``ti.cfg.demote_dense_struct_fors`` is set to ``False``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.field(dtype=ti.f32)
    ti.cfg.demote_dense_struct_fors = False

    outer = ti.root.dense(ti.ij, [N, N])
    inner = outer.dense(ti.ij, [8, 8])
    inner.place(grid)

    @ti.kernel
    def fill():
        # Iterates the field's parent SNode rather than the field itself.
        for row, col in grid.parent():
            grid[row, col] = 2.0

    result = ti.benchmark(fill, repeat=800)
    return result
Exemple #24
0
def benchmark_nested_struct_listgen_16x16():
    """Benchmark a struct-for fill with ``demote_dense_struct_fors`` disabled.

    The field is laid out as ``dense(256^2) -> dense(16^2)`` over ``ti.ij``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 256
    grid = ti.field(dtype=ti.f32)
    ti.cfg.demote_dense_struct_fors = False

    outer = ti.root.dense(ti.ij, [N, N])
    inner = outer.dense(ti.ij, [16, 16])
    inner.place(grid)

    @ti.kernel
    def fill():
        for row, col in grid:
            grid[row, col] = 2.0

    result = ti.benchmark(fill, repeat=700)
    return result
Exemple #25
0
def benchmark_nested_range_blocked():
    """Benchmark a range-for fill that walks the field one 8 x 8 block at a time.

    The field is laid out as ``dense(512^2) -> dense(8^2)`` over ``ti.ij``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.field(dtype=ti.f32)

    outer = ti.root.dense(ti.ij, [N, N])
    inner = outer.dense(ti.ij, [8, 8])
    inner.place(grid)

    @ti.kernel
    def fill():
        # `block` enumerates the N * N outer cells, `cell` the 64 inner ones.
        for block in range(N * N):
            for cell in range(64):
                grid[block // N * 8 + cell // 8,
                     block % N * 8 + cell % 8] = 2.0

    result = ti.benchmark(fill, repeat=800)
    return result
Exemple #26
0
def benchmark_nested_range():
    """Benchmark a two-level range-for fill with ``j`` as the outer index.

    The field is laid out as ``dense(512^2) -> dense(8^2)`` over ``ti.ij``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.field(dtype=ti.f32)

    outer = ti.root.dense(ti.ij, [N, N])
    inner = outer.dense(ti.ij, [8, 8])
    inner.place(grid)

    @ti.kernel
    def fill():
        for col in range(N * 8):
            for row in range(N * 8):
                grid[row, col] = 2.0

    result = ti.benchmark(fill, repeat=1000)
    return result
def benchmark_nested_struct():
    """Benchmark an ``ndrange`` fill over a ``pointer(512^2) -> dense(8^2)`` field.

    The kernel is launched once before it is handed to ``ti.benchmark``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.field(dtype=ti.f32)

    blocks = ti.root.pointer(ti.ij, [N, N])
    cells = blocks.dense(ti.ij, [8, 8])
    cells.place(grid)

    @ti.kernel
    def fill():
        for row, col in ti.ndrange(N * 8, N * 8):
            grid[row, col] = 2.0

    # Warm-up launch ahead of the measured runs.
    fill()

    result = ti.benchmark(fill)
    return result
Exemple #28
0
def benchmark_nested_range():
    """Benchmark a two-level range-for fill with ``j`` as the outer index.

    The variable's ``dense(512^2) -> dense(8^2)`` layout is declared in a
    ``@ti.layout`` function.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.var(dt=ti.f32)

    @ti.layout
    def place():
        outer = ti.root.dense(ti.ij, [N, N])
        outer.dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for col in range(N * 8):
            for row in range(N * 8):
                grid[row, col] = 2.0

    result = ti.benchmark(fill)
    return result
Exemple #29
0
def benchmark_nested_struct():
    """Benchmark a struct-for fill over a ``dense(512^2) -> dense(8^2)`` variable.

    The layout is declared in a ``@ti.layout`` function and the kernel is
    launched once before it is handed to ``ti.benchmark``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.var(dt=ti.f32)

    @ti.layout
    def place():
        outer = ti.root.dense(ti.ij, [N, N])
        outer.dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        for row, col in grid:
            grid[row, col] = 2.0

    # Warm-up launch ahead of the measured runs.
    fill()

    result = ti.benchmark(fill)
    return result
Exemple #30
0
def benchmark_nested_range_blocked():
    """Benchmark a range-for fill that walks the variable one 8 x 8 block at a time.

    The ``dense(512^2) -> dense(8^2)`` layout is declared in a ``@ti.layout``
    function and the kernel is launched once before it is handed to
    ``ti.benchmark``.

    Returns:
        Whatever ``ti.benchmark`` returns for the kernel.
    """
    N = 512
    grid = ti.var(dt=ti.f32)

    @ti.layout
    def place():
        outer = ti.root.dense(ti.ij, [N, N])
        outer.dense(ti.ij, [8, 8]).place(grid)

    @ti.kernel
    def fill():
        # `block` enumerates the N * N outer cells, `cell` the 64 inner ones.
        for block in range(N * N):
            for cell in range(64):
                grid[block // N * 8 + cell // 8,
                     block % N * 8 + cell % 8] = 2.0

    # Warm-up launch ahead of the measured runs.
    fill()

    result = ti.benchmark(fill)
    return result