# Example 1
def fill(arch, dtype, dsize, repeat=10):
    """Benchmark a constant-fill kernel and return its minimum profiled time.

    Args:
        arch: target Taichi architecture, forwarded to ``scale_repeat``.
        dtype: Taichi element type of the field being filled.
        dsize: total data size in bytes; element count is dsize / sizeof(dtype).
        repeat: base repetition count, rescaled by ``scale_repeat``.

    Returns:
        The minimum kernel time reported by the Taichi kernel profiler.
    """
    repeat = scale_repeat(arch, dsize, repeat)
    num_elements = dsize // dtype_size[dtype]

    x = ti.field(dtype, shape=num_elements)

    # Pick a fill constant whose literal type matches the field's dtype.
    if dtype in [ti.f32, ti.f64]:

        @ti.kernel
        def fill_const(n: ti.i32):
            for i in range(n):
                x[i] = 0.1
    else:

        @ti.kernel
        def fill_const(n: ti.i32):
            for i in range(n):
                x[i] = 1

    # One untimed call so JIT compilation is excluded from the measurement.
    fill_const(num_elements)
    ti.sync()
    ti.kernel_profiler_clear()
    ti.sync()
    for _ in range(repeat):
        fill_const(num_elements)
    ti.sync()

    # Profiler entries carry a compiled-kernel suffix appended to the name.
    profiled = ti.query_kernel_profiler(fill_const.__name__ + "_c")
    return profiled.min
# Example 2
def reduction(arch, dtype, dsize, repeat=10):
    """Benchmark a sum-reduction kernel and return its minimum profiled time.

    Args:
        arch: target Taichi architecture, forwarded to ``scale_repeat``.
        dtype: Taichi element type of the source field.
        dsize: total data size in bytes; element count is dsize / sizeof(dtype).
        repeat: base repetition count, rescaled by ``scale_repeat``.

    Returns:
        The minimum kernel time reported by the Taichi kernel profiler.
    """
    repeat = scale_repeat(arch, dsize, repeat)
    n = dsize // dtype_size[dtype]

    ## fill x
    x = ti.field(dtype, shape=n)

    # Fill constant must match the field dtype (float vs. integer literal).
    if dtype in [ti.f32, ti.f64]:

        @ti.kernel
        def fill_const(n: ti.i32):
            for i in range(n):
                x[i] = 0.1
    else:

        @ti.kernel
        def fill_const(n: ti.i32):
            for i in range(n):
                x[i] = 1

    # compile the kernel first
    fill_const(n)

    ## reduce into a 0-D accumulator field
    y = ti.field(dtype, shape=())
    if dtype in [ti.f32, ti.f64]:
        y[None] = 0.0
    else:
        y[None] = 0

    @ti.kernel
    def reduction(n: ti.i32):
        for i in range(n):
            # BUGFIX: the original `y[None] += ti.atomic_add(y[None], x[i])`
            # accumulated twice — atomic_add already performs y += x[i] and
            # returns the OLD value of y, which was then added again.
            # `+=` on a 0-D field is atomic inside a Taichi kernel.
            y[None] += x[i]

    # compile the kernel first
    reduction(n)
    ti.sync()
    ti.kernel_profiler_clear()
    ti.sync()
    for i in range(repeat):
        reduction(n)
    ti.sync()
    kernelname = reduction.__name__
    suffix = "_c"  # profiler entries carry a compiled-kernel suffix
    quering_result = ti.query_kernel_profiler(kernelname + suffix)
    return quering_result.min
# Example 3
    def run_benchmark():
        """Compile, warm up, and time ``func(*args)``, recording stats via ti.stat_write.

        Records: compilation_time, compiled_inst / compiled_tasks /
        launched_tasks (parsed from the codegen stat dump), wall_clk_t
        (average wall-clock per repeat) and exec_t (average device time).
        """
        compile_time = time.time()
        func(*args)  # compile the kernel first
        ti.sync()
        compile_time = time.time() - compile_time
        ti.stat_write('compilation_time', compile_time)
        codegen_stat = _ti_core.stat()
        for line in codegen_stat.split('\n'):
            # Each stat line is "name: value"; skip lines that don't parse.
            try:
                a, b = line.strip().split(':')
            except ValueError:  # was a bare except: — only malformed lines are expected here
                continue
            a = a.strip()
            b = int(float(b))
            # Keys are mutually exclusive, so a single if/elif chain suffices.
            if a == 'codegen_kernel_statements':
                ti.stat_write('compiled_inst', b)
            elif a == 'codegen_offloaded_tasks':
                ti.stat_write('compiled_tasks', b)
            elif a == 'launched_tasks':
                ti.stat_write('launched_tasks', b)

        # Use 3 initial iterations to warm up
        # instruction/data caches. Discussion:
        # https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
        for i in range(3):
            func(*args)
            ti.sync()
        ti.kernel_profiler_clear()
        t = time.time()
        for n in range(repeat):
            func(*args)
            ti.sync()
        elapsed = time.time() - t
        avg = elapsed / repeat
        ti.stat_write('wall_clk_t', avg)
        device_time = ti.kernel_profiler_total_time()
        avg_device_time = device_time / repeat
        ti.stat_write('exec_t', avg_device_time)
# Example 4
def benchmark():
    """Time the forward and backward MPM kernel pipelines and print averages.

    Reports per-kernel-substep milliseconds (hence the ``* 3``: each
    iteration launches three kernels) plus the Taichi profiler breakdown.
    """
    print(
        'Also check "nvprof --print-gpu-trace python3 diffmpm_benchmark.py" for more accurate results'
    )
    iters = 100000

    # --- forward pass ---
    # One untimed pass so kernel compilation is excluded from the timing.
    p2g(0)
    grid_op()
    g2p(0)
    ti.sync()
    ti.kernel_profiler_clear()
    start = time.time()
    for _ in range(iters):
        # clear_grid()
        p2g(0)
        grid_op()
        g2p(0)
    ti.sync()
    print('forward ', (time.time() - start) / iters * 1000 * 3, 'ms')
    ti.kernel_profiler_print()

    # --- backward pass ---
    # Warm-up/compile call for the gradient kernels.
    p2g.grad(0)
    grid_op.grad()
    g2p.grad(0)
    ti.sync()
    ti.kernel_profiler_clear()
    start = time.time()
    for _ in range(iters):
        # clear_grid()
        # Gradients run in reverse kernel order, as in backpropagation.
        g2p.grad(0)
        grid_op.grad()
        p2g.grad(0)
    ti.sync()
    print('backward ', (time.time() - start) / iters * 1000 * 3, 'ms')
    ti.kernel_profiler_print()