def step(self, frame_dt, print_stat=False):
    """Advance the simulation by one frame, split into equal substeps.

    The frame is cut into ``int(frame_dt / self.default_dt) + 1`` substeps,
    each of length ``frame_dt / substeps``. Every substep deactivates the
    grid, rebuilds the particle id lists, runs P2G, grid normalization and
    gravity, the registered grid post-processors, and finally G2P.

    Args:
        frame_dt: Duration of the frame to simulate.
        print_stat: When true, print the Taichi kernel profile, the memory
            profile (best effort), the particle count and wall-clock
            timings for this frame.
    """
    begin_t = time.time()
    begin_substep = self.total_substeps

    substeps = int(frame_dt / self.default_dt) + 1

    for _ in range(substeps):
        self.total_substeps += 1
        dt = frame_dt / substeps

        self.grid.deactivate_all()
        self.build_pid()
        self.p2g(dt)
        self.grid_normalization_and_gravity(dt)
        for p in self.grid_postprocess:
            p(dt)
        self.g2p(dt)

    if print_stat:
        ti.kernel_profiler_print()
        try:
            ti.memory_profiler_print()
        except Exception:
            # Best effort: a failing memory report must not abort the step.
            # Only ordinary exceptions are suppressed, so KeyboardInterrupt
            # and SystemExit still propagate (a bare ``except`` hid them).
            pass
        print(f'num particles={self.n_particles[None]}')
        print(f' frame time {time.time() - begin_t:.3f} s')
        print(
            f' substep time {1000 * (time.time() - begin_t) / (self.total_substeps - begin_substep):.3f} ms'
        )
def main():
    """Run the interactive simulation until the window closes or ESC is hit.

    Each displayed frame first runs ``int(1e-2 // dt)`` substeps. A substep
    clears the grid mass and velocity fields, evaluates the total energy
    under ``ti.Tape`` so that ``x.grad`` is available, and then runs the
    p2g / grid_op / g2p kernels. Afterwards the particles and the element
    edges are drawn. The kernel profile is printed once the loop ends.
    """
    initialize()

    # Snapshot of the vertex-index field, taken once before the loop.
    vertices_ = vertices.to_numpy()

    while True:
        if not gui.running or gui.get_event(gui.ESCAPE):
            break

        for _ in range(int(1e-2 // dt)):
            grid_m.fill(0)
            grid_v.fill(0)
            # The total energy is differentiated w.r.t. the particle
            # positions: F = - \partial (total_energy) / \partial x.
            with ti.Tape(total_energy):
                # Forward energy evaluation; the tape back-propagates into
                # x.grad, which p2g consumes. The energy value itself does
                # not need to be read.
                compute_total_energy()
            p2g()
            grid_op()
            g2p()

        gui.circle((0.5, 0.5), radius=45, color=0x068587)

        particle_pos = x.to_numpy()
        flat_len = n_elements * 3
        # Pair every vertex index with its neighbour rolled by one along
        # axis 1; each (start, end) pair is drawn as one line segment.
        edge_start = vertices_.reshape(flat_len)
        edge_end = np.roll(vertices_, shift=1, axis=1).reshape(flat_len)
        gui.lines(particle_pos[edge_start],
                  particle_pos[edge_end],
                  radius=1,
                  color=0x4FB99F)
        gui.circles(particle_pos, radius=1.5, color=0xF2B134)

        # Horizontal white line across the window at height 0.03 / quality.
        line_y = 0.03 / quality
        gui.line((0.00, line_y), (1.0, line_y), color=0xFFFFFF, radius=3)
        gui.show()

    ti.kernel_profiler_print()
def run(self):
    """Initialize, solve with at most 400 iterations, and show the result.

    After the solve, the solution is painted into ``self.pixels``, shown
    with ``ti.imshow`` and the kernel profile is printed.
    """
    window_res = (self.N_gui, self.N_gui)
    # The window is created here but not referenced again in this method;
    # the final image is displayed through ti.imshow below.
    gui = ti.GUI("Multigrid Preconditioned Conjugate Gradients",
                 res=window_res)

    self.init()
    self.solve(max_iters=400)
    self.paint()

    ti.imshow(self.pixels)
    ti.kernel_profiler_print()
def step(self, frame_dt, print_stat=False):
    """Advance the simulation by one frame, split into equal substeps.

    The frame is cut into ``int(frame_dt / self.default_dt) + 1`` substeps
    of length ``frame_dt / substeps``. A dot is printed per substep as a
    progress indicator, followed by a newline at the end of the frame.

    Two substep pipelines exist, selected by ``self.use_g2p2g``:

    * g2p2g: two grids are used in alternation. The output grid
      (``1 - self.input_grid``) is deactivated, particle ids are built for
      the input grid, ``g2p2g`` transfers from the input to the output
      grid, and the grids then swap roles.
    * otherwise: the single grid is deactivated, then ``build_pid``,
      ``p2g``, grid normalization/gravity, post-processors and ``g2p``.

    In both pipelines ``self.t`` advances by ``dt`` and
    ``self.all_time_max_velocity`` is updated after every substep.

    Args:
        frame_dt: Duration of the frame to simulate.
        print_stat: When true, print the substep count, the Taichi kernel
            profile, the memory profile (best effort), a CFL estimate, the
            particle count and wall-clock timings.
    """
    begin_t = time.time()
    begin_substep = self.total_substeps

    substeps = int(frame_dt / self.default_dt) + 1

    if print_stat:
        print(f'needed substeps: {substeps}')

    for _ in range(substeps):
        print('.', end='', flush=True)
        self.total_substeps += 1
        dt = frame_dt / substeps

        if self.use_g2p2g:
            output_grid = 1 - self.input_grid
            self.grid[output_grid].deactivate_all()
            self.build_pid(self.pid[self.input_grid],
                           self.grid_m[self.input_grid], 0.5)
            self.g2p2g(dt, self.pid[self.input_grid],
                       self.grid_v[self.input_grid],
                       self.grid_v[output_grid], self.grid_m[output_grid])
            self.grid_normalization_and_gravity(dt,
                                                self.grid_v[output_grid],
                                                self.grid_m[output_grid])
            for p in self.grid_postprocess:
                p(self.t, dt, self.grid_v[output_grid])
            # The freshly written grid becomes the input of the next substep.
            self.input_grid = output_grid
            self.t += dt
        else:
            self.grid.deactivate_all()
            self.build_pid(self.pid, self.grid_m, 0.5)
            self.p2g(dt)
            self.grid_normalization_and_gravity(dt, self.grid_v,
                                                self.grid_m)
            for p in self.grid_postprocess:
                p(self.t, dt, self.grid_v)
            self.t += dt
            self.g2p(dt)

        self.all_time_max_velocity = max(self.all_time_max_velocity,
                                         self.compute_max_velocity())

    # Terminate the line of progress dots.
    print()

    if print_stat:
        ti.kernel_profiler_print()
        try:
            ti.memory_profiler_print()
        except Exception:
            # Best effort: a failing memory report must not abort the step.
            # Only ordinary exceptions are suppressed, so KeyboardInterrupt
            # and SystemExit still propagate (a bare ``except`` hid them).
            pass
        # Uses the dt of the last substep executed in the loop above.
        print(f'CFL: {self.all_time_max_velocity * dt / self.dx}')
        print(f'num particles={self.n_particles[None]}')
        print(f' frame time {time.time() - begin_t:.3f} s')
        print(
            f' substep time {1000 * (time.time() - begin_t) / (self.total_substeps - begin_substep):.3f} ms'
        )
def print_async_stats(include_kernel_profiler=False):
    """Print a short report of Taichi async-engine counters.

    Args:
        include_kernel_profiler: When true, the kernel profiler table and a
            blank line are printed before the report.
    """
    import taichi as ti

    if include_kernel_profiler:
        ti.kernel_profiler_print()
        print()

    counters = ti.get_kernel_stats().get_counters()

    # Report header.
    for banner_line in ('=======================', 'Async benchmark metrics',
                        '-----------------------'):
        print(banner_line)

    # Report body: one metric per line.
    print(f'Async mode: {ti.current_cfg().async_mode}')
    print(f'Kernel time: {ti.kernel_profiler_total_time():.3f} s')
    print(f'Tasks launched: {int(counters["launched_tasks"])}')
    print(f'Instructions emitted: {int(counters["codegen_statements"])}')
    print(f'Tasks compiled: {int(counters["codegen_offloaded_tasks"])}')

    # Report footer.
    print('=======================')
def benchmark():
    """Time the forward and backward p2g / grid_op / g2p kernel sequences.

    Each phase runs one untimed warm-up pass, synchronizes, clears the
    kernel profiler, and then times ``iters`` passes. The printed figure is
    ``elapsed / iters * 1000 * 3`` followed by the kernel profiler table.
    """
    print(
        'Also check "nvprof --print-gpu-trace python3 diffmpm_benchmark.py" for more accurate results'
    )
    iters = 100000

    # ---- forward phase -------------------------------------------------
    for _ in range(1):  # warm-up pass, not timed
        p2g(0)
        grid_op()
        g2p(0)
    ti.sync()
    ti.kernel_profiler_clear()

    start = time.time()
    for _ in range(iters):
        # NOTE: the grid is not cleared between passes (clear_grid() call
        # is disabled here).
        p2g(0)
        grid_op()
        g2p(0)
    ti.sync()
    elapsed = time.time() - start
    print('forward ', elapsed / iters * 1000 * 3, 'ms')
    ti.kernel_profiler_print()

    # ---- backward phase ------------------------------------------------
    for _ in range(1):  # warm-up pass, not timed
        p2g.grad(0)
        grid_op.grad()
        g2p.grad(0)
    ti.sync()
    ti.kernel_profiler_clear()

    start = time.time()
    for _ in range(iters):
        # NOTE: the grid is not cleared between passes (clear_grid() call
        # is disabled here). Gradient kernels run in reverse kernel order.
        g2p.grad(0)
        grid_op.grad()
        p2g.grad(0)
    ti.sync()
    elapsed = time.time() - start
    print('backward ', elapsed / iters * 1000 * 3, 'ms')
    ti.kernel_profiler_print()
from smoke_animation import Smoke_Animation
from colliders import RigidBodyCollier, Collider
from geometry import Box, Ball

# Simulation / window resolution.
res = (512, 512)
ti.init(arch=ti.gpu, kernel_profiler=True)

gui = ti.GUI("smoke animation", res=res)

# Assemble the smoke solver through its builder.
# A buoyancy option is currently left disabled:
#     .set_compute_buoyancy_force(tempreture_factor=600)
smoke = (
    Smoke_Builder(res)
    .add_flow_emitter([512 // 2, 0], 512 // 3, 2000.0)
    .set_decay(0.995)
    .build()
)

ani = Smoke_Animation(smoke, res)
ani.reset()

# Obstacles: one box and one ball, each wrapped in a rigid-body collider.
box_obstacle = RigidBodyCollier(Box([156, 156], [226, 276]))
smoke.add_collider(box_obstacle)
ball_obstacle = RigidBodyCollier(Ball([276, 226], 30))
smoke.add_collider(ball_obstacle)

# Step and draw the animation until the window is closed.
while gui.running:
    ani.update()
    ani.display(gui)

ti.kernel_profiler_print()
def bls_test_template(dim, N, bs, stencil, block_dim=None, scatter=False, benchmark=0, dense=False):
    """Compare a stencil kernel run with and without ``ti.cache_shared``.

    Three ``i32`` fields are allocated: ``x`` (input), ``y`` (result with
    caching enabled) and ``y2`` (result without caching). The same stencil
    kernel is applied to both outputs and the results are compared element
    by element.

    Args:
        dim: Number of index dimensions.
        N: Extent along every axis; the block grid has ``N // bs[i]`` cells
            along axis ``i``.
        bs: Block size per axis; a scalar is repeated for all ``dim`` axes.
        stencil: Offsets, each added to the loop index as ``ti.Vector``.
        block_dim: Value passed to ``ti.block_dim``; defaults to the product
            of the entries of ``bs``.
        scatter: If true, ``x[I]`` is accumulated into ``y[I + offset]`` and
            all three fields live under one shared block; otherwise the
            kernel gathers ``x[I + offset]`` into ``y[I]`` and every field
            gets its own block.
        benchmark: If non-zero, the apply step is repeated that many times,
            deactivating the fields before each repetition.
        dense: Use ``ti.root.dense`` for the outer block level instead of
            ``ti.root.pointer``.

    Raises:
        AssertionError: If any element of ``y`` differs from ``y2``.
    """
    x, y, y2 = ti.field(ti.i32), ti.field(ti.i32), ti.field(ti.i32)

    index = ti.indices(*range(dim))
    # Scalar flag written by check() when a difference is found.
    mismatch = ti.field(ti.i32, shape=())

    if not isinstance(bs, (tuple, list)):
        bs = [bs for _ in range(dim)]

    grid_size = [N // bs[i] for i in range(dim)]

    if dense:
        create_block = lambda: ti.root.dense(index, grid_size)
    else:
        create_block = lambda: ti.root.pointer(index, grid_size)

    if scatter:
        # One outer block shared by all three fields.
        block = create_block()

        block.dense(index, bs).place(x)
        block.dense(index, bs).place(y)
        block.dense(index, bs).place(y2)
    else:
        # A separate outer block per field.
        create_block().dense(index, bs).place(x)
        create_block().dense(index, bs).place(y)
        create_block().dense(index, bs).place(y2)

    # Index range [bs[i], N - bs[i]) per axis, i.e. one block of margin is
    # left on each side. NOTE: this is a generator expression, so it can be
    # unpacked only once.
    ndrange = ((bs[i], N - bs[i]) for i in range(dim))

    if block_dim is None:
        block_dim = 1
        for i in range(dim):
            block_dim *= bs[i]

    @ti.kernel
    def populate():
        # x[I] = sum over axes i of I[i] ** (i + 1)
        for I in ti.grouped(ti.ndrange(*ndrange)):
            s = 0
            for i in ti.static(range(dim)):
                s += I[i]**(i + 1)
            x[I] = s

    @ti.kernel
    def apply(use_bls: ti.template(), y: ti.template()):
        # NOTE: the parameter `y` shadows the outer field `y`; the caller
        # chooses which output field is written.
        # With use_bls, the field that is accessed through the stencil
        # offsets is the one passed to ti.cache_shared: x when gathering,
        # y when scattering.
        if ti.static(use_bls and not scatter):
            ti.cache_shared(x)
        if ti.static(use_bls and scatter):
            ti.cache_shared(y)

        ti.block_dim(block_dim)
        for I in ti.grouped(x):
            if ti.static(scatter):
                for offset in ti.static(stencil):
                    y[I + ti.Vector(offset)] += x[I]
            else:
                # gather
                s = 0
                for offset in ti.static(stencil):
                    s = s + x[I + ti.Vector(offset)]
                y[I] = s

    populate()

    if benchmark:
        for i in range(benchmark):
            x.snode.parent().deactivate_all()
            if not scatter:
                populate()
            y.snode.parent().deactivate_all()
            y2.snode.parent().deactivate_all()
            apply(False, y2)
            apply(True, y)
    else:
        # Simply test
        apply(False, y2)
        apply(True, y)

    @ti.kernel
    def check():
        # Report every differing element and raise the mismatch flag.
        for I in ti.grouped(y2):
            if y[I] != y2[I]:
                print('check failed', I, y[I], y2[I])
                mismatch[None] = 1

    check()

    ti.kernel_profiler_print()

    assert mismatch[None] == 0
def main():
    """Optimize the initial velocity ``init_v`` by gradient descent.

    After initializing the particle state and running ``benchmark()``, 30
    iterations are performed. Each iteration records a full simulation and
    the loss evaluation under ``ti.Tape``, reads the loss and the gradient
    of ``init_v``, takes a step with learning rate 10, and renders a subset
    of the simulated time steps with OpenCV. The loss history is plotted
    with matplotlib at the end.
    """
    # initialization
    init_v[None] = [0, 0]

    # Identity matrix for every particle at time step 0.
    for i in range(n_particles):
        F[0, i] = [[1, 0], [0, 1]]

    # Place the particles of time step 0 on an N x N lattice with spacing
    # 0.5 * dx, offset by (10, 25) * dx.
    for i in range(N):
        for j in range(N):
            x[0, i * N + j] = [dx * (i * 0.5 + 10), dx * (j * 0.5 + 25)]

    set_v()
    benchmark()

    losses = []
    img_count = 0
    for i in range(30):
        # Everything inside the tape is recorded so that the gradient of
        # `loss` w.r.t. init_v is available afterwards.
        with ti.Tape(loss=loss):
            set_v()
            for s in range(steps - 1):
                substep(s)

            loss[None] = 0
            x_avg[None] = [0, 0]
            compute_x_avg()
            compute_loss()

        l = loss[None]
        losses.append(l)
        grad = init_v.grad[None]
        print('loss=', l, ' grad=', (grad[0], grad[1]))
        # Plain gradient-descent update of both velocity components.
        learning_rate = 10
        init_v(0)[None] -= learning_rate * grad[0]
        init_v(1)[None] -= learning_rate * grad[1]

        # visualize
        # Render every 64th time step, starting at step 63.
        for s in range(63, steps, 64):
            scale = 4
            img = np.zeros(shape=(scale * n_grid, scale * n_grid)) + 0.3
            # Sum of the particle pixel coordinates, used for the mean below.
            total = [0, 0]
            # NOTE: this inner loop reuses the name `i` of the outer loop.
            for i in range(n_particles):
                p_x = int(scale * x(0)[s, i] / dx)
                p_y = int(scale * x(1)[s, i] / dx)
                total[0] += p_x
                total[1] += p_y
                img[p_x, p_y] = 1
            # Mean particle position (color 0).
            cv2.circle(img, (total[1] // n_particles, total[0] // n_particles), radius=5, color=0, thickness=5)
            # Target position (color 1).
            cv2.circle(img, (int(
                target[1] * scale * n_grid), int(target[0] * scale * n_grid)), radius=5, color=1, thickness=5)

            # Swap the two image axes and flip the first one for display.
            img = img.swapaxes(0, 1)[::-1]
            cv2.imshow('MPM', img)
            img_count += 1
            # cv2.imwrite('MPM{:04d}.png'.format(img_count), img * 255)
            cv2.waitKey(1)
        # Kernel profile after each optimization iteration.
        ti.kernel_profiler_print()

    # Final kernel profile once all iterations are done.
    ti.kernel_profiler_print()
    plt.title("Optimization of Initial Velocity")
    plt.ylabel("Loss")
    plt.xlabel("Gradient Descent Iterations")
    plt.plot(losses)
    plt.show()
def run(self, verbose=False):
    """Initialize, solve with at most 400 iterations, and show the result.

    Args:
        verbose: Forwarded unchanged to ``self.solve``.
    """
    iteration_cap = 400

    self.init()
    self.solve(max_iters=iteration_cap, verbose=verbose)

    # Render the solution into self.pixels and display it.
    self.paint()
    ti.imshow(self.pixels)

    ti.kernel_profiler_print()
def run(self):
    """Run up to 400 preconditioned conjugate-gradient iterations with live display.

    The loop stops early once the squared residual drops below ``1e-12``
    times its initial value. After every completed iteration the residual
    is printed and the current state is painted into the GUI window. The
    kernel profile is printed when the loop ends.
    """
    gui = ti.GUI("Multigrid Preconditioned Conjugate Gradients",
                 res=(self.N_gui, self.N_gui))

    def reduced(lhs, rhs):
        # Run the reduction kernel and read its scalar result back.
        # (presumably the dot product of lhs and rhs; confirm in reduce)
        self.reduce(lhs, rhs)
        return self.sum[None]

    def precondition():
        # z = M^-1 r with multigrid, or simply z = r without it.
        if self.use_multigrid:
            self.apply_preconditioner()
        else:
            self.z[0].copy_from(self.r[0])

    self.init()

    # x starts at zero, so the initial residual r equals b.
    start_rTr = reduced(self.r[0], self.r[0])

    precondition()
    self.update_p()
    prev_zTr = reduced(self.z[0], self.r[0])

    for i in range(400):
        # alpha = zTr / pTAp
        self.compute_Ap()
        self.alpha[None] = prev_zTr / reduced(self.p, self.Ap)

        # x = x + alpha * p, then r = r - alpha * Ap
        self.update_x()
        self.update_r()

        # Convergence test on the squared residual.
        rTr = reduced(self.r[0], self.r[0])
        if rTr < start_rTr * 1.0e-12:
            break

        precondition()

        # beta = new zTr / old zTr
        next_zTr = reduced(self.z[0], self.r[0])
        self.beta[None] = next_zTr / prev_zTr

        # p = z + beta * p
        self.update_p()
        prev_zTr = next_zTr

        print(f'iter {i}, residual={rTr}')
        self.paint()
        gui.set_image(self.pixels)
        gui.show()

    ti.kernel_profiler_print()