def test_atomic_xor(self):
    expected_result: int = self.i_1 ^ self.i_2
    pk.parallel_for(self.range_policy, self.functor.atomic_xor)
    result: int = self.functor.view1D_xor[0]
    self.assertEqual(expected_result, result)
def run() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('iterations', type=int)
    parser.add_argument('length', type=int)
    parser.add_argument('offset', nargs='?', type=int, default=0)
    args = parser.parse_args()
    iterations = args.iterations
    length = args.length
    offset = args.offset
    scalar = 3

    if iterations < 1:
        sys.exit("ERROR: iterations must be >= 1")

    if length <= 0:
        sys.exit("ERROR: vector length must be positive")

    # emulate cpp example
    if offset < 0:
        sys.exit("ERROR: offset must be nonnegative")

    print("Number of iterations = ", iterations)
    print("Vector length = ", length)
    print("Offset = ", offset)

    p = pk.RangePolicy(pk.ExecutionSpace.OpenMP, 0, length)
    w = Workload(iterations, length, offset, scalar)

    pk.parallel_for(p, w.init_views)
    # pk.fence()

    timer = pk.Timer()
    for i in range(iterations):
        pk.parallel_for(p, w.nstream)
    # pk.fence()
    nstream_time = timer.seconds()

    # verify correctness
    ar: float = 0
    br: float = 2
    cr: float = 2
    for i in range(iterations):
        ar += br + scalar * cr
    ar *= length

    asum = pk.parallel_reduce(p, w.res_reduce)
    # pk.fence()

    epsilon: float = 1.0e-8
    if (abs(ar - asum) / asum > epsilon):
        print("ERROR: Failed Validation on output array")
    else:
        avgtime: float = nstream_time / iterations
        nbytes: float = 4.0 * length * 4
        print("Solution validates")
        print("Rate (MB/s): %.2f" % (1.e-6 * nbytes / avgtime))
        print("Avg time (ms): %f" % (avgtime / 1.e-3))
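# A minimal sketch (an assumption, not part of the example) of the Workload
# class the driver above relies on; the workunit names follow the calls to
# w.init_views, w.nstream, and w.res_reduce, and the kernel bodies mirror the
# class-based nstream example elsewhere in this collection. The real
# definitions live alongside the example.
@pk.workload
class Workload:
    def __init__(self, iterations: int, length: int, offset: int, scalar: int):
        self.iterations: int = iterations
        self.length: int = length
        self.offset: int = offset
        self.scalar: float = scalar
        self.A: pk.View1D[pk.double] = pk.View([length], pk.double)
        self.B: pk.View1D[pk.double] = pk.View([length], pk.double)
        self.C: pk.View1D[pk.double] = pk.View([length], pk.double)

    @pk.workunit
    def init_views(self, i: int):
        # A starts at zero; B and C hold the constant 2 used by the validation loop
        self.A[i] = 0
        self.B[i] = 2
        self.C[i] = 2

    @pk.workunit
    def nstream(self, i: int):
        self.A[i] += self.B[i] + self.scalar * self.C[i]

    @pk.workunit
    def res_reduce(self, i: int, acc: pk.Acc[pk.double]):
        acc += abs(self.A[i])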
def test_atomic_min(self):
    expected_result: float = min(self.f_1, self.f_2)
    pk.parallel_for(self.range_policy, self.functor.atomic_min)
    result: float = self.functor.view1D_min[0]
    self.assertEqual(expected_result, result)
def test_atomic_sub(self):
    expected_result: float = self.f_1 - self.f_2
    pk.parallel_for(self.range_policy, self.functor.atomic_sub)
    result: float = self.functor.view1D_sub[0]
    self.assertEqual(expected_result, result)
def benchmark(team: pk.TeamMember, A: pk.View3D[pk.double], B: pk.View3D[pk.double],
              C: pk.View3D[pk.double], R: int, F: int, K: int):
    n: int = team.league_rank()
    for r in range(R):
        def team_for(i: int):
            a1: pk.double = A[n][i][0]
            b: pk.double = B[n][i][0]
            a2: pk.double = a1 * 1.3
            a3: pk.double = a2 * 1.1
            a4: pk.double = a3 * 1.1
            a5: pk.double = a4 * 1.3
            a6: pk.double = a5 * 1.1
            a7: pk.double = a6 * 1.1
            a8: pk.double = a7 * 1.1

            for f in range(F):
                a1 += b * a1
                a2 += b * a2
                a3 += b * a3
                a4 += b * a4
                a5 += b * a5
                a6 += b * a6
                a7 += b * a7
                a8 += b * a8

            C[n][i][0] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8

        pk.parallel_for(pk.TeamThreadRange(team, K), team_for)
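# A hypothetical launch of the standalone workunit above, assuming keyword-argument
# binding works with pk.TeamPolicy the way the RangePolicy-based examples in this
# collection use it; the sizes N, K, R, F and the 3-D view shapes are illustrative
# assumptions, not values from the original benchmark.
N = 1024; K = 512; R = 10; F = 8
A = pk.View([N, K, 1], pk.double)
B = pk.View([N, K, 1], pk.double)
C = pk.View([N, K, 1], pk.double)
pk.parallel_for(pk.TeamPolicy(N, pk.AUTO), benchmark, A=A, B=B, C=C, R=R, F=F, K=K)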
def run(self):
    timer = pk.Timer()
    for r in range(self.R):
        pk.parallel_for("gather", self.N, self.benchmark)
    pk.fence()
    self.seconds = timer.seconds()
def benchmark(self, team: pk.TeamMember):
    n: int = team.league_rank()
    for r in range(self.R):
        def team_for(i: int):
            a1: pk.double = self.A[n][i][0]
            b: pk.double = self.B[n][i][0]
            a2: pk.double = a1 * 1.3
            a3: pk.double = a2 * 1.1
            a4: pk.double = a3 * 1.1
            a5: pk.double = a4 * 1.3
            a6: pk.double = a5 * 1.1
            a7: pk.double = a6 * 1.1
            a8: pk.double = a7 * 1.1

            for f in range(self.F):
                a1 += b * a1
                a2 += b * a2
                a3 += b * a3
                a4 += b * a4
                a5 += b * a5
                a6 += b * a6
                a7 += b * a7
                a8 += b * a8

            self.C[n][i][0] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8

        pk.parallel_for(pk.TeamThreadRange(team, self.K), team_for)
def run(self) -> None:
    nbin: List[int] = [self.nbinx, self.nbiny, self.nbinz]
    min_values: List[float] = [self.minx, self.miny, self.minz]
    max_values: List[float] = [self.maxx, self.maxy, self.maxz]

    x_sub = self.x[self.range_min:self.range_max, :]
    binop = pk.BinOp3D(x_sub, nbin, min_values, max_values)
    sorter = pk.BinSort(x_sub, binop)
    sorter.create_permute_vector()

    self.permute_vector = sorter.get_permute_vector()
    self.bin_count_1d = sorter.get_bin_count()
    self.bin_offsets_1d = sorter.get_bin_offsets()

    pk.parallel_for("Binning::AssignOffsets",
                    self.nbinx * self.nbiny * self.nbinz, self.assign_offsets)

    if self.sort:
        sorter.sort(x_sub)
        v_sub = self.v[self.range_min:self.range_max, :]
        sorter.sort(v_sub)
        f_sub = self.f[self.range_min:self.range_max, :]
        sorter.sort(f_sub)
        sorter.sort(self.type)
        sorter.sort(self.id)
        sorter.sort(self.q)
def test_v1d(self):
    pk.parallel_for(self.range_policy, self.functor.v1d)
    for i in range(self.i_1):
        expected_result: int = self.i_4 + i
        self.assertEqual(expected_result, self.functor.view1D[i])
        self.assertEqual(expected_result, self.functor.myView1D[i])
def run(self):
    pk.parallel_for(self.N, lambda i: i, self.A)

    timer = pk.Timer()
    self.result = pk.parallel_scan(self.N, self.scan)
    self.timer_result = timer.seconds()
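# A speculative sketch of the scan workunit used above, assuming pykokkos mirrors
# the Kokkos scan functor signature (index, accumulator, final-pass flag); the view
# name A is taken from the init call above, everything else is illustrative.
@pk.workunit
def scan(self, tid: int, acc: pk.Acc[pk.double], last_pass: bool):
    acc += self.A[tid]
    if last_pass:
        # only write the prefix sum back on the final pass
        self.A[tid] = acc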
def test_v2d(self):
    pk.parallel_for(self.range_policy, self.functor.v2d)
    for i in range(self.i_1):
        for j in range(self.i_2):
            expected_result: int = self.i_4 + i + j
            self.assertEqual(expected_result, self.functor.view2D[i][j])
            self.assertEqual(expected_result, self.functor.myView2D[i][j])
def run(self):
    timer = pk.Timer()
    pk.parallel_for(self.N, self.matrix_init)

    for i in range(self.nrepeat):
        self.result = pk.parallel_reduce("04", self.N, self.yAx)
    self.timer_result = timer.seconds()
def test_outer_for(self):
    expected_result: float = 0
    for i in range(self.M):
        expected_result += self.value

    pk.parallel_for(
        pk.TeamPolicy(self.N, pk.AUTO, space=self.execution_space),
        self.functor.outer_for)

    for i in range(self.N):
        result: int = self.functor.for_view[i]
        self.assertEqual(expected_result, result)
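# A rough sketch (an assumption, not the tested functor) of an outer_for workunit
# that would satisfy this test: each team accumulates self.value M times and
# writes the total into for_view[league_rank]. The pattern follows the other
# hierarchical workunits in this collection.
@pk.workunit
def outer_for(self, team: pk.TeamMember):
    i: int = team.league_rank()

    def inner_reduce(j: int, acc: pk.Acc[pk.double]):
        acc += self.value

    total: float = pk.parallel_reduce(pk.TeamThreadRange(team, self.M), inner_reduce)
    if team.team_rank() == 0:
        self.for_view[i] = total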
def test_subscript(self):
    expected_result = self.i_1
    pk.parallel_for(self.range_policy, self.functor.subscript)
    for i in range(self.threads):
        self.assertEqual(expected_result, self.functor.view1D[i])
        for j in range(self.threads):
            self.assertEqual(expected_result, self.functor.view2D[i][j])
            for k in range(self.threads):
                self.assertEqual(expected_result, self.functor.view3D[i][j][k])
def run(self) -> None:
    x: List[int] = [self.x_0, 2, 3]
    pk.parallel_for(self.total_threads, self.work)

    bin_op = pk.BinOp1D(self.view, (self.total_threads // 2),
                        self.total_threads, self.total_threads * 2 - 1)
    bin_sort = pk.BinSort(self.view, bin_op)
    bin_sort.create_permute_vector()

    self.permute_vector = bin_sort.get_permute_vector()
    self.bin_offsets = bin_sort.get_bin_offsets()
    self.bin_count = bin_sort.get_bin_count()

    bin_sort.sort(self.view)
def yAx_plus1(self, team_member: pk.TeamMember, acc: pk.Acc[pk.double]) -> None:
    j: int = team_member.league_rank()

    def inner_reduce(i: int, inner_acc: pk.Acc[pk.double]):
        inner_acc += self.A[j][i] * self.x[i]

    def inner_for(i: int):
        self.yprime[j][i] += 1

    temp2: float = pk.parallel_reduce(pk.TeamThreadRange(team_member, self.M), inner_reduce)
    pk.parallel_for(pk.TeamThreadRange(team_member, self.N), inner_for)

    if team_member.team_rank() == 0:
        acc += self.yprime[j][j] * temp2
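# One way the hierarchical workunit above could be driven; this is an
# illustrative assumption (w and its member N are not defined in this snippet),
# following the TeamPolicy usage shown elsewhere in this collection.
result = pk.parallel_reduce(pk.TeamPolicy(w.N, pk.AUTO), w.yAx_plus1)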
def run(self):
    t: int = tile_size
    r: int = radius

    pk.parallel_for(pk.MDRangePolicy([0, 0], [n, n], [t, t]), self.init)
    pk.fence()

    timer = pk.Timer()

    for i in range(iterations):
        if (i == 1):
            pk.fence()

        if r == 1:
            # star1 stencil
            pk.parallel_for(
                "stencil",
                pk.MDRangePolicy([r, r], [n - r, n - r], [t, t]),
                self.star1)
        elif r == 2:
            # star2 stencil
            pk.parallel_for(
                "stencil",
                pk.MDRangePolicy([r, r], [n - r, n - r], [t, t]),
                self.star2)
        else:
            # star3 stencil
            pk.parallel_for(
                "stencil",
                pk.MDRangePolicy([r, r], [n - r, n - r], [t, t]),
                self.star3)

        pk.parallel_for(pk.MDRangePolicy([0, 0], [n, n], [t, t]), self.increment)

    pk.fence()
    self.stencil_time = timer.seconds()

    active_points: int = (n - 2 * r) * (n - 2 * r)

    # verify correctness
    self.norm = pk.parallel_reduce(
        pk.MDRangePolicy([r, r], [n - r, n - r], [t, t]), self.norm_reduce)
    pk.fence()
    self.norm /= active_points

    epsilon: float = 1.0e-8
    reference_norm: float = 2 * (iterations)
    if (abs(self.norm - reference_norm) > epsilon):
        pk.printf("ERROR: L1 norm != Reference norm err=%.2f\n",
                  abs(self.norm - reference_norm))
    else:
        pk.printf("Solution validates\n")
def run() -> None:
    random.seed(1010101)

    indices = 8192
    data = 33554432
    repeats = 10
    space = pk.ExecutionSpace.OpenMP

    parser = argparse.ArgumentParser()
    parser.add_argument("--indices", type=int)
    parser.add_argument("--data", type=int)
    parser.add_argument("--repeats", type=int)
    parser.add_argument("--atomics", action="store_true")
    parser.add_argument("--execution_space", type=str)
    args = parser.parse_args()

    if args.indices:
        indices = args.indices
    if args.data:
        data = args.data
    if args.repeats:
        repeats = args.repeats
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)

    pk.set_default_space(space)

    w = Benchmark(indices, data, repeats, use_atomics)
    range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
    range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

    print("Reports fastest timing per kernel")
    print("Creating Views...")
    print("Memory Sizes:")
    print(f"- Elements: {data} ({1e-6*data*8} MB)")
    print(f"- Indices: {indices} ({1e-6*indices*8} MB)")
    print(f"- Atomics: {'yes' if use_atomics else 'no'}")
    print(f"Benchmark kernels will be performed for {repeats} iterations")

    print("Initializing Views...")
    pk.parallel_for(range_data, w.init_data)
    pk.parallel_for(range_indices, w.init_indices)

    print("Starting benchmarking...")
    timer = pk.Timer()
    for i in range(repeats):
        for j in range(indices):
            w.indices[j] = random.randrange(data)

        if use_atomics:
            pk.parallel_for(range_indices, w.run_gups_atomic)
        else:
            pk.parallel_for(range_indices, w.run_gups)

    gupsTime = timer.seconds()
    print(f"GUP/s Random: {1e-9 * repeats * indices / gupsTime}")
    print(w.data)
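# A speculative sketch of the Benchmark class the driver above constructs; the
# member and workunit names follow the calls made in run() here and in the
# class-based GUPS run() later in this collection, but the kernel bodies and
# dtypes are assumptions based on the usual GUPS update pattern.
@pk.workload
class Benchmark:
    def __init__(self, indices: int, data: int, repeats: int, use_atomics: bool):
        self.indicesCount: int = indices
        self.dataCount: int = data
        self.repeats: int = repeats
        self.use_atomics: bool = use_atomics
        self.indices: pk.View1D[pk.int64] = pk.View([indices], pk.int64)
        self.data: pk.View1D[pk.int64] = pk.View([data], pk.int64)

    @pk.workunit
    def init_data(self, i: int):
        self.data[i] = 1

    @pk.workunit
    def init_indices(self, i: int):
        self.indices[i] = 0

    @pk.workunit
    def run_gups(self, i: int):
        # plain (racy) update of a random element
        self.data[self.indices[i]] += 1

    @pk.workunit
    def run_gups_atomic(self, i: int):
        # same update performed atomically (assumed pk.atomic_fetch_add signature)
        pk.atomic_fetch_add(self.data, [self.indices[i]], 1)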
def run(self) -> None:
    if self.parallel_for:
        if self.half_neigh:
            pk.parallel_for("ForceLJNeigh::compute", self.N_local, self.halfneigh_for)
        else:
            pk.parallel_for("ForceLJNeigh::compute", self.N_local, self.fullneigh_for)
    else:
        if self.half_neigh:
            self.energy = pk.parallel_reduce(
                "ForceLJNeigh::compute_energy", self.N_local, self.halfneigh_reduce)
        else:
            self.energy = pk.parallel_reduce(
                "ForceLJNeigh::compute_energy", self.N_local, self.fullneigh_reduce)
def run(self) -> None:
    if self.workunit_id == 0:
        pk.parallel_for("CommSerial::exchange_self",
                        self.N_local, self.tag_exchange_self)
    elif self.workunit_id == 1:
        pk.parallel_for("CommSerial::halo_exchange_self",
                        self.nparticles, self.tag_halo_self)
    elif self.workunit_id == 2:
        pk.parallel_for("CommSerial::halo_update_self",
                        self.update_threads, self.tag_halo_update_self)
    elif self.workunit_id == 3:
        pk.parallel_for("CommSerial::halo_force_self",
                        self.force_threads, self.tag_halo_force_self)
def test_real(self):
    pk.set_default_precision(pk.int32)
    view: pk.View1D = pk.View([self.threads])
    self.assertTrue(view.dtype is pk.DataType.int32)
    self.assertTrue(
        pk.View._get_dtype_name(str(type(view.array))) == "int32")

    f = RealViewTestFunctor(view)
    w = RealViewTestWorkload(self.threads, view)
    pk.parallel_for(self.threads, f.pfor)
    pk.execute(pk.ExecutionSpace.Default, w)

    view.set_precision(pk.float)
    self.assertTrue(view.dtype is pk.DataType.float)
    self.assertTrue(
        pk.View._get_dtype_name(str(type(view.array))) == "float32")

    pk.parallel_for(self.threads, f.pfor)
    pk.execute(pk.ExecutionSpace.Default, w)
def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = 1
    print(f"Total size S = {N * M} N = {N} M = {M}")

    y = pk.View([N], pk.double)
    x = pk.View([M], pk.double)
    A = pk.View([N * M], pk.double)

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(p, y_init, y=y)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(p, matrix_init, M=M, A=A)

    timer = pk.Timer()
    for i in range(nrepeat):
        result = pk.parallel_reduce(p, yAx, M=M, y=y, x=x, A=A)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
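# A minimal sketch, assuming the standalone workunits passed to parallel_for and
# parallel_reduce above look roughly like this; the real definitions live with
# the example. A is treated as a flattened N*M view, matching its shape above.
@pk.workunit
def y_init(i: int, y: pk.View1D[pk.double]):
    y[i] = 1

@pk.workunit
def matrix_init(j: int, M: int, A: pk.View1D[pk.double]):
    for i in range(M):
        A[j * M + i] = 1

@pk.workunit
def yAx(j: int, acc: pk.Acc[pk.double], M: int,
        y: pk.View1D[pk.double], x: pk.View1D[pk.double], A: pk.View1D[pk.double]):
    # row j of A dotted with x, scaled by y[j]
    temp2: float = 0
    for i in range(M):
        temp2 += A[j * M + i] * x[i]
    acc += y[j] * temp2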
def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    w = Workload(N, M)
    pk.parallel_for(p, w.y_init)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), w.x_init)
    pk.parallel_for(p, w.matrix_init)

    timer = pk.Timer()
    for i in range(nrepeat):
        result = pk.parallel_reduce(p, w.yAx)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
def run(self):
    pk.parallel_for(self.array_size, self.init_arrays)

    timer = pk.Timer()
    for i in range(self.num_times):
        pk.parallel_for("babel_stream", self.array_size, self.copy)
        pk.fence()
        # self.runtimes[0][i] = timer.seconds()
        # timer.reset()

        pk.parallel_for("babel_stream", self.array_size, self.mul)
        pk.fence()
        # self.runtimes[1][i] = timer.seconds()
        # timer.reset()

        pk.parallel_for("babel_stream", self.array_size, self.add)
        pk.fence()

        pk.parallel_for("babel_stream", self.array_size, self.triad)
        pk.fence()

        self.sum = pk.parallel_reduce("babel_stream", self.array_size, self.dot)

    self.runtime = timer.seconds()
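# A hedged sketch of the standard BabelStream kernels that the copy/mul/add/triad/dot
# workunits above presumably implement; the view names a, b, c and the scalar member
# are assumptions for illustration, not the example's actual attributes.
@pk.workunit
def copy(self, i: int):
    self.c[i] = self.a[i]

@pk.workunit
def mul(self, i: int):
    self.b[i] = self.scalar * self.c[i]

@pk.workunit
def add(self, i: int):
    self.c[i] = self.a[i] + self.b[i]

@pk.workunit
def triad(self, i: int):
    self.a[i] = self.b[i] + self.scalar * self.c[i]

@pk.workunit
def dot(self, i: int, acc: pk.Acc[pk.double]):
    acc += self.a[i] * self.b[i]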
def run(self):
    pk.parallel_for(self.length, self.init)
    # pk.parallel_for(self.length, lambda i: 0, self.A)
    # pk.parallel_for(self.length, lambda i: 2, self.B)
    # pk.parallel_for(self.length, lambda i: 2, self.C)
    pk.fence()

    timer = pk.Timer()

    for i in range(self.iterations):
        pk.parallel_for("nstream", self.length, self.nstream)

    pk.fence()
    self.nstream_time = timer.seconds()

    # verify correctness
    ar: float = 0
    br: float = 2
    cr: float = 2
    for i in range(self.iterations):
        ar += br + self.scalar * cr
    ar *= self.length

    self.asum = pk.parallel_reduce(self.length, lambda i, acc: acc + abs(self.A[i]))
    pk.fence()

    epsilon: float = 1.0e-8
    if (abs(ar - self.asum) / self.asum > epsilon):
        pk.printf("ERROR: Failed Validation on output array\n")
    else:
        avgtime: float = self.nstream_time / self.iterations
        nbytes: float = 4.0 * self.length * 4
        pk.printf("Solution validates\n")
        pk.printf("Rate (MB/s): %.2f\n", 1.e-6 * nbytes / avgtime)
        pk.printf("Avg time (ms): %f\n", avgtime / 1.e-3)
def run(self):
    pk.parallel_for(self.N, self.init_y)
    pk.parallel_for(self.M, self.init_x)
    pk.parallel_for(pk.MDRangePolicy([0, 0], [self.N, self.M]), self.init_A)

    timer = pk.Timer()
    for i in range(self.nrepeat):
        self.result = pk.parallel_reduce("mdrange", self.N, self.yAx)
    self.timer_result = timer.seconds()
def run(self):
    pk.parallel_for(self.N, self.y_init)
    # pk.parallel_for(self.N, lambda i : self.y[i] = 1)
    pk.parallel_for(self.M, self.x_init)
    # pk.parallel_for(self.N, lambda i : self.x[i] = 1)
    pk.parallel_for(self.N, self.matrix_init)

    timer = pk.Timer()
    for i in range(self.nrepeat):
        self.result = pk.parallel_reduce("01", self.N, self.yAx)
    self.timer_result = timer.seconds()
def run() -> None:
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    fill: bool = values[-1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    pk.set_default_space(pk.ExecutionSpace.Cuda)

    y: pk.View1D = pk.View([N], pk.double)
    x: pk.View1D = pk.View([M], pk.double)
    A: pk.View2D = pk.View([N, M], pk.double)

    p = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(p, y_init, y=y)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(p, matrix_init, M=M, A=A)

    # if fill:
    #     y.fill(1)
    #     x.fill(1)
    #     A.fill(1)
    # else:
    #     for i in range(N):
    #         y[i] = 1
    #     for i in range(M):
    #         x[i] = 1
    #     for j in range(N):
    #         for i in range(M):
    #             A[j][i] = 1

    timer = pk.Timer()
    for i in range(nrepeat):
        result = pk.parallel_reduce(p, yAx, M=M, y=y, x=x, A=A)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    solution: float = N * M

    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)

    print(
        f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)"
    )
def run(self):
    pk.printf("Initializing Views...\n")
    pk.parallel_for(self.dataCount, self.init_data)
    pk.parallel_for(self.indicesCount, self.init_indices)

    pk.printf("Starting benchmarking...\n")
    pk.fence()
    timer = pk.Timer()

    for i in range(self.repeats):
        # FIXME: randomize indices
        # for i in range(self.indicesCount):
        #     self.indices[i] = random.randrange(self.dataCount)

        if self.use_atomics:
            pk.parallel_for("gups", self.indicesCount, self.run_gups_atomic)
        else:
            pk.parallel_for("gups", self.indicesCount, self.run_gups)

        pk.fence()

    self.gupsTime = timer.seconds()
def run(self):
    pk.parallel_for(
        pk.MDRangePolicy([0, 0], [self.order, self.order],
                         [self.tile_size, self.tile_size]), self.init)
    pk.fence()

    timer = pk.Timer()

    for i in range(self.iterations):
        if self.permute:
            pk.parallel_for(
                "transpose",
                pk.MDRangePolicy([0, 0], [self.order, self.order],
                                 [self.tile_size, self.tile_size],
                                 rank=pk.Rank(2, pk.Iterate.Left, pk.Iterate.Right)),
                self.tranpose)
        else:
            pk.parallel_for(
                "transpose",
                pk.MDRangePolicy([0, 0], [self.order, self.order],
                                 [self.tile_size, self.tile_size],
                                 rank=pk.Rank(2, pk.Iterate.Right, pk.Iterate.Left)),
                self.tranpose)

    self.transpose_time = timer.seconds()

    self.abserr = pk.parallel_reduce(
        pk.MDRangePolicy([0, 0], [self.order, self.order],
                         [self.tile_size, self.tile_size]), self.abserr_reduce)

    pk.printf("%f\n", self.abserr)

    epsilon: float = 1.0e-8
    if (self.abserr > epsilon):
        pk.printf("ERROR: aggregated squared error exceeds threshold %.2f\n",
                  self.abserr)
    else:
        pk.printf("Solution validates %2.f\n", self.abserr)
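# A rough guess (an assumption, not the example's code) at the tranpose workunit
# driven above, following the usual PRK transpose pattern of accumulating the
# transpose of A into B while incrementing A; the member views A and B are
# hypothetical names.
@pk.workunit
def tranpose(self, i: int, j: int):
    self.B[i][j] += self.A[j][i]
    self.A[j][i] += 1.0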