def compute_energy(self, system: System, binning: Binning, neighbor: Neighbor) -> float:
    self.x = system.x
    self.id = system.id
    self.type = system.type
    self.N_local = system.N_local

    self.bin_count = binning.bincount
    self.bin_offsets = binning.binoffsets
    self.permute_vector = binning.permute_vector
    self.nhalo = binning.nhalo
    self.nbinx = binning.nbinx
    self.nbiny = binning.nbiny
    self.nbinz = binning.nbinz
    self.nbins: int = self.nbinx * self.nbiny * self.nbinz

    self.parallel_for = False
    pk.execute(self, dependencies=[t_scalar3])

    self.x = t_x()
    self.type = t_type()
    self.f = t_f()

    return self.PE
def exchange(self) -> None:
    self.s = copy.copy(self.system)
    self.N_local = self.system.N_local

    self.bind_views()
    self.workunit_id = 0
    pk.execute(pk.ExecutionSpace.Default, self)
def test_add_squares(self):
    expected_result: int = self.value * self.value * self.threads
    workload = Add1DSquareTestReduce(self.threads, self.value)
    pk.execute(pk.ExecutionSpace.Default, workload)
    result: int = workload.sum
    self.assertEqual(expected_result, result)
def compute(self, system: System, binning: Binning, neighbor: Neighbor, fill: bool) -> None:
    self.x = system.x
    self.f = system.f
    self.id = system.id
    self.type = system.type
    self.N_local = system.N_local
    self.step = self.step_i

    self.bin_count = binning.bincount
    self.bin_offsets = binning.binoffsets
    self.permute_vector = binning.permute_vector
    self.nhalo = binning.nhalo
    self.nbinx = binning.nbinx
    self.nbiny = binning.nbiny
    self.nbinz = binning.nbinz

    if fill:
        self.f.fill(0)
    else:
        for i in range(self.f.x):
            for j in range(self.f.y):
                self.f[i][j] = 0.0

    self.nbins: int = self.nbinx * self.nbiny * self.nbinz
    self.parallel_for = True
    pk.execute(self)

    self.step_i += 1
    self.x = t_x()
    self.type = t_type()
    self.f = t_f()
def run():
    workload = Workload(10)
    pk.execute(pk.ExecutionSpace.Default, workload)

    print(workload.view)
    print(workload.permute_vector)
    print(workload.bin_offsets)
    print(workload.bin_count)
def final_integrate(self) -> None:
    workload = FinalIntegrateFunctor(
        self.system.v, self.system.f, self.system.type, self.system.mass,
        self.dtf, self.dtv, self.system.id, self.step,
        self.system.x, self.system.N_local)
    pk.execute(pk.ExecutionSpace.Default, workload)
    self.step += 1
def create_binning(self, dx_in: float, dy_in: float, dz_in: float, halo_depth: int,
                   do_local: bool, do_ghost: bool, sort: bool) -> None:
    if do_local or do_ghost:
        self.nhalo = halo_depth
        range_min: int = 0 if do_local else self.system.N_local
        range_max: int = int(
            (self.system.N_local + self.system.N_ghost) if do_ghost else self.system.N_local)
        self.range_min = range_min
        self.range_max = range_max

        self.nbinx = int(self.system.sub_domain_x / dx_in)
        self.nbiny = int(self.system.sub_domain_y / dy_in)
        self.nbinz = int(self.system.sub_domain_z / dz_in)

        if self.nbinx == 0:
            self.nbinx = 1
        if self.nbiny == 0:
            self.nbiny = 1
        if self.nbinz == 0:
            self.nbinz = 1

        dx: float = self.system.sub_domain_x / self.nbinx
        dy: float = self.system.sub_domain_y / self.nbiny
        dz: float = self.system.sub_domain_z / self.nbinz

        self.nbinx += 2 * halo_depth
        self.nbiny += 2 * halo_depth
        self.nbinz += 2 * halo_depth

        eps: float = dx / 1000
        self.minx = -dx * halo_depth - eps + self.system.sub_domain_lo_x
        self.maxx = dx * halo_depth + eps + self.system.sub_domain_hi_x
        self.miny = -dy * halo_depth - eps + self.system.sub_domain_lo_y
        self.maxy = dy * halo_depth + eps + self.system.sub_domain_hi_y
        self.minz = -dz * halo_depth - eps + self.system.sub_domain_lo_z
        self.maxz = dz * halo_depth + eps + self.system.sub_domain_hi_z

        # Bind views
        self.x = self.system.x
        self.v = self.system.v
        self.f = self.system.f
        self.type = self.system.type
        self.id = self.system.id
        self.q = self.system.q

        # Views
        self.bincount: pk.View3D = self.t_bincount(
            self.nbinx, self.nbiny, self.nbinz, pk.int32)
        self.binoffsets: pk.View3D = self.t_binoffsets(
            self.nbinx, self.nbiny, self.nbinz, pk.int32)

        self.sort = sort
        self.permute_vector.resize(0, range_max - range_min)

        pk.execute(pk.ExecutionSpace.Default, self)
def update_halo(self) -> None:
    self.N_ghost = 0
    self.s = copy.copy(self.system)

    for self.phase in range(0, 6):
        self.pack_indicies = self.pack_indicies_all[self.phase, :]
        self.bind_views()
        self.workunit_id = 2
        self.update_threads = self.num_ghost[self.phase]
        pk.execute(pk.ExecutionSpace.Default, self)
        self.N_ghost += self.num_ghost[self.phase]
def test_add_for(self):
    initial_value: int = 5
    added_value: int = 7
    expected_result: int = initial_value + added_value

    workload = Add1DTestFor(self.threads, initial_value, added_value)
    pk.execute(pk.ExecutionSpace.Default, workload)

    for i in range(self.threads):
        result: int = workload.view[i]
        self.assertEqual(result, expected_result)
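# A minimal sketch (an assumption, not taken from the source) of what a workload
# like Add1DTestFor could look like, so the test above is easier to follow: a 1D
# view of `threads` entries, each set to initial_value and then incremented by
# added_value in a parallel_for. The class name carries a "Sketch" suffix to mark
# it as hypothetical.
import pykokkos as pk


@pk.workload
class Add1DTestForSketch:
    def __init__(self, threads: int, initial_value: int, added_value: int):
        self.threads: int = threads
        self.added_value: int = added_value
        self.view: pk.View1D[pk.int32] = pk.View([threads], pk.int32)
        for i in range(threads):
            self.view[i] = initial_value

    @pk.main
    def run(self):
        pk.parallel_for(self.threads, self.add)

    @pk.workunit
    def add(self, i: int):
        # each thread adds the constant to its own entry of the view
        self.view[i] += self.added_value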
def compute(self, system: System) -> float:
    self.v = system.v
    self.mass = system.mass
    self.type = system.type
    self.N_local = system.N_local

    pk.execute(pk.ExecutionSpace.Default, self)

    # Convert the accumulated sum of m*v^2 (self.T) into a temperature:
    # T = mvv2e * sum(m*v^2) / (dof * boltz), with 3N - 3 degrees of freedom.
    dof: int = 3 * system.N - 3
    factor: float = system.mvv2e / (1.0 * dof * system.boltz)
    self.comm.reduce_float(self.T, 1)

    return self.T * factor
def update_force(self) -> None:
    self.s = copy.copy(self.system)

    self.ghost_offsets[0] = self.s.N_local
    for self.phase in range(1, 6):
        self.ghost_offsets[self.phase] = (
            self.ghost_offsets[self.phase - 1] + self.num_ghost[self.phase - 1])

    for self.phase in range(5, -1, -1):
        self.pack_indicies = self.pack_indicies_all[self.phase, :]
        self.bind_views()
        self.workunit_id = 3
        self.force_threads = self.num_ghost[self.phase]
        pk.execute(pk.ExecutionSpace.Default, self)
def compute(self, system: System) -> float:
    self.v = system.v
    self.mass = system.mass
    self.type = system.type
    self.N_local = system.N_local

    pk.execute(pk.ExecutionSpace.Default, self)

    self.v = t_v(0, 3)
    self.mass = t_mass(0)
    self.type = t_type(0)

    # Kinetic energy: KE = 0.5 * mvv2e * sum(m*v^2), reduced across ranks.
    factor: float = 0.5 * system.mvv2e
    self.comm.reduce_float(self.KE, 1)

    return self.KE * factor
def compute(self, system: System, binning: Binning, neighbor: Neighbor) -> None:
    neigh_list: NeighList2D = neighbor.get_neigh_list()
    self.num_neighs_view: pk.View1D = neigh_list.num_neighs
    self.neighs_view: pk.View2D = neigh_list.neighs

    self.N_local = system.N_local
    self.x = system.x
    self.f = system.f
    self.f_a = system.f
    self.type = system.type
    self.id = system.id

    self.parallel_for = True
    pk.execute(pk.ExecutionSpace.Default, self)
    self.step += 1
def exchange_halo(self) -> None:
    self.N_local = self.system.N_local
    self.N_ghost = 0
    self.s = copy.copy(self.system)

    for self.phase in range(6):
        self.pack_indicies = self.pack_indicies_all[self.phase, :]

        count: int = 0
        self.pack_count[0] = 0

        sub: int = 0
        if self.phase % 2 == 1:
            sub = self.num_ghost[self.phase - 1]

        self.nparticles = self.N_local + self.N_ghost - sub
        self.bind_views()
        self.workunit_id = 1
        pk.execute(pk.ExecutionSpace.Default, self)
        count = self.pack_count[0]

        # If the particle or pack-index buffers were too small, grow them and
        # redo the pack for this phase.
        redo: bool = False
        if self.N_local + self.N_ghost + count > self.s.x.extent(0):
            self.system.grow(self.N_local + int(self.N_ghost) + int(count))
            self.s = copy.copy(self.system)
            redo = True

        if count > self.pack_indicies.extent(0):
            self.pack_indicies_all.resize(0, 6)
            self.pack_indicies_all.resize(1, int(count * 1.1))
            self.pack_indicies = self.pack_indicies_all[self.phase, :]
            redo = True

        if redo:
            self.pack_count[0] = 0
            self.workunit_id = 1
            self.bind_views()
            pk.execute(pk.ExecutionSpace.Default, self)

        self.num_ghost[self.phase] = count
        self.N_ghost += count

    self.system.N_ghost = self.N_ghost
def test_real(self):
    pk.set_default_precision(pk.int32)
    view: pk.View1D = pk.View([self.threads])

    self.assertTrue(view.dtype is pk.DataType.int32)
    self.assertTrue(
        pk.View._get_dtype_name(str(type(view.array))) == "int32")

    f = RealViewTestFunctor(view)
    w = RealViewTestWorkload(self.threads, view)
    pk.parallel_for(self.threads, f.pfor)
    pk.execute(pk.ExecutionSpace.Default, w)

    view.set_precision(pk.float)

    self.assertTrue(view.dtype is pk.DataType.float)
    self.assertTrue(
        pk.View._get_dtype_name(str(type(view.array))) == "float32")

    pk.parallel_for(self.threads, f.pfor)
    pk.execute(pk.ExecutionSpace.Default, w)
        self.mat: pk.View2D[pk.int32] = pk.View([r, c], pk.int32)
        for i in range(r):
            self.mat[i] = list(range(c * i, c * (i + 1)))

        for row in self.mat:
            print(row)
        print(f"Initialized {r}x{c} array")

    @pk.main
    def run(self):
        pk.parallel_for(self.r, self.sum_row)
        self.total = pk.parallel_reduce(self.r, self.final_sum)

    @pk.callback
    def results(self):
        print("Total =", self.total)

    @pk.workunit
    def sum_row(self, i: int):
        for j in range(1, self.c):
            self.mat[i][0] += self.mat[i][j]

    @pk.workunit
    def final_sum(self, i: int, accumulator: pk.Acc[pk.double]):
        accumulator += self.mat[i][0]


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, MatrixSum(5, 10))
    type=int, help="shared memory per team (used to control occupancy on GPUs)")
parser.add_argument("-space", "--execution_space", type=str)
args = parser.parse_args()

if args.P != 2:
    print("only support P=2")
    exit(1)
if args.U != 8:
    print("only support U=8")
    exit(1)
if args.D not in [1, 2, 4, 8, 16, 32]:
    print("D must be one of 1, 2, 4, 8, 16, 32")
    exit(1)
if args.S != 0:
    print("S must be 0 (shared scratch memory not supported)")
    exit(1)

space = pk.ExecutionSpace.OpenMP
if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)
pk.set_default_space(space)

args.N = 2 ** args.N
pk.execute(
    pk.get_default_space(),
    Benchmark_double_8(args.N, args.K, args.R, args.D, args.F, args.T, args.S))
import pykokkos as pk


@pk.workload
class SimpleSpaces:
    def __init__(self, n):
        self.N: int = n
        self.sum: int = 0
        self.a: pk.View2D[pk.int32] = pk.View([n, 3], pk.int32)
        for i in range(n):
            for j in range(3):
                self.a[i][j] = i * n + j

    @pk.main
    def run(self):
        self.sum = pk.parallel_reduce(self.N, self.reduction)

    @pk.callback
    def use_results(self):
        print(self.sum)

    @pk.workunit
    def reduction(self, i: int, acc: pk.Acc[pk.double]):
        acc += self.a[i][0] - self.a[i][1] + self.a[i][2]


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, SimpleSpaces(10))
import pykokkos as pk


@pk.workload
class HelloWorld:
    def __init__(self, n):
        self.N: int = n

    @pk.main
    def run(self):
        pk.parallel_for(self.N, lambda i: pk.printf("Hello from i = %i\n", i))


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, HelloWorld(10))
parser.add_argument('length', type=int)
parser.add_argument('offset', nargs='?', type=int, default=0)
parser.add_argument("-space", "--execution_space", type=str)
args = parser.parse_args()

iterations = args.iterations
length = args.length
offset = args.offset

if iterations < 1:
    sys.exit("ERROR: iterations must be >= 1")
if length <= 0:
    sys.exit("ERROR: vector length must be positive")
# emulate cpp example
if offset < 0:
    sys.exit("ERROR: offset must be nonnegative")

if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)
    pk.set_default_space(space)
# pk.enable_uvm()

length = 2 ** length
print("Number of iterations = ", iterations)
print("Vector length = ", length)
print("Offset = ", offset)

pk.execute(pk.ExecutionSpace.Default, main(iterations, length, offset))
permute = args.permute

if iterations < 1:
    sys.exit("ERROR: iterations must be >= 1")
if order <= 0:
    sys.exit("ERROR: Matrix Order must be greater than 0")
elif order > 46340:
    sys.exit("ERROR: matrix dimension too large - overflow risk")

# a negative tile size means no tiling of the local transpose
if tile_size <= 0:
    tile_size = order

if permute != 0 and permute != 1:
    sys.exit("ERROR: permute must be 0 (no) or 1 (yes)")

if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)
    pk.set_default_space(space)
# pk.enable_uvm()

order = 2 ** order
print("Number of iterations = ", iterations)
print("Matrix order = ", order)
print("Tile size = ", tile_size)
print("Permute loops = ", "yes" if permute else "no")

pk.execute(pk.ExecutionSpace.Default, main(iterations, order, tile_size, permute))
            self.data[i] = random.randint(0, n)

    @pk.main
    def run(self):
        pk.parallel_for(self.N, self.findprimes)

    @pk.callback
    def results(self):
        for i in range(int(self.count[0])):
            print(int(self.result[i]), end=", ")
        print("\nFound", int(self.count[0]), "prime numbers in", self.N, "random numbers")

    @pk.workunit
    def findprimes(self, i: int):
        number: int = self.data[i]
        upper_bound: int = math.sqrt(number) + 1
        is_prime: bool = not (number % 2 == 0)

        k: int = 3
        idx: int = 0
        while k < upper_bound and is_prime:
            is_prime = not (number % k == 0)
            k += 2

        if is_prime:
            idx = self.count[0] = self.count[0] + 1
            self.result[idx - 1] = number


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, SimpleAtomics(100))
parser = argparse.ArgumentParser()
parser.add_argument("S", type=int,
                    help="Scalar Type Size (1==float, 2==double, 4==complex<double>)")
parser.add_argument("N", type=int, help="Number of Entities")
parser.add_argument("K", type=int, help="Number of things to gather per entity")
parser.add_argument("D", type=int, help="Max distance of gathered things of an entity")
parser.add_argument("R", type=int,
                    help="how often to loop through the K dimension with each team")
parser.add_argument("U", type=int, help="how many independent flops to do per load")
parser.add_argument("F", type=int,
                    help="how many times to repeat the U unrolled operations before reading next element")
parser.add_argument("-space", "--execution_space", type=str)
args = parser.parse_args()

if args.S != 2:
    print("only support S=2")
    exit(1)
if args.U != 8:
    print("only support U=8")
    exit(1)
if 2 ** args.N < args.D:
    print("N must be larger or equal to D")
    exit(1)

space = pk.ExecutionSpace.OpenMP
if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)
pk.set_default_space(space)

n = 2 ** args.N
pk.execute(pk.get_default_space(),
           Benchmark_double_8(n, args.K, args.D, args.R, args.F))
elif n > 46340:
    sys.exit("ERROR: grid dimension too large - overflow risk")

# default tile size for tiling of local transpose
tile_size = 32
# if tile_size <= 0:
#     tile_size = n
# if tile_size > n:
#     tile_size = n

# stencil pattern
star = False if (stencil == "grid") else True

if (radius < 1) or (2 * radius + 1 > n):
    sys.exit("ERROR: Stencil radius negative or too large")

if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)
    pk.set_default_space(space)
# pk.enable_uvm()

n = 2 ** n
print("Number of iterations = ", iterations)
print("Grid size = ", n)
print("Tile size = ", tile_size)
print("Type of stencil = ", "star" if star else "grid")
print("Radius of stencil = ", radius)

pk.execute(pk.ExecutionSpace.Default, main(iterations, n, tile_size, star, radius))
import pykokkos as pk


@pk.workload
class SquareSum:
    def __init__(self, n):
        self.N: int = n
        self.total: int = 0

    @pk.main
    def run(self):
        self.total = pk.parallel_reduce(self.N, self.squaresum)

    @pk.callback
    def results(self):
        print("Sum:", self.total)

    @pk.workunit
    def squaresum(self, i: int, acc: pk.Acc[pk.double]):
        acc += i * i


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, SquareSum(10))
import random

import pykokkos as pk


@pk.workload
class RandomSum:
    def __init__(self, n):
        self.N: int = n
        self.total: int = 0
        self.a: pk.View1D[pk.int32] = pk.View([n], pk.int32)
        for i in range(self.N):
            self.a[i] = random.randint(0, 10)
        print("Initialized view:", self.a)

    @pk.main
    def run(self):
        self.total = pk.parallel_reduce(self.N, self.my_reduction)

    @pk.callback
    def results(self):
        print("Sum:", self.total)

    @pk.workunit
    def my_reduction(self, i: int, accumulator: pk.Acc[pk.int32]):
        accumulator += self.a[i]


if __name__ == "__main__":
    n = 10
    pk.execute(pk.ExecutionSpace.OpenMP, RandomSum(n))
            team_acc += self.y[e][j] * tempM

        tempN: float = pk.parallel_reduce(
            pk.TeamThreadRange(team_member, self.N), team_reduce)

        def single_closure():
            nonlocal acc
            acc += tempN

        pk.single(pk.PerTeam(team_member), single_closure)


if __name__ == "__main__":
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    E: int = values[3]
    nrepeat: int = values[4]
    fill: bool = values[-1]
    space: str = values[-2]

    if space == "":
        space = pk.ExecutionSpace.OpenMP
    else:
        space = pk.ExecutionSpace(space)

    pk.set_default_space(space)

    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")
    pk.execute(pk.get_default_space(), Workload(N, M, E, nrepeat, fill))
    def __init__(self, N: int):
        self.N: int = N
        self.A: pk.View1D[pk.int32] = pk.View([N], pk.int32)
        self.result: int = 0
        self.timer_result: float = 0

    @pk.main
    def run(self):
        pk.parallel_for(self.N, lambda i: i, self.A)

        timer = pk.Timer()
        self.result = pk.parallel_scan(self.N, self.scan)
        self.timer_result = timer.seconds()

    @pk.callback
    def results(self):
        print(f"{self.A} total={self.result} time({self.timer_result})")

    @pk.workunit
    def scan(self, i: int, acc: pk.Acc[pk.double], last_pass: bool):
        acc += self.A[i]
        if last_pass:
            self.A[i] = acc


if __name__ == "__main__":
    pk.execute(pk.ExecutionSpace.OpenMP, Workload(10))
import math

import pykokkos as pk


@pk.workload
class Math:
    def __init__(self, n):
        self.N: int = n
        self.a: pk.View1D[pk.int32] = pk.View([n], pk.int32)
        for i in range(self.N):
            self.a[i] = math.sqrt(math.tau)
        print("Initialized view:", self.a)

    @pk.main
    def run(self):
        pk.parallel_for(self.N, self.my_calculation)

    @pk.callback
    def results(self):
        print("Results: ", self.a)

    @pk.workunit
    def my_calculation(self, i: int):
        pk.printf("Running index %d\n", i)
        self.a[i] += (math.cos(self.a[i]) + 2**i
                      - math.pi / math.fabs(self.a[(i + 1) % self.N]))


if __name__ == "__main__":
    n = 10
    pk.execute(pk.ExecutionSpace.OpenMP, Math(n))
import pykokkos as pk


@pk.workload
class AddOne:
    def __init__(self, n):
        self.N: int = n
        self.a: pk.View1D[pk.int32] = pk.View([n], pk.int32)
        for i in range(self.N):
            self.a[i] = 2
        print(f"Initialized view: [{self.a[0]}, ... repeats {n - 1} times]")

    @pk.main
    def run(self):
        y: int = 1
        pk.parallel_for(self.N, lambda i: self.a[i] + y, self.a)

    @pk.callback
    def results(self):
        print(f"Results: [{self.a[0]}, ... repeats {self.N - 1} times]")


if __name__ == "__main__":
    n = 100 * 1000
    pk.execute(pk.ExecutionSpace.OpenMP, AddOne(n))