def run() -> None:
    """Benchmark the inner product y^T * A * x using free-standing workunits.

    Problem sizes come from parse_args(); only the first two values
    (N rows, M columns) are consumed here. A is stored as a flat 1-D
    view of length N*M.
    """
    cli_values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = cli_values[0]
    M: int = cli_values[1]
    nrepeat: int = 1
    print(f"Total size S = {N * M} N = {N} M = {M}")

    # Allocate the vectors and the flattened matrix.
    y = pk.View([N], pk.double)
    x = pk.View([M], pk.double)
    A = pk.View([N * M], pk.double)

    row_policy = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(row_policy, y_init, y=y)
    # x is initialized by reusing the y_init workunit over M elements.
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(row_policy, matrix_init, M=M, A=A)

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(row_policy, yAx, M=M, y=y, x=x, A=A)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    # With every element initialized to 1 the reduction must equal N*M.
    solution = N * M
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)
    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
def run() -> None:
    """Benchmark y^T * A * x with the workunits bundled in a Workload class."""
    cli_values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = cli_values[0]
    M: int = cli_values[1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    row_policy = pk.RangePolicy(pk.get_default_space(), 0, N)
    workload = Workload(N, M)

    # Initialize y, x and A in parallel; the views live inside the workload.
    pk.parallel_for(row_policy, workload.y_init)
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), workload.x_init)
    pk.parallel_for(row_policy, workload.matrix_init)

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(row_policy, workload.yAx)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    # All elements are ones, so the expected reduction value is N*M.
    solution = N * M
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)
    print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
def run() -> None:
    """Run the GUPS (giga-updates-per-second) random-access benchmark.

    Views and workunits live in a Benchmark instance; the gather indices
    are re-randomized on the host before every timed repeat. Defaults can
    be overridden with --indices/--data/--repeats/--atomics/--execution_space.
    """
    random.seed(1010101)

    # Defaults: 8192 indices into 33554432 data elements, 10 repeats.
    indices = 8192
    data = 33554432
    repeats = 10
    space = pk.ExecutionSpace.OpenMP

    parser = argparse.ArgumentParser()
    parser.add_argument("--indices", type=int)
    parser.add_argument("--data", type=int)
    parser.add_argument("--repeats", type=int)
    parser.add_argument("--atomics", action="store_true")
    parser.add_argument("--execution_space", type=str)
    args = parser.parse_args()
    if args.indices:
        indices = args.indices
    if args.data:
        data = args.data
    if args.repeats:
        repeats = args.repeats
    use_atomics = args.atomics
    if args.execution_space:
        space = pk.ExecutionSpace(args.execution_space)
    pk.set_default_space(space)

    w = Benchmark(indices, data, repeats, use_atomics)
    range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices)
    range_data = pk.RangePolicy(pk.get_default_space(), 0, data)

    print("Reports fastest timing per kernel")
    print("Creating Views...")
    print("Memory Sizes:")
    print(f"- Elements: {data} ({1e-6*data*8} MB)")
    print(f"- Indices: {indices} ({1e-6*indices*8} MB)")
    print(f"- Atomics: {'yes' if use_atomics else 'no'}")
    print(f"Benchmark kernels will be performed for {repeats} iterations")

    print("Initializing Views...")
    pk.parallel_for(range_data, w.init_data)
    pk.parallel_for(range_indices, w.init_indices)

    print("Starting benchmarking...")
    timer = pk.Timer()
    for _ in range(repeats):
        # FIX: the inner loop variable used to be `i`, shadowing the outer
        # repeat counter (also `i`). Behavior was unaffected only because the
        # outer counter was never read afterwards; renamed for clarity.
        for idx in range(indices):
            w.indices[idx] = random.randrange(data)
        if use_atomics:
            pk.parallel_for(range_indices, w.run_gups_atomic)
        else:
            pk.parallel_for(range_indices, w.run_gups)
    gupsTime = timer.seconds()

    # NOTE(review): the timer also includes host-side index regeneration,
    # not just the kernel time — presumably intentional; confirm.
    print(f"GUP/s Random: {1e-9 * repeats * indices / gupsTime}")
    print(w.data)
def run() -> None:
    """Benchmark y^T * A * x on the CUDA execution space with a 2-D matrix view.

    Problem sizes come from parse_args(); N rows, M columns. Initialization
    is done on-device via the y_init/matrix_init workunits.
    """
    cli_values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = cli_values[0]
    M: int = cli_values[1]
    # NOTE(review): `fill` is parsed but never used in this variant — the
    # parallel workunits below always perform the initialization.
    fill: bool = cli_values[-1]
    nrepeat: int = 100
    print(f"Total size S = {N * M} N = {N} M = {M}")

    # This variant pins the default execution space to CUDA.
    pk.set_default_space(pk.ExecutionSpace.Cuda)
    y: pk.View1D = pk.View([N], pk.double)
    x: pk.View1D = pk.View([M], pk.double)
    A: pk.View2D = pk.View([N, M], pk.double)

    row_policy = pk.RangePolicy(pk.get_default_space(), 0, N)
    pk.parallel_for(row_policy, y_init, y=y)
    # x is initialized by reusing the y_init workunit over M elements.
    pk.parallel_for(pk.RangePolicy(pk.get_default_space(), 0, M), y_init, y=x)
    pk.parallel_for(row_policy, matrix_init, M=M, A=A)
    # (Removed a stale commented-out host-side fill/loop initialization block;
    # the parallel_for calls above are the live initialization path.)

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(row_policy, yAx, M=M, y=y, x=x, A=A)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} is {result}")
    # All elements are ones, so the expected reduction value is N*M.
    solution: float = N * M
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)
    print(
        f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)"
    )
def run() -> None:
    """Benchmark batched y^T * A * x over E instances using a team policy.

    The Workload instance owns the views; one team is launched per batch
    entry E, with automatic team size and vector length 32.
    """
    cli_values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = cli_values[0]
    M: int = cli_values[1]
    E: int = cli_values[3]
    fill: bool = cli_values[-1]
    nrepeat: int = 1000
    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")

    workload = Workload(N, M, E, fill)
    team_policy = pk.TeamPolicy(E, "auto", 32, pk.get_default_space())

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(team_policy, workload.yAx)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} x {E} is {result}")
    # With unit initialization the reduction must equal N*M*E.
    solution: float = N * M * E
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)
    print(
        f"N({N}) M({M}) E({E}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)"
    )
def run() -> None:
    """Benchmark batched y^T * A * x (E instances) with locally allocated views.

    Views are created here and passed to the yAx workunit by keyword.
    `fill` selects device-side bulk fill versus element-wise host init.
    """
    cli_values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = cli_values[0]
    M: int = cli_values[1]
    E: int = cli_values[3]
    fill: bool = cli_values[-1]
    nrepeat: int = 1000
    print(f"Total size S = {N * M} N = {N} M = {M} E = {E}")

    # One (y, x, A) triple per batch entry e, stored row-major.
    y: pk.View2D = pk.View([E, N], pk.double, layout=pk.Layout.LayoutRight)
    x: pk.View2D = pk.View([E, M], pk.double, layout=pk.Layout.LayoutRight)
    A: pk.View3D = pk.View([E, N, M], pk.double, layout=pk.Layout.LayoutRight)

    if fill:
        # Fast path: bulk fill of every view with ones.
        y.fill(1)
        x.fill(1)
        A.fill(1)
    else:
        # Slow path: element-wise host-side initialization.
        for e in range(E):
            for row in range(N):
                y[e][row] = 1
            for col in range(M):
                x[e][col] = 1
            for row in range(N):
                for col in range(M):
                    A[e][row][col] = 1

    team_policy = pk.TeamPolicy(E, "auto", 32, pk.get_default_space())

    timer = pk.Timer()
    for _ in range(nrepeat):
        result = pk.parallel_reduce(team_policy, yAx, N=N, M=M, y=y, x=x, A=A)
    timer_result = timer.seconds()

    print(f"Computed result for {N} x {M} x {E} is {result}")
    # With unit initialization the reduction must equal N*M*E.
    solution: float = N * M * E
    if result != solution:
        pk.printf("Error: result (%lf) != solution (%lf)\n", result, solution)
    print(f"N({N}) M({M}) E({E}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)")
# NOTE(review): this chunk is truncated — the final print(...) expression ends
# in a dangling "+" whose right-hand operand lies outside this view, so the code
# is left byte-identical rather than reformatted.
# Visible behavior: select the execution space from CLI args, run w.benchmark
# R times under a RangePolicy with a fence after each launch, then derive
# num_bytes/flops/gather_ops figures from N, K, R, U, F for the report line.
# NOTE(review): `seconds = seconds` is a no-op self-assignment; `gather_ops`
# is computed but not used in the visible portion — confirm against the
# missing tail before removing either.
space = pk.ExecutionSpace.OpenMP if args.execution_space: space = pk.ExecutionSpace(args.execution_space) pk.set_default_space(space) N = args.N K = args.K D = args.D R = args.R U = args.U F = args.F scalar_size = 8 policy = pk.RangePolicy(pk.get_default_space(), 0, N) w = Benchmark_double_8(N, K, D, R, F) timer = pk.Timer() for r in range(R): pk.parallel_for(policy, w.benchmark) pk.fence() seconds = timer.seconds() num_bytes = 1.0 * N * K * R * (2 * scalar_size + 4) + N * R * scalar_size flops = 1.0 * N * K * R * (F * 2 * U + 2 * (U - 1)) gather_ops = 1.0 * N * K * R * 2 seconds = seconds print( f"SNKDRUF: {scalar_size/4} {N} {K} {D} {R} {U} {F} Time: {seconds} " +
# NOTE(review): this chunk begins mid-workunit — the enclosing method header and
# the definitions of `team_acc`, `tempM`, `e`, `team_member` and `acc` are not
# visible, so the code is left byte-identical rather than reformatted.
# Visible behavior: a nested TeamThreadRange reduction produces tempN, which a
# single lane per team folds into the outer accumulator via pk.single; the
# trailing __main__ block parses sizes, resolves the execution space (OpenMP
# when unspecified) and dispatches Workload(N, M, E, nrepeat, fill).
team_acc += self.y[e][j] * tempM tempN: float = pk.parallel_reduce( pk.TeamThreadRange(team_member, self.N), team_reduce) def single_closure(): nonlocal acc acc += tempN pk.single(pk.PerTeam(team_member), single_closure) if __name__ == "__main__": values: Tuple[int, int, int, int, int, bool] = parse_args() N: int = values[0] M: int = values[1] E: int = values[3] nrepeat: int = values[4] fill: bool = values[-1] space: str = values[-2] if space == "": space = pk.ExecutionSpace.OpenMP else: space = pk.ExecutionSpace(space) pk.set_default_space(space) print(f"Total size S = {N * M} N = {N} M = {M} E = {E}") pk.execute(pk.get_default_space(), Workload(N, M, E, nrepeat, fill))
# NOTE(review): this chunk is truncated at both ends — it opens inside an
# `if args.indices:` body (the parser setup and defaults are not visible) and
# stops right after "Starting benchmarking..." before any timing loop, so the
# code is left byte-identical rather than reformatted.
# Visible behavior: apply CLI overrides, pick the execution space, allocate
# int64 index/data views, build range policies, report sizes, and initialize
# both views with the free-standing init_data/init_indices workunits.
# NOTE(review): `datum` is assigned -1 but unused in the visible portion.
indices = args.indices if args.data: data = args.data if args.repeats: repeats = args.repeats use_atomics = args.atomics if args.execution_space: space = pk.ExecutionSpace(args.execution_space) pk.set_default_space(space) indices_view: pk.View1D[pk.int64] = pk.View([indices], pk.int64) data_view: pk.View1D[pk.int64] = pk.View([data], pk.int64) datum: pk.int64 = -1 range_indices = pk.RangePolicy(pk.get_default_space(), 0, indices) range_data = pk.RangePolicy(pk.get_default_space(), 0, data) print("Reports fastest timing per kernel") print("Creating Views...") print("Memory Sizes:") print(f"- Elements: {data} ({1e-6*data*8} MB)") print(f"- Indices: {indices} ({1e-6*indices*8} MB)") print(f"- Atomics: {'yes' if use_atomics else 'no'}") print(f"Benchmark kernels will be performed for {repeats} iterations") print("Initializing Views...") pk.parallel_for(range_data, init_data, data=data_view) pk.parallel_for(range_indices, init_indices, indices=indices_view) print("Starting benchmarking...")
@pk.workunit
def init_A(self, j: int, i: int):
    """Workunit: set matrix entry A[j][i] to one."""
    self.A[j][i] = 1

@pk.workunit
def yAx(self, j: int, acc: pk.Acc[float]):
    """Workunit: accumulate y[j] * (row j of A dotted with x) into acc."""
    row_dot: float = 0
    for col in range(self.M):
        row_dot += self.A[j][col] * self.x[col]
    acc += self.y[j] * row_dot

if __name__ == "__main__":
    # Read problem sizes, resolve the execution space (OpenMP when none is
    # given on the command line), and dispatch the workload.
    values: Tuple[int, int, int, int, int, bool] = parse_args()
    N: int = values[0]
    M: int = values[1]
    nrepeat: int = values[4]
    space: str = values[-2]
    if space == "":
        space = pk.ExecutionSpace.OpenMP
    else:
        space = pk.ExecutionSpace(space)
    pk.set_default_space(space)
    print(f"Total size S = {N * M} N = {N} M = {M}")
    pk.execute(pk.get_default_space(), Workload(N, M, nrepeat))
print("S must be 0 (shared scratch memory not supported)") exit(1) space = pk.ExecutionSpace.OpenMP if args.execution_space: space = pk.ExecutionSpace(args.execution_space) N = args.N K = args.K R = args.R U = args.U F = args.F T = args.T S = args.S scalar_size = 8 pk.set_default_space(space) r = pk.TeamPolicy(N, T, space=pk.get_default_space()) w = Benchmark_double_8(N, K, R, args.D, F, T, S) timer = pk.Timer() pk.parallel_for(r, w.benchmark) seconds = timer.seconds() num_bytes = 1.0 * N * K * R * 3 * scalar_size flops = 1.0 * N * K * R * (F * 2 * U + 2 * (U - 1)) print(f"NKRUFTS: {N} {K} {R} {U} {F} {T} {S} Time: {seconds} " + f"Bandwidth: {1.0 * num_bytes / seconds / (1024**3)} GiB/s GFlop/s: {1e-9 * flops / seconds}") print(w.C)
# NOTE(review): this chunk begins mid-call — it opens with the trailing
# arguments of a parser.add_argument(...) whose start is not visible, so the
# code is left byte-identical rather than reformatted.
# Visible behavior: finish the argparse setup, validate the supported
# parameter combinations (P==2, U==8, D in {1,2,4,8,16,32}, S==0), resolve the
# execution space, rescale N from an exponent to 2**N, and dispatch the
# Benchmark_double_8 workload.
type=int, help="shared memory per team (used to control occupancy on GPUs)") parser.add_argument("-space", "--execution_space", type=str) args = parser.parse_args() if args.P != 2: print("only support P=2") exit(1) if args.U != 8: print("only support U=8") exit(1) if args.D not in [1, 2, 4, 8, 16, 32]: print("D must be one of 1, 2, 4, 8, 16, 32") exit(1) if args.S != 0: print("S must be 0 (shared scratch memory not supported)") exit(1) space = pk.ExecutionSpace.OpenMP if args.execution_space: space = pk.ExecutionSpace(args.execution_space) pk.set_default_space(space) args.N = 2**args.N pk.execute( pk.get_default_space(), Benchmark_double_8(args.N, args.K, args.R, args.D, args.F, args.T, args.S))
# Command-line front end for the gather benchmark (double precision, U=8).
# Positional sizes mirror the Kokkos gather benchmark; N is an exponent of two.
parser = argparse.ArgumentParser()
parser.add_argument("S", type=int, help="Scalar Type Size (1==float, 2==double, 4==complex<double>)")
parser.add_argument("N", type=int, help="Number of Entities")
parser.add_argument("K", type=int, help="Number of things to gather per entity")
parser.add_argument("D", type=int, help="Max distance of gathered things of an entity")
parser.add_argument("R", type=int, help="how often to loop through the K dimension with each team")
parser.add_argument("U", type=int, help="how many independent flops to do per load")
parser.add_argument("F", type=int, help="how many times to repeat the U unrolled operations before reading next element")
parser.add_argument("-space", "--execution_space", type=str)
args = parser.parse_args()

# Only the double-precision, 8-way-unrolled kernel variant is implemented.
if args.S != 2:
    print("only support S=2")
    exit(1)
if args.U != 8:
    print("only support U=8")
    exit(1)
# N is an exponent: the entity count is 2**N and must be able to cover D.
if 2 ** args.N < args.D:
    # FIX: the message previously read "N must be larger or equal to D",
    # which did not match the condition actually checked (2**N vs D).
    print("2**N must be larger or equal to D")
    exit(1)

# Default to OpenMP unless an execution space was requested explicitly.
space = pk.ExecutionSpace.OpenMP
if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)
pk.set_default_space(space)

n = 2 ** args.N
pk.execute(pk.get_default_space(), Benchmark_double_8(n, args.K, args.D, args.R, args.F))
# Command-line front end for the class-based GUPS benchmark.
# Seed the RNG so host-side index generation is reproducible across runs.
random.seed(1010101)

# Defaults (already exponentiated): 2**13 indices, 2**25 data elements.
indices = 8192
data = 33554432
repeats = 10
space = pk.ExecutionSpace.OpenMP

parser = argparse.ArgumentParser()
parser.add_argument("--indices", type=int)
parser.add_argument("--data", type=int)
parser.add_argument("--repeats", type=int)
parser.add_argument("--atomics", action="store_true")
parser.add_argument("-space", "--execution_space", type=str)
args = parser.parse_args()

# --indices and --data are supplied as exponents of two.
if args.indices:
    indices = 2 ** args.indices
if args.data:
    data = 2 ** args.data
if args.repeats:
    repeats = args.repeats
use_atomics = args.atomics
if args.execution_space:
    space = pk.ExecutionSpace(args.execution_space)
pk.set_default_space(space)

pk.execute(pk.get_default_space(), Benchmark(indices, data, repeats, use_atomics))