def tune():
    """Tune the ``time_vector_add`` Fortran kernel once per compiler.

    The same arguments and tunable parameters (``N`` and ``NTHREADS``) are
    handed to ``tune_kernel`` three times, each time with a different
    ``compiler`` value: ``ftn``, ``gfortran`` and ``pgfortran``.

    Returns:
        The first element of the pair returned by the last ``tune_kernel``
        call (the ``pgfortran`` run).
    """
    size = int(80e6)

    # input/output vectors for c = a + b
    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(b)
    n = np.int32(size)
    args = [c, a, b, n]

    tune_params = {"N": [size], "NTHREADS": [16, 8, 4, 2, 1]}

    # (message printed before the run, compiler passed to tune_kernel)
    compiler_runs = (
        ("compile with ftn using intel on cray", "ftn"),
        ("compile with gfortran", "gfortran"),
        ("compile with pgfortran", "pgfortran"),
    )
    for banner, compiler in compiler_runs:
        print(banner)
        result, _ = tune_kernel("time_vector_add", "vector_add.F90", size, args,
                                tune_params, lang="C", compiler=compiler)

    return result
def tune_expdist():
    """Tune the ``ExpDist`` kernel and then the ``reduce_cross_term`` kernel.

    Both kernels are read from ``expdist.cu``. The output of the first
    ``tune_kernel`` call is written to ``expdist.json``; the fastest
    configuration of each kernel is printed together with the summed time.
    """
    # tunable parameters of the first kernel
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**exp for exp in range(5, 10)]
    tune_params["block_size_y"] = [2**exp for exp in range(6)]
    tune_params["tile_size_x"] = [2**exp for exp in range(4)]
    tune_params["tile_size_y"] = [2**exp for exp in range(4)]
    tune_params["use_shared_mem"] = [0, 1]

    # test input: arrays are allocated larger than the part that is used
    alloc_size = 3000
    size = numpy.int32(2000)

    def count_blocks(div_x, div_y):
        """Thread blocks needed when each block covers div_x by div_y items."""
        return numpy.int32(numpy.ceil(size / float(div_x)) * numpy.ceil(size / float(div_y)))

    # the smallest block dimensions give the largest possible grid
    max_blocks = count_blocks(numpy.amin(tune_params["block_size_x"]),
                              numpy.amin(tune_params["block_size_y"]))
    ndim = numpy.int32(2)

    A = numpy.random.randn(alloc_size*ndim).astype(numpy.float64)
    B = A+0.00001*numpy.random.randn(alloc_size*ndim).astype(numpy.float64)
    scale_A = numpy.absolute(0.01*numpy.random.randn(alloc_size).astype(numpy.float64))
    scale_B = numpy.absolute(0.01*numpy.random.randn(alloc_size).astype(numpy.float64))
    cost = numpy.zeros((max_blocks)).astype(numpy.float64)

    # kernel source
    with open('expdist.cu', 'r') as f:
        kernel_string = f.read()

    arguments = [A, B, size, size, scale_A, scale_B, cost]

    # tune using the Noodles runner for parallel tuning using 8 threads
    kernel1 = tune_kernel("ExpDist", kernel_string, (size, size), arguments, tune_params,
                          grid_div_x=["block_size_x", "tile_size_x"],
                          grid_div_y=["block_size_y", "tile_size_y"],
                          num_threads=8, use_noodles=True, verbose=True)

    # dump the tuning results to a json file
    with open("expdist.json", 'w') as fp:
        json.dump(kernel1, fp)

    # number of blocks used by the best configuration of the first kernel
    best_config1 = min(kernel1[0], key=lambda cfg: cfg['time'])
    nblocks = count_blocks(best_config1["block_size_x"]*best_config1["tile_size_x"],
                           best_config1["block_size_y"]*best_config1["tile_size_y"])

    # tunable parameters for the second kernel
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*mult for mult in range(1, 33)]

    # tune the second kernel, again in parallel with 8 threads
    arguments = [numpy.zeros(1).astype(numpy.float64), cost, size, size, nblocks]
    kernel2 = tune_kernel("reduce_cross_term", kernel_string, 1, arguments, tune_params,
                          grid_div_x=[], num_threads=8, use_noodles=True, verbose=True)

    best_config2 = min(kernel2[0], key=lambda cfg: cfg['time'])

    print("best GPU configuration, total time=", best_config1['time'] + best_config2['time'])
    print(best_config1)
    print(best_config2)
def tune():
    """Tune ``convolution_kernel`` as a separable convolution (row, then column).

    The kernel source is read from ``convolution.cu``. The row pass pins
    ``filter_height`` and ``tile_size_y`` to ``[1]``; the column pass pins
    ``filter_width`` and ``tile_size_x`` to ``[1]`` and searches the vertical
    parameters instead.

    Returns:
        tuple: ``(results_row, results_col)``, the values returned by the two
        ``tune_kernel`` calls.
    """
    with open('convolution.cu', 'r') as f:
        kernel_string = f.read()

    # setup tunable parameters
    tune_params = OrderedDict()
    tune_params["filter_height"] = list(range(3, 19, 2))
    tune_params["filter_width"] = list(range(3, 19, 2))
    tune_params["block_size_x"] = [16*i for i in range(1, 65)]
    tune_params["block_size_y"] = [2**i for i in range(6)]
    tune_params["tile_size_x"] = list(range(1, 11))
    tune_params["tile_size_y"] = list(range(1, 11))

    tune_params["use_padding"] = [0, 1]  # toggle the insertion of padding in shared memory
    tune_params["read_only"] = [0, 1]    # toggle using the read-only cache

    # limit the search to only use padding when it is effective,
    # and to at least 32 threads in a block
    restrict = ["use_padding==0 or (block_size_x % 32 != 0)", "block_size_x*block_size_y >= 32"]

    # setup input and output dimensions; the input is sized for the largest filter
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)
    largest_fh = max(tune_params["filter_height"])
    largest_fw = max(tune_params["filter_width"])
    input_size = ((problem_size[0]+largest_fw-1) * (problem_size[1]+largest_fh-1))

    # create input data
    output_image = numpy.zeros(size).astype(numpy.float32)
    input_image = numpy.random.randn(input_size).astype(numpy.float32)
    filter_weights = numpy.random.randn(largest_fh * largest_fw).astype(numpy.float32)

    # setup kernel arguments
    cmem_args = {'d_filter': filter_weights}
    args = [output_image, input_image, filter_weights]

    # tell the Kernel Tuner how to compute grid dimensions
    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    # start tuning separable convolution (row)
    tune_params["filter_height"] = [1]
    tune_params["tile_size_y"] = [1]
    results_row = tune_kernel("convolution_kernel", kernel_string, problem_size, args, tune_params,
                              grid_div_y=grid_div_y, grid_div_x=grid_div_x, cmem_args=cmem_args,
                              verbose=True, restrictions=restrict)

    # start tuning separable convolution (col)
    tune_params["filter_height"] = tune_params["filter_width"][:]
    # give tile_size_y its search range back (it was pinned to [1] for the
    # row pass) before pinning the horizontal parameters
    tune_params["tile_size_y"] = tune_params["tile_size_x"][:]
    tune_params["filter_width"] = [1]
    tune_params["tile_size_x"] = [1]
    results_col = tune_kernel("convolution_kernel", kernel_string, problem_size, args, tune_params,
                              grid_div_y=grid_div_y, grid_div_x=grid_div_x, cmem_args=cmem_args,
                              verbose=True, restrictions=restrict)

    return results_row, results_col
def tune_correlate_full_kernel(kernel_name):
    """Tune ``kernel_name`` from ``correlate_full.cu`` in its two modes.

    First the kernel is run once with ``write_sums=1`` to obtain the per-hit
    sums, then it is tuned in that mode. The sums are used to size the index
    arrays, after which the kernel is tuned again with ``write_spm=1``.

    Args:
        kernel_name: name of the kernel passed to ``run_kernel``/``tune_kernel``.

    Returns:
        tuple: ``(kernel_1, kernel_2)``, the values returned by the two
        ``tune_kernel`` calls.
    """
    with open(get_kernel_path()+'correlate_full.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(1e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    # generate input data with an expected density of correlated hits
    x, y, z, ct = generate_input_data(N, factor=1750.0)

    def build_args(rows, cols, prefix, hit_sums):
        """Assemble the kernel argument list in the order the kernel expects."""
        return [rows, cols, prefix, hit_sums, N, sliding_window_width, x, y, z, ct]

    # the first three arrays are not used in the first kernel mode,
    # so small placeholders are sufficient
    row_idx = np.zeros(10).astype(np.int32)
    col_idx = np.zeros(10).astype(np.int32)
    prefix_sums = np.zeros(10).astype(np.int32)
    sums = np.zeros(N).astype(np.int32)
    args = build_args(row_idx, col_idx, prefix_sums, sums)

    # run the sums kernel once and keep its fourth output
    answer = run_kernel(kernel_name, kernel_string, problem_size, args,
                        {"block_size_x": 256, "write_sums": 1})
    sums = answer[3].astype(np.int32)

    # setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*mult for mult in range(1, 33)]  # multiples of 32
    tune_params["write_sums"] = [1]
    tune_params["write_spm"] = [0]

    kernel_1 = tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params, verbose=True)

    # tune kernel #2
    total_correlated_hits = sums.sum()
    print("total_correlated_hits", total_correlated_hits)
    print("density", total_correlated_hits/(float(N)*sliding_window_width))

    col_idx = np.zeros(total_correlated_hits).astype(np.int32)
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    prefix_sums = np.cumsum(sums).astype(np.int32)
    args = build_args(row_idx, col_idx, prefix_sums, sums)

    tune_params["write_sums"] = [0]
    tune_params["write_spm"] = [1]

    kernel_2 = tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params, verbose=True)

    return kernel_1, kernel_2
def tune_degrees_dense():
    """Tune the ``degrees_dense`` kernel from ``degrees.cu``.

    The input table is produced by running ``quadratic_difference_linear``
    once; its first two outputs are used as ``correlations`` and ``sums``.

    Returns:
        The value returned by ``tune_kernel``.
    """
    with open(get_kernel_path()+'degrees.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    # generate input data with an expected density of correlated hits
    x, y, z, ct = generate_input_data(N)

    # buffers filled by the quadratic difference kernel
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    qd_args = [correlations, sums, N, sliding_window_width, x, y, z, ct]

    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        qd_string = f.read()

    data = run_kernel("quadratic_difference_linear", qd_string, problem_size, qd_args,
                      {"block_size_x": 512, "write_sums": 1})
    correlations, sums = data[0], data[1]  # sums: partial sum of the # of correlated hits to hits later in time

    # setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**exp for exp in range(5, 11)]
    tune_params["window_width"] = [sliding_window_width]

    return tune_kernel("degrees_dense", kernel_string, problem_size,
                       [sums, correlations, N], tune_params, verbose=True)
def test_sequential_runner_alt_block_size_names():
    """Tune a kernel whose thread-block dimension is named ``block_dim_x``.

    The alternative name is passed to ``tune_kernel`` via
    ``block_size_names``; the test expects one result per ``block_dim_x``
    value.
    """
    kernel_string = (
        "__global__ void vector_add(float *c, float *a, float *b, int n) { "
        "int i = blockIdx.x * block_dim_x + threadIdx.x; "
        "if (i<n) { c[i] = a[i] + b[i]; } } "
    )

    c, a, b, n = get_vector_add_args()
    args = [c, a, b, n]

    tune_params = {
        "block_dim_x": list(range(128, 128 + 64 * 5, 64)),
        "block_size_y": [1],
        "block_size_z": [1],
    }

    # only the first argument (c) is compared against a reference
    expected_c = (a+b).astype(np.float32)
    answer = [expected_c, None, None, None]

    result, _ = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, int(n), args, tune_params,
        grid_div_x=["block_dim_x"], answer=answer,
        block_size_names=["block_dim_x"])

    assert len(result) == len(tune_params["block_dim_x"])
def tune():
    """Tune an inline CUDA ``vector_add`` kernel over ``block_size_x``.

    The value returned by ``tune_kernel`` is written to ``vector_add.json``
    and also returned.
    """
    kernel_string = (
        " __global__ void vector_add(float *c, float *a, float *b, int n) { "
        "int i = blockIdx.x * block_size_x + threadIdx.x; "
        "if (i<n) { c[i] = a[i] + b[i]; } } "
    )

    size = 10000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)
    args = [c, a, b, n]

    # 128, 192, ..., 1024
    tune_params = {"block_size_x": list(range(128, 128 + 64 * 15, 64))}

    result = tune_kernel("vector_add", kernel_string, size, args, tune_params)

    with open("vector_add.json", 'w') as fp:
        json.dump(result, fp)

    return result
def tune_minimum_degree():
    """Tune the ``minimum_degree`` kernel from ``minimum_degree.cu``.

    Input is a generated correlations table converted to sparse form. The
    ``minimum`` and ``num_nodes`` arrays hold ``max_blocks`` entries each.

    Returns:
        The value returned by ``tune_kernel``.
    """
    with open(get_kernel_path()+'minimum_degree.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    # tune params here
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5, 11)]
    tune_params["threshold"] = [3]

    # The grid is largest for the SMALLEST block size, so the arrays sized by
    # max_blocks must be dimensioned with min(), not max(); otherwise they are
    # too short for every configuration except the largest block size.
    # NOTE(review): assumes the kernel writes one entry per thread block - confirm.
    max_blocks = int(np.ceil(N / float(min(tune_params["block_size_x"]))))

    # generate input data with an expected density of correlated hits
    correlations, sums = generate_large_correlations_table(N, sliding_window_width)
    row_idx, col_idx, prefix_sums = create_sparse_matrix(correlations, sums)

    # setup all kernel inputs
    minimum = np.zeros(max_blocks).astype(np.int32)
    num_nodes = np.zeros(max_blocks).astype(np.int32)

    # call the CUDA kernel
    args = [minimum, num_nodes, sums, row_idx, col_idx, prefix_sums, N]

    return tune_kernel("minimum_degree", kernel_string, problem_size, args, tune_params, verbose=True)
def test_noodles_runner():
    """Tune ``vector_add`` with ``use_noodles=True`` and four threads.

    The test expects one result per ``block_size_x`` value.
    """
    kernel_string = (
        " __global__ void vector_add(float *c, float *a, float *b, int n) { "
        "int i = blockIdx.x * block_size_x + threadIdx.x; "
        "if (i<n) { c[i] = a[i] + b[i]; } } "
    )

    size = 100
    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(b)
    n = np.int32(size)
    args = [c, a, b, n]

    block_sizes = list(range(128, 128 + 64 * 15, 64))
    tune_params = {"block_size_x": block_sizes}

    result, _ = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, size, args, tune_params,
        use_noodles=True, num_threads=4)

    assert len(result) == len(tune_params["block_size_x"])
def tune():
    """Tune ``matmul_kernel`` from ``matmul.cl`` on two 4096x4096 matrices.

    ``numpy.dot(A, B)` is passed as the expected value of the first argument
    with ``atol=1e-3``.

    Returns:
        The value returned by ``kernel_tuner.tune_kernel``.
    """
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)

    A = numpy.random.randn(*problem_size).astype(numpy.float32)
    B = numpy.random.randn(*problem_size).astype(numpy.float32)
    C = numpy.zeros_like(A)

    tune_params = OrderedDict([
        ("block_size_x", [16*2**exp for exp in range(3)]),
        ("block_size_y", [2**exp for exp in range(6)]),
        ("tile_size_x", [2**exp for exp in range(4)]),
        ("tile_size_y", [2**exp for exp in range(4)]),
    ])

    # only configurations satisfying this expression are considered
    restrict = ["block_size_x==block_size_y*tile_size_y"]

    # reference result for C; the inputs A and B are not checked
    answer = [numpy.dot(A, B), None, None]

    return kernel_tuner.tune_kernel(
        "matmul_kernel", "matmul.cl", problem_size, [C, A, B], tune_params,
        grid_div_y=["block_size_y", "tile_size_y"],
        grid_div_x=["block_size_x", "tile_size_x"],
        restrictions=restrict, verbose=True, answer=answer, atol=1e-3)
def tune_quadratic_difference_kernel():
    """Tune ``quadratic_difference_linear`` on generated input data.

    Returns:
        The value returned by ``tune_kernel``.
    """
    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    # generate input data with an expected density of correlated hits
    x, y, z, ct = generate_input_data(N)

    # setup kernel arguments
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]

    # setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*mult for mult in range(1, 33)]  # multiples of 32
    tune_params["f_unroll"] = [d for d in range(1, 20) if 1500 % d == 0]  # divisors of 1500
    tune_params["tile_size_x"] = [2**exp for exp in range(5)]  # powers of 2
    tune_params["write_sums"] = [1]

    return tune_kernel("quadratic_difference_linear", kernel_string, problem_size, args,
                       tune_params, verbose=True)
def tune_dense2sparse():
    """Tune ``dense2sparse_kernel`` on a generated correlations table.

    Returns:
        The value returned by ``tune_kernel``.
    """
    with open(get_kernel_path()+'dense2sparse.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    # generate input
    correlations, sums = generate_large_correlations_table(N, sliding_window_width)

    # setup all kernel inputs; the index arrays get one entry per correlated hit
    prefix_sums = np.cumsum(sums).astype(np.int32)
    total_correlated_hits = np.sum(sums.sum())
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    col_idx = np.zeros(total_correlated_hits).astype(np.int32)

    # setup tuning parameters
    tune_params = OrderedDict([
        ("block_size_x", [32*mult for mult in range(1, 33)]),  # multiples of 32 up to 1024
        ("window_width", [sliding_window_width]),
        ("use_shared", [0, 1]),
        ("f_unroll", [d for d in range(1, 5) if 1500 % d == 0]),  # divisors of 1500 below 5
    ])

    # call the tuner
    args = [row_idx, col_idx, prefix_sums, correlations, N]
    return tune_kernel("dense2sparse_kernel", kernel_string, problem_size, args,
                       tune_params, verbose=True)
def test_strategies(env):
    """Run every strategy in ``strategy_map`` in simulation mode.

    Each run uses the module-level ``cache_filename`` and must return at
    least one result.
    """
    options = {"popsize": 5, "max_fevals": 15}

    for strategy in strategy_map:
        print(f"testing {strategy}")
        tuned = kernel_tuner.tune_kernel(
            *env, strategy=strategy, strategy_options=options, verbose=False,
            cache=cache_filename, simulation_mode=True)
        result, _ = tuned
        assert len(result) > 0
def test_genetic_algorithm(env):
    """Run the ``genetic_algorithm`` strategy in simulation mode.

    The run uses the module-level ``cache_filename`` and must return at
    least one result.
    """
    options = {
        "method": "uniform",
        "popsize": 10,
        "maxiter": 2,
        "mutation_change": 1,
    }
    result, _ = kernel_tuner.tune_kernel(
        *env, strategy="genetic_algorithm", strategy_options=options,
        verbose=True, cache=cache_filename, simulation_mode=True)
    assert len(result) > 0
def test_simulation_runner(env):
    """Tune in simulation mode against the cache file next to this test file.

    The test expects one result per ``block_size_x`` value in the last
    element of ``env``.
    """
    this_dir = os.path.dirname(os.path.realpath(__file__))
    cache_filename = this_dir + "/test_cache_file.json"

    result, _ = kernel_tuner.tune_kernel(
        *env, cache=cache_filename, simulation_mode=True, verbose=True)

    # the last element of env holds the tunable parameters
    tune_params = env[-1]
    assert len(result) == len(tune_params["block_size_x"])
def test_nvml_observer(env):
    """Check that an ``NVMLObserver`` adds its quantities to the results.

    ``block_size_x`` in ``env`` is reduced to the single value 128 before
    tuning. The first result must contain ``nvml_energy`` and a positive
    ``temperature``.
    """
    observer = NVMLObserver(["nvml_energy", "temperature"])

    # restrict the search to one configuration (mutates env in place)
    env[-1]["block_size_x"] = [128]

    result, _ = kernel_tuner.tune_kernel(*env, observers=[observer])

    first = result[0]
    assert "nvml_energy" in first
    assert "temperature" in first
    assert first["temperature"] > 0
def tune(nodes, edges, elements, max_levels, max_tile, real_type, quiet=True):
    """Tune the generated ``fct_ale_b3_horizontal`` CUDA kernel on random data.

    Args:
        nodes: number of nodes; sizes ``fct_plus``/``fct_minus`` and bounds
            the random node ids.
        edges: number of edges; sizes ``fct_adf_h`` and the per-edge arrays.
        elements: number of elements; sizes ``levels`` and bounds the random
            element ids.
        max_levels: number of levels allocated per edge/node.
        max_tile: exclusive upper bound of the ``tiling_x`` values.
        real_type: ``"float"`` or ``"double"``.
        quiet: forwarded to ``tune_kernel``.

    Returns:
        The results from ``tune_kernel``, each extended with a
        ``"memory_bandwidth"`` entry.

    Raises:
        ValueError: if ``real_type`` is neither ``"float"`` nor ``"double"``.
    """
    if real_type not in ("float", "double"):
        raise ValueError
    numpy_real_type = numpy.float32 if real_type == "float" else numpy.float64

    # Tuning and code generation parameters
    tuning_parameters = {
        "int_type": ["unsigned_int", "int"],
        "real_type": [real_type],
        "max_levels": [str(max_levels)],
        "block_size_x": [32 * mult for mult in range(1, 33)],
        "tiling_x": list(range(1, max_tile)),
    }
    constraints = ["block_size_x * tiling_x <= max_levels"]

    # Memory allocation and initialization
    fct_adf_h = numpy.random.randn(edges * max_levels).astype(numpy_real_type)
    fct_adf_h_control = numpy.copy(fct_adf_h)
    fct_plus = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)
    fct_minus = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)

    levels = numpy.zeros(elements).astype(numpy.int32)
    for idx in range(elements):
        levels[idx] = numpy.random.randint(3, max_levels)

    nodes_per_edge = numpy.zeros(edges * 2).astype(numpy.int32)
    elements_per_edge = numpy.zeros(edges * 2).astype(numpy.int32)
    for edge in range(edges):
        first, second = edge * 2, (edge * 2) + 1
        nodes_per_edge[first] = numpy.random.randint(1, nodes + 1)
        nodes_per_edge[second] = numpy.random.randint(1, nodes + 1)
        elements_per_edge[first] = numpy.random.randint(1, elements + 1)
        # unlike the first element id, the second one may also be 0
        elements_per_edge[second] = numpy.random.randint(0, elements + 1)

    arguments = [
        numpy.int32(max_levels), levels, nodes_per_edge, elements_per_edge,
        fct_adf_h, fct_plus, fct_minus,
    ]

    # Reference: works on the control copy of fct_adf_h and returns the byte count
    memory_bytes = reference(edges, nodes_per_edge, elements_per_edge, levels,
                             max_levels, fct_adf_h_control, fct_plus, fct_minus,
                             numpy_real_type)
    arguments_control = [None, None, None, None, fct_adf_h_control, None, None]

    # Tuning
    results, _ = tune_kernel("fct_ale_b3_horizontal", generate_code,
                             "{} * block_size_x".format(edges), arguments,
                             tuning_parameters, lang="CUDA", answer=arguments_control,
                             restrictions=constraints, quiet=quiet)

    # Memory bandwidth
    for entry in results:
        entry["memory_bandwidth"] = memory_bytes / (entry["time"] / 10**3)

    return results
def tune(nodes, max_levels, max_tile, real_type, quiet=True):
    """Tune the generated ``fct_ale_a1`` CUDA kernel on random data.

    Args:
        nodes: number of nodes; sizes ``levels`` and, times ``max_levels``,
            the field arrays.
        max_levels: number of levels allocated per node.
        max_tile: exclusive upper bound of the ``tiling_x`` values.
        real_type: ``"float"`` or ``"double"``.
        quiet: forwarded to ``tune_kernel``.

    Returns:
        The results from ``tune_kernel``, each extended with a
        ``"memory_bandwidth"`` entry.

    Raises:
        ValueError: if ``real_type`` is neither ``"float"`` nor ``"double"``.
    """
    numpy_real_type = None
    if real_type == "float":
        numpy_real_type = numpy.float32
    elif real_type == "double":
        numpy_real_type = numpy.float64
    else:
        raise ValueError

    # Tuning and code generation parameters
    tuning_parameters = dict()
    tuning_parameters["int_type"] = ["unsigned_int", "int"]
    tuning_parameters["real_type"] = [real_type]
    tuning_parameters["max_levels"] = [str(max_levels)]
    tuning_parameters["block_size_x"] = [32 * i for i in range(1, 33)]
    tuning_parameters["tiling_x"] = list(range(1, max_tile))
    constraints = ["block_size_x * tiling_x <= max_levels"]

    # Memory allocation and initialization
    fct_low_order = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)
    ttf = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)
    fct_ttf_max = numpy.zeros(nodes * max_levels).astype(numpy_real_type)
    fct_ttf_min = numpy.zeros_like(fct_ttf_max).astype(numpy_real_type)
    fct_ttf_max_control = numpy.zeros_like(fct_ttf_max).astype(numpy_real_type)
    fct_ttf_min_control = numpy.zeros_like(fct_ttf_min).astype(numpy_real_type)
    levels = numpy.zeros(nodes).astype(numpy.int32)
    used_levels = 0
    for node in range(nodes):
        levels[node] = numpy.random.randint(3, max_levels)
        # Accumulate as a Python int: adding the numpy.int32 element directly
        # can turn the running total (and the byte count derived from it)
        # into a fixed-width integer that overflows for large meshes.
        used_levels += int(levels[node]) - 1

    arguments = [
        numpy.int32(max_levels), fct_low_order, ttf, levels, fct_ttf_max, fct_ttf_min
    ]

    # Reference: fills the two control arrays
    reference(nodes, levels, max_levels, fct_low_order, ttf,
              fct_ttf_max_control, fct_ttf_min_control)
    arguments_control = [
        None, None, None, None, fct_ttf_max_control, fct_ttf_min_control
    ]

    # Tuning
    results, _ = tune_kernel("fct_ale_a1", generate_code,
                             "{} * block_size_x".format(nodes), arguments,
                             tuning_parameters, lang="CUDA", answer=arguments_control,
                             restrictions=constraints, quiet=quiet)

    # Memory bandwidth
    memory_bytes = ((nodes * 4) +
                    (used_levels * 4 * numpy.dtype(numpy_real_type).itemsize))
    for result in results:
        result["memory_bandwidth"] = memory_bytes / (result["time"] / 10**3)

    return results
def tune(number_of_frequencies):
    """Compare ``kernel_coherencies`` with and without the slave kernel.

    The kernel from ``predict_model.cu`` is tuned twice over
    ``block_size_x``, once with ``use_kernel=1`` and once with
    ``use_kernel=0``. Both runs are checked against the reference kernel
    output with ``atol=1e-2``.

    Args:
        number_of_frequencies: value used for ``F`` when generating input.

    Returns:
        The best time with the slave kernel divided by the best time without.
    """
    N = 61
    T = 20
    K = 150
    F = number_of_frequencies
    B = (N)*(N-1)//2 * T

    print('N', N, 'B', B, 'T', T, 'K', K, 'F', F)

    args = generate_input_data(B, N, T, K, F)
    problem_size = B

    tune_params = OrderedDict()
    tune_params['block_size_x'] = [2**exp for exp in range(5, 10)]

    print("First call the reference kernel")
    ref = call_reference_kernel(N, B, T, K, F, args)

    # only the second-to-last argument is compared against the reference
    answer = [None] * len(args)
    answer[-2] = ref[-2]

    tolerance = 1e-2
    verbosity = False

    def fastest_time(use_kernel):
        """Tune with the given use_kernel value and return the lowest time."""
        tune_params['use_kernel'] = [use_kernel]
        results, _ = tune_kernel("kernel_coherencies", get_kernel_path()+"predict_model.cu",
                                 problem_size, args, tune_params, compiler_options=cp,
                                 verbose=verbosity, answer=answer, atol=tolerance)
        return min(entry['time'] for entry in results)

    print("Next, we call the modified kernel, with (use_kernel = 1) and without (use_kernel = 0) the slave kernel")

    print("With slave kernel:")
    min_time_with_slave = fastest_time(1)

    print("Without slave kernel:")
    min_time_without_slave = fastest_time(0)

    return min_time_with_slave/min_time_without_slave
def tune_complex_and_flip(kernel_string, height, width, image, image2):
    """Step 1: convert to complex data structure and flip pattern.

    ``toComplexAndFlip2`` is run once with a fixed 32x16 configuration to
    produce the output, and then tuned over ``block_size_x`` and
    ``block_size_y``.

    Returns:
        tuple: the third and fourth entries of the ``run_kernel`` output,
        which correspond to the ``image_freq`` and ``image2_freq`` arguments.
    """
    problem_size = (width, height)

    # output buffers: two float32 values per pixel
    image_freq = np.zeros((height, width, 2), dtype=np.float32)
    image2_freq = np.zeros((height, width, 2), dtype=np.float32)
    args = [height, width, image_freq, image2_freq, image, image2]

    fixed_params = {"block_size_x": 32, "block_size_y": 16}
    output = run_kernel("toComplexAndFlip2", kernel_string, problem_size, args,
                        fixed_params, grid_div_y=["block_size_y"])

    tune_params = OrderedDict([
        ("block_size_x", [32*mult for mult in range(1, 33)]),
        ("block_size_y", [2**exp for exp in range(6)]),
    ])
    tune_kernel("toComplexAndFlip2", kernel_string, problem_size, args,
                tune_params, grid_div_y=["block_size_y"])

    return output[2], output[3]
def tune_pnpoly():
    """Tune the ``cn_pnpoly_host`` host code with correctness verification.

    A reference bitmap is computed with the ``cn_pnpoly_naive`` kernel and
    passed to the tuner as the expected value of the first argument.

    Side effects:
        Changes the working directory to ``get_kernel_path()``.

    Returns:
        tuple: ``(results, tune_params)``.
    """
    # change to dir with source files because of includes in pnpoly_host.cu
    os.chdir(get_kernel_path())

    with open('pnpoly_host.cu', 'r') as f:
        host_string = f.read()
    with open('pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    # as test input we use a circle with radius 1 as polygon and
    # a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]
    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    # zip() is a lazy iterator on Python 3; it must be materialised before
    # numpy can build a (vertices, 2) array from it
    vertex_xy = numpy.array(list(zip(vertex_x, vertex_y))).astype(numpy.float32)

    args = [bitmap, points, vertex_xy, size]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1, 32)]  # multiple of 32
    tune_params["tile_size"] = [2**i for i in range(6)]
    tune_params["f_unroll"] = [i for i in range(1, 20) if vertices % i == 0]  # divisors of vertices
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    # compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
                                     problem_size, [bitmap, points, size], params,
                                     cmem_args={"d_vertices": vertex_xy})
    # only the bitmap (first argument) is verified
    result = [result[0], None, None]

    # start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly_host", host_string,
                                       problem_size, args, tune_params,
                                       grid_div_x=grid_div_x, answer=result,
                                       lang="C", verbose=True)

    return results, tune_params
def tune(elements, nodes, max_levels, max_tile, real_type, quiet=True):
    """Tune the generated ``fct_ale_a2`` CUDA kernel on random data.

    Args:
        elements: number of elements; sizes ``levels``, ``element_nodes``
            and ``uv_rhs``.
        nodes: number of nodes; sizes ``fct_ttf_max``/``fct_ttf_min`` and
            bounds the random node ids.
        max_levels: number of levels allocated per element/node.
        max_tile: exclusive upper bound of the ``tiling_x`` values.
        real_type: ``"float"`` or ``"double"``.
        quiet: forwarded to ``tune_kernel``.

    Returns:
        The results from ``tune_kernel``, each extended with a
        ``"memory_bandwidth"`` entry.

    Raises:
        ValueError: if ``real_type`` is neither ``"float"`` nor ``"double"``.
    """
    if real_type not in ("float", "double"):
        raise ValueError
    numpy_real_type = numpy.float32 if real_type == "float" else numpy.float64

    # Tuning and code generation parameters
    tuning_parameters = {
        "int_type": ["unsigned_int", "int"],
        "real_type": [real_type],
        "max_levels": [str(max_levels)],
        "block_size_x": [32 * mult for mult in range(1, 33)],
        "tiling_x": list(range(1, max_tile)),
        "vector_size": [1, 2],
    }
    constraints = ["block_size_x * tiling_x <= max_levels"]

    # Memory allocation and initialization
    uv_rhs = numpy.zeros(elements * max_levels * 2).astype(numpy_real_type)
    uv_rhs_control = numpy.zeros_like(uv_rhs).astype(numpy_real_type)
    fct_ttf_max = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)
    fct_ttf_min = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)
    levels = numpy.zeros(elements).astype(numpy.int32)
    element_nodes = numpy.zeros(elements * 3).astype(numpy.int32)
    for element in range(elements):
        levels[element] = numpy.random.randint(3, max_levels)
        # three random node ids per element
        base = element * 3
        for corner in range(3):
            element_nodes[base + corner] = numpy.random.randint(1, nodes + 1)

    arguments = [
        numpy.int32(max_levels), levels, element_nodes, uv_rhs, fct_ttf_max, fct_ttf_min,
    ]

    # Reference: works on the control copy of uv_rhs and returns the byte count
    memory_bytes = reference(elements, levels, max_levels, element_nodes,
                             uv_rhs_control, fct_ttf_max, fct_ttf_min, real_type)
    arguments_control = [None, None, None, uv_rhs_control, None, None]

    # Tuning
    results, _ = tune_kernel("fct_ale_a2", generate_code,
                             "{} * block_size_x".format(elements), arguments,
                             tuning_parameters, lang="CUDA", answer=arguments_control,
                             restrictions=constraints, quiet=quiet)

    # Memory bandwidth
    for entry in results:
        entry["memory_bandwidth"] = memory_bytes / (entry["time"] / 10**3)

    return results
def test_random_sample(env):
    """Run the ``random_sample`` strategy with a 10% fraction in simulation mode."""
    sampled, _ = kernel_tuner.tune_kernel(
        *env, strategy="random_sample", strategy_options={"fraction": 0.1},
        cache=cache_filename, simulation_mode=True)

    # check that number of benchmarked kernels is 10% (rounded up)
    assert len(sampled) == 2

    # check all returned results make sense
    for entry in sampled:
        assert 0.0 < entry['time'] < 1.0
def test_custom_observer(env):
    """Check that a user-defined observer can add entries to the results.

    ``block_size_x`` in ``env`` is reduced to the single value 128 before
    tuning. The observer reports the device name under the key ``"name"``.
    """
    # restrict the search to one configuration (mutates env in place)
    env[-1]["block_size_x"] = [128]

    class MyObserver(BenchmarkObserver):
        """Observer that reports the name of the device."""

        def get_results(self):
            return {"name": self.dev.name}

    result, _ = kernel_tuner.tune_kernel(*env, observers=[MyObserver()])

    first = result[0]
    assert "name" in first
    assert len(first["name"]) > 0
def tune():
    """Tune ``cn_pnpoly_host`` using buffers obtained from ``allocate``.

    The host and kernel sources are passed as the file names
    ``pnpoly_host.cu`` and ``pnpoly.cu``.

    Returns:
        The value returned by ``kernel_tuner.tune_kernel``.
    """
    # set the number of points and the number of vertices
    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    # allocate device mapped host memory and generate input data
    points = allocate(2 * size, numpy.float32)
    numpy.copyto(points, numpy.random.randn(2 * size).astype(numpy.float32))

    bitmap = allocate(size, numpy.int32)
    numpy.copyto(bitmap, numpy.zeros(size).astype(numpy.int32))

    # as test input we use a circle with radius 1 as polygon and
    # a large set of normally distributed points around 0,0
    angles = numpy.sort(numpy.random.rand(vertices) * 2.0 * numpy.pi)[::-1]
    vertex_x = numpy.cos(angles)
    vertex_y = numpy.sin(angles)

    # interleave as x0, y0, x1, y1, ... in a flat float32 buffer
    interleaved = numpy.array(list(zip(vertex_x, vertex_y))).astype(numpy.float32).ravel()
    vertex_xy = allocate(2 * vertices, numpy.float32)
    numpy.copyto(vertex_xy, interleaved)

    # kernel arguments
    args = [bitmap, points, vertex_xy, size]

    # setup tunable parameters
    tune_params = OrderedDict([
        ("block_size_x", [32 * mult for mult in range(1, 32)]),  # multiple of 32
        ("tile_size", [1] + list(range(2, 22, 2))),
        ("between_method", [0, 1, 2, 3]),
        ("use_precomputed_slopes", [0, 1]),
        ("use_method", [0, 1]),
    ])

    # tell the Kernel Tuner how to compute the grid dimensions from the problem_size
    grid_div_x = ["block_size_x", "tile_size"]

    # start tuning
    return kernel_tuner.tune_kernel(
        "cn_pnpoly_host", ['pnpoly_host.cu', 'pnpoly.cu'], problem_size, args,
        tune_params, grid_div_x=grid_div_x, lang="C",
        compiler_options=["-arch=sm_52"], verbose=True, log=logging.DEBUG)
def tune():
    """Tune ``vector_add`` and attach two user-defined metrics to the results.

    The value returned by ``tune_kernel`` is written to ``vector_add.json``
    and also returned.
    """
    kernel_string = (
        " __global__ void vector_add(float *c, float *a, float *b, int n) { "
        "int i = blockIdx.x * block_size_x + threadIdx.x; "
        "if (i<n) { c[i] = a[i] + b[i]; } } "
    )

    size = 10000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)
    args = [c, a, b, n]

    tune_params = {"block_size_x": [32 * mult for mult in range(1, 33)]}

    # Metrics can be given as functions or as strings. They are kept in an
    # OrderedDict because earlier metrics can be composed into later ones.
    metrics = OrderedDict()

    # GFLOP/s as a function: it receives one dictionary holding all tunable
    # parameters and benchmark results, and is evaluated directly after the
    # benchmark results are obtained.
    metrics["GFLOP/s"] = lambda p: (size / 1e9) / (p["time"] / 1000)

    # GB/s as a string: names of kernel parameters and benchmark results can
    # be used directly; the tuner replaces them before evaluating the string.
    metrics["GB/s"] = f"({size}*4*2/1e9) / (time/1000)"

    result = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                         metrics=metrics)

    with open("vector_add.json", 'w') as fp:
        json.dump(result, fp)

    return result
def tune():
    """Tune ``spmv_kernel`` on a random CSR matrix with verification.

    The expected output vector is computed with ``scipy.sparse`` and passed
    to the tuner as the expected value of the first argument
    (``atol=1e-4``). The time taken by the reference computation is printed.

    Returns:
        The value returned by ``kernel_tuner.tune_kernel``.
    """
    with open('spmv.cu', 'r') as f:
        kernel_string = f.read()

    # Do the size arithmetic with plain Python ints: multiplying a
    # numpy.int32 row count by ncols can overflow 32 bits and corrupt nnz.
    num_rows = 128 * 1024
    ncols = 64 * 1024
    nnz = int(num_rows * ncols * 0.001)
    nrows = numpy.int32(num_rows)
    problem_size = nrows

    # generate sparse matrix in CSR: sorted random row offsets from 0 to nnz
    rows = numpy.asarray([0] + sorted(numpy.random.rand(num_rows - 1) * nnz) +
                         [nnz]).astype(numpy.int32)
    cols = (numpy.random.rand(nnz) * ncols).astype(numpy.int32)
    vals = numpy.random.randn(nnz).astype(numpy.float32)

    # input and output vector (y = matrix * x)
    x = numpy.random.randn(ncols).astype(numpy.float32)
    y = numpy.zeros(nrows).astype(numpy.float32)

    args = [y, rows, cols, vals, x, nrows]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]
    tune_params["threads_per_row"] = [1, 32]
    tune_params["read_only"] = [0, 1]

    grid_div_x = ["block_size_x/threads_per_row"]

    # compute reference answer using scipy.sparse; row_ind repeats each row
    # number once per stored value in that row
    row_ind = list(
        chain.from_iterable([[i] * (rows[i + 1] - rows[i]) for i in range(num_rows)]))
    matrix = csr_matrix((vals, (row_ind, cols)), shape=(num_rows, ncols))

    # time.clock() no longer exists (removed in Python 3.8); perf_counter()
    # returns seconds, so scale the difference to milliseconds
    start = time.perf_counter()
    expected_y = matrix.dot(x)
    end = time.perf_counter()
    elapsed_ms = (end - start) * 1000.0
    print("computing reference using scipy.sparse took: " + str(elapsed_ms) + " ms.")

    answer = [expected_y, None, None, None, None, None]

    return kernel_tuner.tune_kernel("spmv_kernel", kernel_string, problem_size, args,
                                    tune_params, grid_div_x=grid_div_x, verbose=True,
                                    answer=answer, atol=1e-4)
def tune_horizontal(kernel_string, image, height, width):
    """Tune ``computeMeanHorizontally`` over ``block_size_x`` and ``block_size_y``.

    Returns:
        The value returned by ``tune_kernel``.
    """
    # use only one column of thread blocks
    problem_size = (1, height)

    tune_params = OrderedDict([
        ("block_size_x", [32*mult for mult in range(1, 9)]),
        ("block_size_y", [2**exp for exp in range(6)]),
    ])

    return tune_kernel("computeMeanHorizontally", kernel_string, problem_size,
                       [height, width, image], tune_params,
                       grid_div_y=["block_size_y"], grid_div_x=[])
def test_bayesian_optimization(env):
    """Run the ``bayes_opt`` strategy once per acquisition method in simulation mode.

    Each run uses the module-level ``cache_filename`` and must return at
    least one result.
    """
    methods = (
        "poi", "ei", "lcb", "lcb-srinivas",
        "multi", "multi-advanced", "multi-fast",
    )
    for method in methods:
        print(method, flush=True)
        options = {"popsize": 5, "max_fevals": 10, "method": method}
        result, _ = kernel_tuner.tune_kernel(
            *env, strategy="bayes_opt", strategy_options=options, verbose=True,
            cache=cache_filename, simulation_mode=True)
        assert len(result) > 0
def test_sequential_runner_not_matching_answer1():
    """Expect a ``TypeError`` when ``answer`` is shorter than the argument list.

    ``answer`` holds a single entry while the kernel takes the arguments
    returned by ``get_vector_add_args()``. The error message must equal the
    expected text exactly.
    """
    kernel_string = (
        "__global__ void vector_add(float *c, float *a, float *b, int n) { "
        "int i = blockIdx.x * block_size_x + threadIdx.x; "
        "if (i<n) { c[i] = a[i] + b[i]; } } "
    )

    args = get_vector_add_args()
    answer = [args[1] + args[2]]
    tune_params = {"block_size_x": [128 + 64 * i for i in range(5)]}

    try:
        kernel_tuner.tune_kernel(
            "vector_add", kernel_string, args[-1], args, tune_params,
            method="diff_evo", verbose=True, answer=answer)
    except TypeError as expected_error:
        print(str(expected_error))
        assert "The length of argument list and provided results do not match." == str(expected_error)
    except Exception as unexpected_error:
        # any other exception type fails the test; keep the original as cause
        print("Expected a TypeError to be raised")
        raise AssertionError("Expected a TypeError to be raised") from unexpected_error
    else:
        # The failure is raised outside the try body so that it is not
        # caught (and reported a second time) by the handlers above.
        print("Expected a TypeError to be raised")
        raise AssertionError("Expected a TypeError to be raised")
def tune_vertical(kernel_string, image, height, width):
    """Tune the ``computeMeanVertically`` kernel.

    The problem size is ``(width, 1)`` and only the x-dimension is divided
    by ``block_size_x``, so a single row of thread blocks is created.

    Returns:
        The value returned by ``tune_kernel``.
    """
    search_space = OrderedDict()
    search_space["block_size_x"] = list(range(32, 257, 32))
    search_space["block_size_y"] = [1, 2, 4, 8, 16, 32]

    kernel_args = [height, width, image]

    # only one row of thread blocks is to be created
    single_row = (width, 1)

    return tune_kernel(
        "computeMeanVertically",
        kernel_string,
        single_row,
        kernel_args,
        search_space,
        grid_div_y=[],
        grid_div_x=["block_size_x"],
    )
def test_sequential_runner_not_matching_answer2():
    """Check that an ``answer`` whose first element is a ``ubyte`` array is rejected.

    ``tune_kernel`` is expected to raise a ``TypeError`` whose message
    mentions ``"Element 0"``.
    """
    kernel_string = """__global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """
    args = get_vector_add_args()

    bad_answer = [np.ubyte([12]), None, None, None]
    tune_params = {"block_size_x": list(range(128, 385, 64))}

    try:
        kernel_tuner.tune_kernel(
            "vector_add",
            kernel_string,
            args[-1],
            args,
            tune_params,
            method="diff_evo",
            verbose=True,
            answer=bad_answer,
        )
        print("Expected a TypeError to be raised")
        assert False
    except TypeError as err:
        message = str(err)
        print(message)
        assert "Element 0" in message
    except Exception:
        print("Expected a TypeError to be raised")
        assert False
def tune():
    """Tune ``convolution_kernel`` from ``convolution.cu`` with output verification.

    The reference output is produced by running ``convolution_naive`` from the
    same source with a fixed 16x16 block; it is passed as ``answer`` so each
    tuned configuration is compared against it.

    Returns:
        The value returned by ``kernel_tuner.tune_kernel``.
    """
    with open('convolution.cu', 'r') as source_file:
        kernel_string = source_file.read()

    filter_size = (17, 17)
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)

    # the input is larger than the output by the border in each dimension
    border_size = (filter_size[0] // 2 * 2, filter_size[1] // 2 * 2)
    padded_width = problem_size[0] + border_size[0]
    padded_height = problem_size[1] + border_size[1]
    input_size = padded_width * padded_height

    output = numpy.zeros(size).astype(numpy.float32)
    input_image = numpy.random.randn(input_size).astype(numpy.float32)
    filter_weights = numpy.random.randn(filter_size[0] * filter_size[1]).astype(numpy.float32)

    cmem_args = {'d_filter': filter_weights}
    args = [output, input_image, filter_weights]

    tune_params = OrderedDict([
        ("filter_width", [filter_size[0]]),
        ("filter_height", [filter_size[1]]),
        ("block_size_x", [16 * i for i in range(1, 9)]),
        ("block_size_y", [2**i for i in range(1, 6)]),
        ("tile_size_x", [2**i for i in range(3)]),
        ("tile_size_y", [2**i for i in range(3)]),
        # toggle the insertion of padding in shared memory
        ("use_padding", [0, 1]),
        # toggle using the read-only cache
        ("read_only", [0, 1]),
    ])

    # compute the answer using a naive kernel
    naive_params = {"block_size_x": 16, "block_size_y": 16}
    results = kernel_tuner.run_kernel(
        "convolution_naive",
        kernel_string,
        problem_size,
        args,
        naive_params,
        grid_div_y=["block_size_y"],
        grid_div_x=["block_size_x"],
    )

    # set non-output fields to None
    answer = [results[0], None, None]

    # start kernel tuning with correctness verification
    return kernel_tuner.tune_kernel(
        "convolution_kernel",
        kernel_string,
        problem_size,
        args,
        tune_params,
        grid_div_y=["block_size_y", "tile_size_y"],
        grid_div_x=["block_size_x", "tile_size_x"],
        verbose=True,
        cmem_args=cmem_args,
        answer=answer,
    )
def tune():
    """Tune ``copy_texture_kernel`` using a texture-memory input.

    The 1024x1024 float32 input (all 128) is passed through ``texmem_args``
    under the name ``tex`` with address mode ``border``; the image dimensions
    are supplied to the kernel as single-valued tunable parameters.

    Returns:
        The value returned by ``kernel_tuner.tune_kernel``.
    """
    problem_size = (1024, 1024)
    width, height = problem_size

    texture_input = 128 * numpy.ones(problem_size, dtype=numpy.float32)
    out = numpy.zeros(problem_size, dtype=numpy.uint8)

    tune_params = OrderedDict([
        ("block_size_x", [16, 32]),
        ("block_size_y", [16, 32]),
        ("oldiw", [width]),
        ("oldih", [height]),
        ("newiw", [width]),
        ("newih", [height]),
    ])

    args = [numpy.float32(0.5), numpy.float32(20), out]
    texmem_args = {'tex': {'array': texture_input, 'address_mode': 'border'}}

    return kernel_tuner.tune_kernel(
        "copy_texture_kernel",
        _rotation_kernel_source,
        problem_size,
        args,
        tune_params,
        texmem_args=texmem_args,
    )
def test_diff_evo():
    """Tune a vector-add kernel with ``method="diff_evo"`` and expect results."""
    kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """
    args = get_vector_add_args()
    block_sizes = list(range(128, 385, 64))

    result, _ = kernel_tuner.tune_kernel(
        "vector_add",
        kernel_string,
        args[-1],
        args,
        {"block_size_x": block_sizes},
        method="diff_evo",
        verbose=True,
    )

    print(result)
    assert len(result) > 0
def test_random_sample():
    """Tune with ``sample_fraction=0.1`` over 24 configurations and expect 3 results."""
    kernel_string = "float test_kernel(float *a) { return 1.0f; }"
    data = np.arange(4, dtype=np.float32)
    tune_params = {"block_size_x": range(1, 25)}
    print(tune_params)

    result, _ = kernel_tuner.tune_kernel(
        "test_kernel",
        kernel_string,
        (1, 1),
        [data],
        tune_params,
        sample_fraction=0.1,
    )
    print(result)

    # check that number of benchmarked kernels is 10% (rounded up)
    assert len(result) == 3

    # check all returned results make sense
    for record in result:
        assert record['time'] == 1.0
def test_random_sample():
    """Tune with the ``random_sample`` strategy at fraction 0.1 and expect 3 results."""
    kernel_string = "float test_kernel(float *a) { return 1.0f; }"
    data = np.arange(4, dtype=np.float32)
    tune_params = {"block_size_x": range(1, 25)}
    print(tune_params)

    result, _ = kernel_tuner.tune_kernel(
        "test_kernel",
        kernel_string,
        (1, 1),
        [data],
        tune_params,
        strategy="random_sample",
        strategy_options={"fraction": 0.1},
    )
    print(result)

    # check that number of benchmarked kernels is 10% (rounded up)
    assert len(result) == 3

    # check all returned results make sense
    for record in result:
        assert record['time'] == 1.0
def tune():
    """Tune ``time_vector_add`` from ``vector_add_acc.F90`` compiled with pgfortran.

    Returns:
        The first element of the pair returned by ``tune_kernel``.
    """
    size = 72 * 1024 * 1024

    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(b)
    n = np.int32(size)

    tune_params = {
        "N": [size],
        "block_size_x": [32, 64, 128, 256, 512],
    }

    result, _env = tune_kernel(
        "time_vector_add",
        "vector_add_acc.F90",
        size,
        [c, a, b, n],
        tune_params,
        lang="C",
        compiler="pgfortran",
        compiler_options=["-acc=verystrict", "-ta=tesla,lineinfo"],
    )
    return result
def tune_transpose(kernel_string, image, height, width):
    """Tune the ``transpose`` kernel.

    Allocates a zeroed float32 output of shape ``(width, height)`` and tunes
    over ``block_size_x`` and ``block_size_y``, each dividing its own grid
    dimension.

    Returns:
        The value returned by ``tune_kernel``.
    """
    transposed = np.zeros((width, height), dtype=np.float32)
    kernel_args = [height, width, transposed, image]

    search_space = OrderedDict()
    search_space["block_size_x"] = list(range(32, 257, 32))
    search_space["block_size_y"] = [1, 2, 4, 8, 16, 32]

    # tune the transpose kernel
    return tune_kernel(
        "transpose",
        kernel_string,
        (width, height),
        kernel_args,
        search_space,
        grid_div_y=["block_size_y"],
        grid_div_x=["block_size_x"],
    )
def tune():
    """Tune a vector-add kernel while observing energy and temperature via NVML.

    A user-defined metric named ``GFLOPS/W`` is derived from the
    ``nvml_energy`` value of each configuration. Results are written to
    ``vector_add.json`` and returned.
    """
    kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """

    size = 80000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    tune_params = {"block_size_x": list(range(128, 1088, 64))}

    nvmlobserver = NVMLObserver(["nvml_energy", "temperature"])

    metrics = OrderedDict()
    metrics["GFLOPS/W"] = lambda config: (size / 1e9) / config["nvml_energy"]

    results, _env = tune_kernel(
        "vector_add",
        kernel_string,
        size,
        [c, a, b, n],
        tune_params,
        observers=[nvmlobserver],
        metrics=metrics,
        iterations=32,
    )

    with open("vector_add.json", 'w') as out_file:
        json.dump(results, out_file)

    return results
def tune():
    """Tune the ``spmv_kernel`` in ``spmv.cu`` on a random CSR matrix.

    A random sparse matrix is generated in CSR form (row offsets, column
    indices, values), a reference product is computed with ``scipy.sparse``
    and passed as ``answer`` so every benchmarked configuration is checked
    against it with ``atol=1e-4``.

    Returns:
        The value returned by ``kernel_tuner.tune_kernel``.
    """
    with open('spmv.cu', 'r') as f:
        kernel_string = f.read()

    nrows = numpy.int32(128*1024)
    ncols = 64*1024
    # Multiply as plain Python ints: nrows*ncols is 2**33, which does not
    # fit in int32 and overflows when NumPy keeps the product as int32
    # (NumPy 2 scalar promotion).
    nnz = int(int(nrows)*ncols*0.001)

    problem_size = nrows

    # generate sparse matrix in CSR
    rows = numpy.asarray([0]+sorted(numpy.random.rand(nrows-1)*nnz)+[nnz]).astype(numpy.int32)
    cols = (numpy.random.rand(nnz)*ncols).astype(numpy.int32)
    vals = numpy.random.randn(nnz).astype(numpy.float32)

    # input and output vector (y = matrix * x)
    x = numpy.random.randn(ncols).astype(numpy.float32)
    y = numpy.zeros(nrows).astype(numpy.float32)

    args = [y, rows, cols, vals, x, nrows]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1, 33)]
    tune_params["threads_per_row"] = [1, 32]
    tune_params["read_only"] = [0, 1]

    grid_div_x = ["block_size_x/threads_per_row"]

    # compute reference answer using scipy.sparse; row_ind expands the CSR
    # row offsets into one row index per stored value
    row_ind = list(chain.from_iterable([[i] * (rows[i+1]-rows[i]) for i in range(nrows)]))
    matrix = csr_matrix((vals, (row_ind, cols)), shape=(nrows, ncols))

    # time.clock() no longer exists (removed in Python 3.8); perf_counter()
    # returns seconds, so the difference is scaled by 1000 to report ms.
    start = time.perf_counter()
    expected_y = matrix.dot(x)
    end = time.perf_counter()
    print("computing reference using scipy.sparse took: " + str((end-start) * 1000.0) + " ms.")

    # only the first argument (y) is an output to verify
    answer = [expected_y, None, None, None, None, None]

    return kernel_tuner.tune_kernel("spmv_kernel", kernel_string, problem_size, args, tune_params,
                                    grid_div_x=grid_div_x, verbose=True, answer=answer, atol=1e-4)
def test_noodles_runner():
    """Tune a vector-add kernel through the Noodles runner with 4 threads.

    Skipped when no CUDA device is available, when Python is older than 3.5,
    or when the ``noodles`` package cannot be found. Expects one result per
    ``block_size_x`` value.
    """
    skip_if_no_cuda_device()

    if sys.version_info[:2] < (3, 5):
        raise SkipTest("Noodles runner test requires Python 3.5 or newer")

    import importlib.util
    if importlib.util.find_spec("noodles") is None:
        raise SkipTest("Noodles runner test requires Noodles")

    kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """

    size = 100
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    block_sizes = list(range(128, 1088, 64))
    tune_params = {"block_size_x": block_sizes}

    result, _ = kernel_tuner.tune_kernel(
        "vector_add",
        kernel_string,
        size,
        [c, a, b, n],
        tune_params,
        use_noodles=True,
        num_threads=4,
    )

    assert len(result) == len(tune_params["block_size_x"])
def tune():
    """Tune ``vector_add`` with ``lang="OpenCL"``, passing ``my_fancy_generator`` as the kernel source.

    Whatever ``tune_kernel`` returns is written to ``vector_add.json`` and
    returned.
    """
    size = 10000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    tune_params = {"block_size_x": list(range(128, 1088, 64))}

    result = tune_kernel(
        "vector_add",
        my_fancy_generator,
        size,
        [c, a, b, n],
        tune_params,
        lang="OpenCL",
    )

    with open("vector_add.json", 'w') as out_file:
        json.dump(result, out_file)

    return result
def test_random_sample():
    """Tune with ``sample_fraction=0.1`` over 24 configurations and expect 3 results."""
    kernel_string = "float test_kernel(float *a) { return 1.0f; }"
    data = numpy.array([1, 2, 3]).astype(numpy.float32)
    tune_params = {"block_size_x": range(1, 25)}
    print(tune_params)

    result, _ = kernel_tuner.tune_kernel(
        "test_kernel",
        kernel_string,
        (1, 1),
        [data],
        tune_params,
        sample_fraction=0.1,
    )
    print(result)

    # check that number of benchmarked kernels is 10% (rounded up)
    assert len(result) == 3

    # check all returned results make sense
    for record in result:
        assert record['time'] == 1.0
def tune():
    """Tune ``matmul_kernel`` from ``matmul.cu`` on 4096x4096 float32 matrices.

    Adds a ``GFLOP/s`` metric derived from each configuration's ``time`` and
    writes the results to ``matmul.json``. Nothing is returned.
    """
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)

    A = numpy.random.randn(*problem_size).astype(numpy.float32)
    B = numpy.random.randn(*problem_size).astype(numpy.float32)
    C = numpy.zeros_like(A)

    tune_params = OrderedDict([
        ("block_size_x", [16, 32, 64]),
        ("block_size_y", [1, 2, 4, 8, 16, 32]),
        ("tile_size_x", [1, 2, 4, 8]),
        ("tile_size_y", [1, 2, 4, 8]),
    ])

    restrict = ["block_size_x==block_size_y*tile_size_y"]

    # NOTE(review): this reference product is computed but never passed to
    # tune_kernel (no answer= argument below) -- confirm whether that is intended.
    answer = [numpy.dot(A, B), None, None]

    total_gflop = 2 * 4096**3 / 1e9
    metrics = OrderedDict()
    metrics["GFLOP/s"] = lambda p: total_gflop / (p["time"] / 1e3)

    res, _env = kernel_tuner.tune_kernel(
        "matmul_kernel",
        "matmul.cu",
        problem_size,
        [C, A, B],
        tune_params,
        grid_div_y=["block_size_y", "tile_size_y"],
        grid_div_x=["block_size_x", "tile_size_x"],
        restrictions=restrict,
        verbose=True,
        iterations=32,
        metrics=metrics,
    )

    with open("matmul.json", 'w') as out_file:
        json.dump(res, out_file)
def tune():
    """Tune ``stencil_kernel`` from ``stencil.cl`` on a 4096x2048 problem.

    Returns:
        The value returned by ``kernel_tuner.tune_kernel``.
    """
    with open('stencil.cl', 'r') as source_file:
        kernel_string = source_file.read()

    problem_size = (4096, 2048)
    size = numpy.prod(problem_size)

    x_old = numpy.random.randn(size).astype(numpy.float32)
    x_new = numpy.copy(x_old)

    tune_params = OrderedDict([
        ("block_size_x", list(range(32, 257, 32))),
        ("block_size_y", [1, 2, 4, 8, 16, 32]),
    ])

    return kernel_tuner.tune_kernel(
        "stencil_kernel",
        kernel_string,
        problem_size,
        [x_new, x_old],
        tune_params,
        grid_div_x=["block_size_x"],
        grid_div_y=["block_size_y"],
        verbose=True,
    )
def tune():
    """Tune a vector-add kernel with ``lang="Cupy"`` using ``cp`` arrays as arguments.

    The sum ``a + b`` is passed as the expected output for the first
    argument. Whatever ``tune_kernel`` returns is written to
    ``vector_add.json`` and returned.
    """
    kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """

    size = 10000000

    a = cp.random.randn(size).astype(cp.float32)
    b = cp.random.randn(size).astype(cp.float32)
    c = cp.zeros_like(b)
    n = numpy.int32(size)

    tune_params = {"block_size_x": list(range(128, 1088, 64))}

    # only the first argument (c) is verified
    expected = [a + b, None, None, None]

    result = tune_kernel(
        "vector_add",
        kernel_string,
        size,
        [c, a, b, n],
        tune_params,
        answer=expected,
        verbose=True,
        lang="Cupy",
    )

    with open("vector_add.json", 'w') as out_file:
        json.dump(result, out_file)

    return result
def tune_prefix_sum_kernel():
    """Tune the ``prefix_sum_block`` kernel from ``prefixsum.cu``.

    The ``block_carry`` buffer is sized as ``ceil(N / max(block_size_x))``,
    as computed below from the tunable block sizes.

    Returns:
        The value returned by ``tune_kernel``.
    """
    with open(get_kernel_path()+'prefixsum.cu', 'r') as source_file:
        kernel_string = source_file.read()

    N = np.int32(4.5e6)

    # setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = list(range(32, 1025, 32))

    largest_block = max(tune_params["block_size_x"])
    max_blocks = np.ceil(N/float(largest_block)).astype(np.int32)

    x = np.ones(N).astype(np.int32)

    # setup kernel arguments
    prefix_sums = np.zeros(N).astype(np.int32)
    block_carry = np.zeros(max_blocks).astype(np.int32)

    # tune only the first kernel that computes the thread block-wide prefix
    # sums and outputs the block carry values
    return tune_kernel(
        "prefix_sum_block",
        kernel_string,
        (N, 1),
        [prefix_sums, block_carry, x, N],
        tune_params,
        verbose=True,
    )
def test_diff_evo():
    """Tune a vector-add kernel with ``method="diff_evo"`` and expect results."""
    kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """
    args = get_vector_add_args()
    block_sizes = list(range(128, 385, 64))

    result, _ = kernel_tuner.tune_kernel(
        "vector_add",
        kernel_string,
        args[-1],
        args,
        {"block_size_x": block_sizes},
        method="diff_evo",
        verbose=True,
    )

    print(result)
    assert len(result) > 0
def tune():
    """Tune ``convolution_kernel`` from ``convolution.cl`` with output verification.

    The reference output is produced by running ``convolution_naive`` from the
    same source with a fixed 16x16 block; it is passed as ``answer`` so each
    tuned configuration is compared against it.

    Returns:
        The value returned by ``kernel_tuner.tune_kernel``.
    """
    with open('convolution.cl', 'r') as source_file:
        kernel_string = source_file.read()

    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)

    # the input carries 16 extra elements per dimension for the 17x17 filter
    padded_width = problem_size[0] + 16
    padded_height = problem_size[1] + 16
    input_size = padded_width * padded_height

    output = numpy.zeros(size).astype(numpy.float32)
    input_image = numpy.random.randn(input_size).astype(numpy.float32)
    filter_weights = numpy.random.randn(17*17).astype(numpy.float32)

    args = [output, input_image, filter_weights]

    tune_params = OrderedDict([
        ("block_size_x", list(range(16, 129, 16))),
        ("block_size_y", [1, 2, 4, 8, 16, 32]),
        ("tile_size_x", [1, 2, 4]),
        ("tile_size_y", [1, 2, 4]),
    ])

    # compute the answer using a naive kernel
    naive_params = {"block_size_x": 16, "block_size_y": 16}
    results = kernel_tuner.run_kernel(
        "convolution_naive",
        kernel_string,
        problem_size,
        args,
        naive_params,
        grid_div_y=["block_size_y"],
        grid_div_x=["block_size_x"],
    )

    # set non-output fields to None
    answer = [results[0], None, None]

    # start kernel tuning with correctness verification
    return kernel_tuner.tune_kernel(
        "convolution_kernel",
        kernel_string,
        problem_size,
        args,
        tune_params,
        grid_div_y=["block_size_y", "tile_size_y"],
        grid_div_x=["block_size_x", "tile_size_x"],
        verbose=True,
        answer=answer,
    )
def tune(number_of_sources):
    """Tune ``kernel_tuner_host_array_beam`` from ``predict_model.cu``.

    Input data and a reference result are produced by ``generate_input_data``
    and ``call_reference_kernel``; the reference is passed as ``answer`` with
    ``atol=1e-4``.

    Args:
        number_of_sources: Value used for ``K`` in the problem dimensions.

    Returns:
        The first element of the pair returned by ``tune_kernel``.
    """
    N, T, F = 61, 20, 1
    K = number_of_sources

    print('N', N, 'T', T, 'K', K, 'F', F)

    args = generate_input_data(N, T, K, F)
    problem_size = (T * K * F, N)

    ref = call_reference_kernel(N, T, K, F, args, cp)

    tune_params = OrderedDict([
        ("block_size_x", [32, 64, 128, 256, 512, 1024]),
        ("use_kernel", [0]),
        ("use_shared_mem", [0, 1]),
    ])

    kernel_sources = [get_kernel_path() + "predict_model.cu"]
    results, _env = tune_kernel(
        "kernel_tuner_host_array_beam",
        kernel_sources,
        problem_size,
        args,
        tune_params,
        lang="C",
        compiler_options=cp,
        verbose=True,
        answer=ref,
        atol=1e-4,
    )

    return results
def tune_pnpoly_kernel():
    """Tune the ``cn_pnpoly`` point-in-polygon kernel from ``pnpoly.cu``.

    The polygon is a 600-vertex unit circle and the points are normally
    distributed around the origin. Vertices and precomputed slopes are passed
    through ``cmem_args``. A reference bitmap is produced by running
    ``cn_pnpoly_naive`` and passed as ``answer``.

    Returns:
        A pair ``(results, tune_params)``: the value returned by
        ``kernel_tuner.tune_kernel`` and the tunable-parameter dictionary.
    """
    with open(get_kernel_path()+'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    # as test input we use a circle with radius 1 as polygon and
    # a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    # zip() is an iterator on Python 3; materialize it so numpy builds a
    # (vertices, 2) array instead of a 0-d object array that cannot be cast.
    vertex_xy = numpy.array(list(zip(vertex_x, vertex_y))).astype(numpy.float32)

    args = [bitmap, points, size]

    # (vk.x-vj.x) / (vk.y-vj.y), where vk is the vertex preceding vj;
    # index i-1 wraps around to the last vertex when i == 0
    slopes = numpy.zeros(vertices).astype(numpy.float32)
    for i in range(vertices):
        slopes[i] = (vertex_x[i-1] - vertex_x[i]) / (vertex_y[i-1] - vertex_y[i])

    cmem_args = {'d_vertices': vertex_xy, "d_slopes": slopes}

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(6, 10)]  # powers of two
    tune_params["tile_size"] = [2**i for i in range(6)]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    # compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
                                     problem_size, args, params, cmem_args=cmem_args)
    # only the first argument (bitmap) is verified
    result = [result[0], None, None]

    # start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly", kernel_string,
                                       problem_size, args, tune_params,
                                       grid_div_x=grid_div_x, cmem_args=cmem_args, answer=result)

    return results, tune_params
# Tune transpose_kernel from transpose.cu on a 4096 x 8192 float32 matrix.
with open('transpose.cu', 'r') as f:
    kernel_string = f.read()

width = 4096
height = 8192
problem_size = (width, height)
size = numpy.prod(problem_size)

A = numpy.random.randn(size).astype(numpy.float32)
AT = numpy.zeros_like(A)
args = [AT, A, width, height]

tune_params = {
    "block_size_x": [16, 32, 64],
    "block_size_y": [1, 2, 4, 8, 16, 32],
    "tile_size_x": [1, 2, 4, 8],
    "tile_size_y": [1, 2, 4, 8],
}

grid_div_x = ["block_size_x", "tile_size_x"]
grid_div_y = ["block_size_y", "tile_size_y"]

# only configurations satisfying this expression are benchmarked
restrict = ["block_size_x*tile_size_x==block_size_y*tile_size_y"]

kernel_tuner.tune_kernel(
    "transpose_kernel",
    kernel_string,
    problem_size,
    args,
    tune_params,
    grid_div_y=grid_div_y,
    grid_div_x=grid_div_x,
    restrictions=restrict,
    verbose=True,
)
#pragma omp parallel num_threads(nthreads) { int offset = omp_get_thread_num()*chunk; for (int i = offset; i<offset+chunk && i<n; i++) { c[i] = a[i] + b[i]; } } return (float)((omp_get_wtime() - start)*1e3); } """ size = 72*1024*1024 a = numpy.random.randn(size).astype(numpy.float32) b = numpy.random.randn(size).astype(numpy.float32) c = numpy.zeros_like(b) n = numpy.int32(size) args = [c, a, b, n] tune_params = OrderedDict() tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32] tune_params["vecsize"] = [1, 2, 4, 8, 16] answer = [a+b, None, None, None] tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=['-O3'])
import numpy
import kernel_tuner
from collections import OrderedDict

# Tune convolution_kernel from convolution.cl on a 4096 x 4096 problem.
with open('convolution.cl', 'r') as f:
    kernel_string = f.read()

problem_size = (4096, 4096)
size = numpy.prod(problem_size)
# The input carries 16 extra elements per dimension for the 17x17 filter.
# Each factor uses its own dimension so the size stays right if
# problem_size is ever made non-square.
input_size = (problem_size[0]+16) * (problem_size[1]+16)

output = numpy.zeros(size).astype(numpy.float32)
input = numpy.random.randn(input_size).astype(numpy.float32)
filter = numpy.random.randn(17*17).astype(numpy.float32)
args = [output, input, filter]

tune_params = OrderedDict()
tune_params["block_size_x"] = [16*i for i in range(1, 9)]
tune_params["block_size_y"] = [2**i for i in range(6)]
tune_params["tile_size_x"] = [2**i for i in range(3)]
tune_params["tile_size_y"] = [2**i for i in range(3)]

grid_div_x = ["block_size_x", "tile_size_x"]
grid_div_y = ["block_size_y", "tile_size_y"]

kernel_tuner.tune_kernel("convolution_kernel", kernel_string, problem_size, args, tune_params,
                         grid_div_y=grid_div_y, grid_div_x=grid_div_x, verbose=True)