import json
from collections import OrderedDict

import numpy
from kernel_tuner import tune_kernel

# get_kernel_path() is assumed to be provided by the surrounding project (e.g. a
# shared test utility); it should return the directory holding the kernel sources.


def tune_gausstransform():
    #setup test input
    size = numpy.int32(2000)
    ndim = numpy.int32(2)
    A = numpy.random.randn(size * ndim).astype(numpy.float64)
    B = numpy.random.randn(size * ndim).astype(numpy.float64)
    scale = numpy.float64(10.0)
    grad = numpy.zeros(size * ndim).astype(numpy.float64)
    cost = numpy.zeros(size).astype(numpy.float64)

    #time the reference function
    arguments = [cost, A, B, size, size, ndim, scale, grad]
    with open(get_kernel_path() + 'gausstransform_c.cpp', 'r') as f:
        kernel_string = f.read()
    tune_params = {"block_size_x": [1]}
    print("CPU timing")
    tune_kernel("time_GaussTransform", kernel_string, size, arguments,
                tune_params, lang="C",
                compiler_options=['-I' + get_kernel_path(), '-O3'])

    #tune the GPU function
    print("GPU timing")
    with open(get_kernel_path() + 'kernels.cu', 'r') as f:
        kernel_string = f.read()
    scale_sq = (scale * scale).astype(numpy.float64)
    arguments = [A, B, size, size, scale_sq, grad, cost]
    cp = ['-O3']
    tune_params = {"block_size_x": [32, 64, 128, 256, 512, 1024]}
    kernel1 = tune_kernel("GaussTransform", kernel_string, size, arguments,
                          tune_params, grid_div_x=[], compiler_options=cp)

    #tune the final reduction kernel
    arguments = [numpy.zeros(1).astype(numpy.float64), cost, size, size, size]
    kernel2 = tune_kernel("reduce_cross_term", kernel_string, 1, arguments,
                          tune_params, grid_div_x=[], compiler_options=cp)

    #combine the fastest configuration of both kernels
    best_config1 = min(kernel1[0], key=lambda x: x['time'])
    best_config2 = min(kernel2[0], key=lambda x: x['time'])
    print("best GPU configuration, total time=",
          best_config1['time'] + best_config2['time'])
    print(best_config1)
    print(best_config2)
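# Hedged sketch, not part of the original script: before tuning, a single
# GaussTransform configuration can be executed once with kernel_tuner.run_kernel
# to inspect the gradient and cost it produces. The block size 256 is an
# arbitrary example value, not a tuned result.
def check_gausstransform(kernel_string, size, arguments):
    from kernel_tuner import run_kernel
    params = {"block_size_x": 256}
    #run_kernel returns one numpy array per kernel argument, holding the
    #output state of that argument after the kernel has run
    return run_kernel("GaussTransform", kernel_string, size, arguments,
                      params, grid_div_x=[], compiler_options=['-O3'])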
def tune_expdist_column():
    #renamed from tune_expdist so it does not clash with the 2D variant below
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32]  #[2**i for i in range(5,10)]
    tune_params["block_size_y"] = [2**i for i in range(6)]
    tune_params["tile_size_x"] = [2**i for i in range(4)]
    tune_params["tile_size_y"] = [2**i for i in range(4)]
    tune_params["use_shared_mem"] = [1]  #[0, 1]

    #setup test input
    alloc_size = 22000
    size = numpy.int32(20000)
    max_blocks = numpy.int32(
        numpy.ceil(size / float(numpy.amin(tune_params["block_size_x"]))))
    ndim = numpy.int32(2)
    A = numpy.random.randn(alloc_size * ndim).astype(numpy.float64)
    B = A + 0.00001 * numpy.random.randn(alloc_size * ndim).astype(numpy.float64)
    scale_A = numpy.absolute(0.01 * numpy.random.randn(alloc_size).astype(numpy.float64))
    scale_B = numpy.absolute(0.01 * numpy.random.randn(alloc_size).astype(numpy.float64))
    cost = numpy.zeros((max_blocks)).astype(numpy.float64)

    #tune the GPU function
    with open(get_kernel_path() + 'kernels.cu', 'r') as f:
        kernel_string = f.read()
    arguments = [A, B, size, size, scale_A, scale_B, cost]
    cp = ['-O3']
    grid_div_x = ["block_size_x", "tile_size_x"]
    kernel1, env = tune_kernel("ExpDist_column", kernel_string, size, arguments,
                               tune_params, compiler_options=cp,
                               grid_div_x=grid_div_x)

    #store the tuning results, tagged with the device name
    devname = "".join(env["device_name"].split())  #device name without whitespace
    with open("expdist_column_" + devname + ".json", 'w') as fp:
        json.dump([kernel1, env], fp)

    best_config1 = min(kernel1, key=lambda x: x['time'])
    nblocks = numpy.int32(
        numpy.ceil(size / float(best_config1["block_size_x"] * best_config1["tile_size_x"])))

    #tune the final reduction kernel
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]
    arguments = [numpy.zeros(1).astype(numpy.float64), cost, size, size, nblocks]
    kernel2 = tune_kernel("reduce_cross_term", kernel_string, 1, arguments,
                          tune_params, grid_div_x=[], compiler_options=cp)

    best_config2 = min(kernel2[0], key=lambda x: x['time'])
    print("best GPU configuration, total time=",
          best_config1['time'] + best_config2['time'])
    print(best_config1)
    print(best_config2)
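# Hedged sketch: the json file written by tune_expdist_column stores the pair
# [results, env], matching the json.dump call above. It can be reloaded like
# this to inspect or re-rank the measured configurations without re-tuning.
def load_tuning_results(filename):
    with open(filename, 'r') as fp:
        results, env = json.load(fp)
    #each entry in results is a dict with the tunable parameters and 'time'
    return min(results, key=lambda x: x['time']), env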
def tune_expdist():
    device = 2
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32]  #[2**i for i in range(5,10)]
    tune_params["block_size_y"] = [2**i for i in range(6)]
    tune_params["tile_size_x"] = [2**i for i in range(4)]
    tune_params["tile_size_y"] = [2**i for i in range(4)]
    tune_params["use_shared_mem"] = [1]  #[0, 1]

    #setup test input
    alloc_size = 22000
    size = numpy.int32(20000)
    max_blocks = numpy.int32(
        numpy.ceil(size / float(numpy.amin(tune_params["block_size_x"]))) *
        numpy.ceil(size / float(numpy.amin(tune_params["block_size_y"]))))
    ndim = numpy.int32(2)
    A = numpy.random.randn(alloc_size * ndim).astype(numpy.float64)
    B = A + 0.00001 * numpy.random.randn(alloc_size * ndim).astype(numpy.float64)
    scale_A = numpy.absolute(0.01 * numpy.random.randn(alloc_size).astype(numpy.float64))
    scale_B = numpy.absolute(0.01 * numpy.random.randn(alloc_size).astype(numpy.float64))
    cost = numpy.zeros((max_blocks)).astype(numpy.float64)

    #time the reference function
    #arguments = [cost, A, B, size, size, ndim, scale_A, scale_B]
    #with open(get_kernel_path()+'expdist_c.cpp', 'r') as f:
    #    kernel_string = f.read()
    #print("CPU timing")
    #tune_kernel("time_expdist", kernel_string, size, arguments, {"block_size_x": [1]},
    #            lang="C", compiler_options=['-I'+get_kernel_path(), '-O3'], device=device)

    #tune the GPU function
    print("GPU timing")
    with open(get_kernel_path() + 'kernels.cu', 'r') as f:
        kernel_string = f.read()
    arguments = [A, B, size, size, scale_A, scale_B, cost]
    cp = ['-O3']
    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]
    kernel1 = tune_kernel("ExpDist", kernel_string, (size, size), arguments,
                          tune_params, compiler_options=cp,
                          grid_div_x=grid_div_x, grid_div_y=grid_div_y,
                          device=device)

    #store the tuning results
    with open("expdist.json", 'w') as fp:
        json.dump(kernel1, fp)

    best_config1 = min(kernel1[0], key=lambda x: x['time'])
    nblocks = numpy.int32(
        numpy.ceil(size / float(best_config1["block_size_x"] * best_config1["tile_size_x"])) *
        numpy.ceil(size / float(best_config1["block_size_y"] * best_config1["tile_size_y"])))

    #tune the final reduction kernel
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]
    arguments = [numpy.zeros(1).astype(numpy.float64), cost, size, size, nblocks]
    kernel2 = tune_kernel("reduce_cross_term", kernel_string, 1, arguments,
                          tune_params, grid_div_x=[], compiler_options=cp,
                          device=device)

    best_config2 = min(kernel2[0], key=lambda x: x['time'])
    print("best GPU configuration, total time=",
          best_config1['time'] + best_config2['time'])
    print(best_config1)
    print(best_config2)
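# Assumed entry point, not in the original: run the three tuning experiments in
# sequence when the script is executed directly.
if __name__ == "__main__":
    tune_gausstransform()
    tune_expdist_column()
    tune_expdist()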