def benchmark(datasrc, model, num_iterations=100):
    """Time data loading, preprocessing and repeated model evaluation.

    Prints the time taken to fetch one item from ``datasrc``, the time taken
    to convert it into the model's input array, and the mean wall-clock time
    per ``model.evaluate`` call. The CUDA profiler is switched on for the
    final timed iteration only.

    Args:
        datasrc: Object whose ``get_item()`` returns a ``(label, data)`` pair.
        model: Object exposing ``dtype``, ``normalize(data)`` and
            ``evaluate(tensor)``.
        num_iterations: Number of timed ``evaluate`` calls. Defaults to 100,
            the value that used to be hard-coded.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        # With zero iterations there is no result to print and no mean to
        # compute, so reject the request up front.
        raise ValueError("num_iterations must be at least 1")

    start = time.time()
    _label, data = datasrc.get_item()
    print("Data load time: %.2fms" % ((time.time() - start) * 1000.0))

    start = time.time()
    # Move axis 2 to the front and add a leading axis of length 1
    # (presumably HWC -> NCHW; confirm against the data source).
    data = np.ascontiguousarray(
        np.expand_dims(np.rollaxis(data, 2), 0)).astype(model.dtype)
    data = model.normalize(data)
    print("Data prep time: %.2fms" % ((time.time() - start) * 1000.0))

    input_tensor = GPUTensor(data)

    # Warm up with one untimed evaluation.
    model.evaluate(input_tensor)

    start = time.time()
    print("Timing %d iterations..." % num_iterations)
    profiling = False
    try:
        for i in range(num_iterations):
            if i == num_iterations - 1:
                # Only the last iteration is captured by the profiler.
                drv.start_profiler()
                profiling = True
            y = model.evaluate(input_tensor)
        print(y)
    finally:
        # Stop the profiler even if an evaluation raised after it started.
        if profiling:
            drv.stop_profiler()
    et = (time.time() - start) * 1000 / num_iterations
    print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))
def benchmark(datasrc, model, num_iterations=100):
    """Time data loading, preprocessing and repeated model evaluation.

    Prints the time taken to fetch one item from ``datasrc``, the time taken
    to convert it into the model's input array, and the mean wall-clock time
    per ``model.evaluate`` call. The CUDA profiler is switched on for the
    final timed iteration only.

    Args:
        datasrc: Object whose ``get_item()`` returns a ``(label, data)`` pair.
        model: Object exposing ``dtype``, ``normalize(data)`` and
            ``evaluate(tensor)``.
        num_iterations: Number of timed ``evaluate`` calls. Defaults to 100,
            the value that used to be hard-coded.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        # With zero iterations there is no result to print and no mean to
        # compute, so reject the request up front.
        raise ValueError("num_iterations must be at least 1")

    start = time.time()
    _label, data = datasrc.get_item()
    print("Data load time: %.2fms" % ((time.time() - start) * 1000.0))

    start = time.time()
    # Move axis 2 to the front and add a leading axis of length 1
    # (presumably HWC -> NCHW; confirm against the data source).
    data = np.ascontiguousarray(
        np.expand_dims(np.rollaxis(data, 2), 0)).astype(model.dtype)
    data = model.normalize(data)
    print("Data prep time: %.2fms" % ((time.time() - start) * 1000.0))

    input_tensor = GPUTensor(data)

    # Warm up with one untimed evaluation.
    model.evaluate(input_tensor)

    start = time.time()
    print("Timing %d iterations..." % num_iterations)
    profiling = False
    try:
        for i in range(num_iterations):
            if i == num_iterations - 1:
                # Only the last iteration is captured by the profiler.
                drv.start_profiler()
                profiling = True
            y = model.evaluate(input_tensor)
        print(y)
    finally:
        # Stop the profiler even if an evaluation raised after it started.
        if profiling:
            drv.stop_profiler()
    et = (time.time() - start) * 1000 / num_iterations
    print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))
def check_time_3dlut(grid_num=17, img_file='../Matrix/figure/src_img.png'):
    """Apply a 3D LUT on the CPU and on the GPU, profiling only the GPU run.

    Args:
        grid_num: Grid size forwarded to ``make_3dlut_data``.
        img_file: Path of the image handed to ``img_open_and_normalize``.

    Returns:
        Tuple ``(img_x86, img_gpu)`` holding the results of
        ``exec_3dlut_on_x86`` and ``exec_3dlut_on_gpu``.
    """
    # Profiler settings
    # ----------------------------------------
    config_file = "./data/config.txt"
    log_file = "./data/profile_out.csv"
    output_mode = cuda.profiler_output_mode.KEY_VALUE_PAIR
    cuda.initialize_profiler(config_file, log_file, output_mode)

    # Build the 3D LUT data
    # ----------------------------------------
    # Matrix passed as ``mtx`` to rgb2yuv_for_3dlut
    # (presumably BT.709 RGB -> YCbCr coefficients; confirm).
    matrix_param = np.array([[0.2126, 0.7152, 0.0722],
                             [-0.114572, -0.385428, 0.5],
                             [0.5, -0.454153, -0.045847]])
    kwargs = {'mtx': matrix_param}
    # Honour the caller's grid_num; it used to be overridden by a literal 17.
    lut = make_3dlut_data(grid_num=grid_num, func=rgb2yuv_for_3dlut, **kwargs)

    # Apply the 3D LUT
    # ----------------------------------------
    img = img_open_and_normalize(img_file)
    img_x86 = exec_3dlut_on_x86(img=img, lut=lut)
    cuda.start_profiler()
    try:
        img_gpu = exec_3dlut_on_gpu(img=img, lut=lut)
    finally:
        # Stop the profiler even if the GPU path raises.
        cuda.stop_profiler()

    return img_x86, img_gpu
def execute(self):
    """Run the queued calls and the GPU post-processing under the profiler.

    Calls every ``self.funcs[k](*self.args[k])`` pair in order, then launches
    ``gpu_transpose`` and ``gpu_get_sigma``, synchronizes the CUDA context,
    calls ``read_results()`` and prints the elapsed wall-clock time.
    """
    cuda.start_profiler()
    timestamp = time()
    try:
        for f, a in zip(self.funcs, self.args):
            f(*a)
        gpu_transpose.prepared_call(grid2, block2, A)
        gpu_get_sigma.prepared_call(grid3, block3, A)
        # Wait for the launched kernels before reading results and timing.
        cuda.Context.synchronize()
        read_results()
    finally:
        # Stop the profiler even if one of the steps above raises.
        cuda.stop_profiler()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Time for DM tranform, transpose, sigma calculate, get max sigma: %.3f s" % (time() - timestamp))
def frost_trap(kernel_func, args, ver):
    """Solve the FrostTrap on a GPU with CUDA.

    Creates the trap data and uploads it to the GPU, sets up the visualizer,
    optionally starts the NVIDIA profiler, calls the kernel function in a
    loop, and finally appends the run parameters and the elapsed time to the
    file named by ``args.result`` as JSON.

    Args:
        kernel_func: <simple|rb|dbuf> from kernel.cu
        args: Packed command-line options and arguments. Reads ``width``,
            ``height``, ``graphic``, ``profile``, ``omega``, ``iter`` and
            ``result``; ``secs`` is set on it before the dump.
        ver: Not used by this function.
    """
    verbose.info("create trap data and init visualizer")
    trap = gpuarray.to_gpu(create_trap(args.width, args.height))

    # delta - total change
    # epsilon - minimum convergence limit
    epsilon = np.float32(0.001 * args.width * args.height)
    delta = np.array(epsilon)

    # init visualizer
    vis = Visualizer(trap, do_vis=args.graphic)

    # start nvidia profiler
    if args.profile:
        cuda.start_profiler()

    try:
        # start
        verbose.info("start")
        t1 = timeit.default_timer()
        # NOTE(review): the loop runs while delta <= epsilon, which looks
        # inverted relative to "until the delta is smaller than a certain
        # limit" -- confirm against how the kernel updates delta.
        while delta <= epsilon:
            # experiment with different block and grid sizes
            kernel_func(trap, args.height, args.width, args.omega, epsilon,
                        args.iter, delta, block=(1, 300, 1), grid=(300, 1, 1))
            vis.update()
        t2 = timeit.default_timer()
        verbose.info("done")
        # stop
    finally:
        # stop nvidia profiler, also when the kernel loop raises
        if args.profile:
            cuda.stop_profiler()

    # dump result and parameters as json
    with open(args.result, "a+") as f:
        # The placeholder used to be logged literally (no formatting applied).
        verbose.info("dumping results to {}".format(args.result))
        args.secs = t2 - t1
        # NOTE(review): the fallback serializer eval()s str(x) for values json
        # cannot encode; only safe while every value in args is trusted.
        json.dump(vars(args), f, default=lambda x: eval(str(x)), indent=2)
def simulate_gpu():
    """Run the ``simulate`` kernel over freshly created particles.

    Allocates device memory for ``NUM_PARTICLES`` structs, builds one
    ``ParticleStruct`` per slot from two random 3-element float32 vectors and
    the slot's device address, prints the particles, launches the kernel for
    ``NUM_ITERATIONS`` and prints the particles again. The whole run is
    bracketed by the CUDA profiler.
    """
    driver.start_profiler()

    struct_size = ParticleStruct.mem_size
    particle_pointer = cuda.mem_alloc(NUM_PARTICLES * struct_size)
    base_address = int(particle_pointer)

    # One struct per slot; slot k lives at base_address + k * struct_size.
    particles = [
        ParticleStruct(
            numpy.random.randn(3).astype(numpy.float32),
            numpy.random.randn(3).astype(numpy.float32),
            base_address + slot * struct_size,
        )
        for slot in range(NUM_PARTICLES)
    ]
    print("GPU initial:", [str(particle) for particle in particles])

    kernel = mod.get_function("simulate")
    kernel(
        particle_pointer,
        numpy.int32(NUM_ITERATIONS),
        grid=(GRID_SIZE, 1),
        block=(BLOCK_SIZE, 1, 1),
    )
    print("GPU result:", [str(particle) for particle in particles])

    driver.stop_profiler()
def run_model(args, graph, inputs, outputs, data):
    """Compile ``outputs`` on ``graph``, run it ``args.iter`` times, log timing.

    Args:
        args: Parsed options. Reads ``optimize_for_inference``,
            ``embed_input``, ``dump_cpp_model``, ``profile``, ``warm_up``,
            ``iter``, ``calc_output_rms`` and ``focused_nvprof``.
        graph: Graph object; its optimization level is forced to 0 and it is
            used to compile the outputs.
        inputs: Forwarded to ``tools.embed_inputs`` / ``tools.convert_inputs``.
        outputs: Output variables to optimize, convert and compile.
        data: Mapping from input name to an array-like exposing ``shape``.

    Returns:
        The average speed after the last iteration: batches per second, or
        samples per second when ``data`` has a ``"data"`` entry (scaled by
        its first dimension). ``0.0`` when ``args.iter`` is 0.
    """
    # must use level0 to avoid unintended opr modification
    graph.options.graph_opt_level = 0

    logger.info("input tensors: ")
    for k, v in data.items():
        logger.info(" {}: {}".format(k, v.shape))

    G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args))

    if args.optimize_for_inference:
        opt_kwargs = get_opt_kwargs(args)
        outputs = G.optimize_for_inference(outputs, **opt_kwargs)

    # embed inputs must be on the last, to avoid const fold
    if args.embed_input:
        outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs)
    else:
        outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs)

    if args.dump_cpp_model:
        dump_content, _ = G.dump_graph(outputs, keep_var_name=2)
        with open(args.dump_cpp_model, "wb") as file:
            file.write(dump_content)
        logger.info("C++ model written to {}".format(args.dump_cpp_model))

    outputs, output_dict = tools.convert_outputs(outputs)

    if args.profile:
        profiler = tools.GraphProfiler(graph)

    func = graph.compile(outputs)

    def run():
        # Feed fresh input values unless they were embedded into the graph.
        if not args.embed_input:
            for key in inp_dict:
                inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor())
        func.execute()
        func.wait()
        return [oup_node.get_value().numpy() for oup_node in output_dict.values()]

    if args.warm_up:
        logger.info("warming up")
        run()

    total_time = 0
    # Defined up front so the return below is valid when args.iter is 0
    # (it used to raise UnboundLocalError in that case).
    avg_speed = 0.0

    for i in range(args.iter):
        logger.info("iter {}".format(i))
        start_time = time.time()
        retval = run()
        cur_time = time.time() - start_time
        total_time += cur_time

        avg_speed = (i + 1) / total_time
        if "data" in data:
            avg_speed *= data["data"].shape[0]
            avg_speed_txt = "{:.3f}sample/s".format(avg_speed)
        else:
            avg_speed_txt = "{:.3f}batch/s".format(avg_speed)

        msg = (
            "iter {}: duration={:.4f}({:.4f})s average={:.4f}s "
            "avg_speed={} time={:.4f}s"
        ).format(
            i,
            cur_time,
            func.get_prev_exec_time(),
            total_time / (i + 1),
            avg_speed_txt,
            total_time,
        )
        if args.calc_output_rms:
            # Root-mean-square of every output array.
            rms = [
                "{:.3g}".format(float(((v ** 2).mean()) ** 0.5)) for v in retval
            ]
            msg += " output_rms=[{}]".format(", ".join(rms))
        # Fall back to stdout when the logger would suppress INFO messages.
        if logger.level > logging.INFO:
            print(msg)
        else:
            logger.info(msg)

    if args.focused_nvprof:
        if get_device_count("gpu") < 1:
            logger.warning(
                "No cuda device detected. ``focused_nvprof`` will be ignored."
            )
        else:
            # Keep the try body to the import so an ImportError raised while
            # executing the graph is not mistaken for a missing pycuda.
            try:
                import pycuda.driver as D
            except ImportError:
                logger.error("`focused_nvprof need pycuda`", exc_info=True)
            else:
                D.start_profiler()
                try:
                    func.execute()
                    func.wait()
                finally:
                    # Stop the profiler even if the execution raises.
                    D.stop_profiler()

    if args.profile:
        with open(args.profile, "w") as fout:
            fout.write(profiler.get())

    return avg_speed
# Script fragment: sets up the CUDA profiler, compiles a matrix-transpose
# kernel and creates the host input matrix.
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
# Imported without being referenced by name below (presumably for its side
# effect of creating the CUDA context; confirm against the PyCUDA docs).
import pycuda.autoinit
import numpy
import time
from pycuda.compiler import SourceModule

# File names handed to the profiler: its configuration file and its output.
config_file = "cuda_config_file"
profile_file = "cuda_profile_file"
cuda.initialize_profiler(config_file, profile_file, cuda.profiler_output_mode.KEY_VALUE_PAIR)
# The profiler is started before the kernel is compiled; no matching
# stop_profiler() call is visible in this fragment.
cuda.start_profiler()

# Kernel: every thread whose (Row, Col) lies inside the m x n bounds copies
# a[Row * n + Col] to b[Col * m + Row].
mod = SourceModule(""" __global__ void transpose(float *a, float *b, const unsigned int m, const unsigned int n) { int Row = blockIdx.y * blockDim.y + threadIdx.y; int Col = blockIdx.x * blockDim.x + threadIdx.x; if ((Row < m)&&(Col < n)){ b[Col * m + Row] = a[Row * n + Col]; } } """)

# Matrix dimensions and the host input matrix.
m = 10000
n = 10000
# NOTE(review): randn returns float64 while the kernel takes float*; a cast
# presumably happens later in the script -- confirm.
a = numpy.random.randn(m, n)
def profiled_func(*args, **kwargs):
    """Call ``func`` with the CUDA profiler enabled, then exit the interpreter.

    ``func`` is not defined in this block (presumably the callable captured
    by an enclosing decorator; confirm). All positional and keyword
    arguments are forwarded to it and its return value is discarded.

    Raises:
        SystemExit: Always, via ``sys.exit()``, once ``func`` has returned.
    """
    cuda.start_profiler()
    try:
        func(*args, **kwargs)
    finally:
        # Stop the profiler even when func raises, so it is never left running.
        cuda.stop_profiler()
    #pycuda.autoinit.context.detach()
    sys.exit()
#import PYCUDA modules and libraries from pycuda import driver, compiler, gpuarray, tools import sys #the following module is used to mark the time stamps import time # -- initialize the device import pycuda.autoinit config_file = "cuda_config_file" profile_file = "cuda_profile_file" driver.initialize_profiler(config_file, profile_file, driver.profiler_output_mode.CSV) driver.start_profiler() ############################ ##CUDA KERNEL ########################### kernel_code_template = """ //Transpose function __global__ void matrix_transpose(unsigned int* a, const unsigned int M, const unsigned int N, unsigned int* y) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; unsigned int j = blockIdx.y * blockDim.y + threadIdx.y; if(i<N && j<M){ y[j+M*i] = a[i+N*j]; } }
def counting_vowels_in_text(text):
    """Return the number of vowels found in ``text``, counted on the GPU.

    The letters a, e, i, o, u and y (either case) are counted. Text longer
    than 100 * 1024**2 characters is split into several chunks that are
    processed one after another; within a chunk every GPU thread scans 1000
    characters.

    NOTE(review): the kernel inspects every byte of NumPy's 4-byte-per-
    character string storage, so a non-ASCII character with a byte equal to
    a vowel's ASCII code would be counted as well -- confirm whether inputs
    are ASCII only.

    Args:
        text: The string to scan.

    Returns:
        The total number of matches over all chunks; 0 for empty text.
    """
    if not text:
        # Nothing to scan; also avoids launching a kernel with an empty grid.
        return 0

    # Each thread handles characters [start, end) of the chunk, which map to
    # bytes [4 * start, 4 * end). The loop bound is exclusive: the previous
    # "i <= end" touched one element past both buffers.
    mod = SourceModule("""
    __global__ void count_vowels(char *text, int *results, int text_size,
                                 int chunk_size, int threads_per_block,
                                 int blocks_per_grid)
    {
        int index = blockDim.x * blockIdx.x + threadIdx.x;
        int start = index * chunk_size;
        int end = ( index + 1 ) * chunk_size;
        end = min( end, text_size );
        if (end < start) {
            return;
        }
        int i = 0;
        start = start * 4;
        end = end * 4;
        for(i = start; i < end; i++){
            if (text[i] == 'a' || text[i] == 'A' ||
                text[i] == 'e' || text[i] == 'E' ||
                text[i] == 'i' || text[i] == 'I' ||
                text[i] == 'o' || text[i] == 'O' ||
                text[i] == 'u' || text[i] == 'U' ||
                text[i] == 'y' || text[i] == 'Y') {
                results[i] = 1;
            }
        }
    }
    """)
    # Look the kernel up once instead of once per chunk.
    count_vowels = mod.get_function("count_vowels")

    chunk_size = 1000
    threads_per_block = 512
    max_text_size_in_mb = 100
    max_chunk_len = max_text_size_in_mb * 1024 ** 2

    cuda.start_profiler()
    try:
        # Split oversized text; the last chunk is whatever remains.
        text_chunks = []
        text_chunks_count = math.ceil(len(text) / max_chunk_len)
        while len(text) > max_chunk_len:
            split_at = math.ceil(len(text) / text_chunks_count)
            text_chunks.append(text[:split_at])
            text = text[split_at:]
        text_chunks.append(text)

        cumulative_results = 0
        for text_chunk in text_chunks:
            # A NumPy str array stores 4 bytes per character, hence the
            # factor 4 in the result buffer and in the kernel's indexing.
            device_text = gpuarray.to_gpu(numpy.array([text_chunk], dtype=str))
            device_results = gpuarray.zeros(len(text_chunk) * 4, dtype=numpy.int32)
            # Plain int(): the numpy.int alias was removed in NumPy 1.24.
            blocks_per_grid = int(
                math.ceil(len(text_chunk) / (chunk_size * threads_per_block)))
            # Pass all six kernel parameters; threads_per_block used to be
            # missing, which shifted blocks_per_grid into its slot.
            count_vowels(device_text, device_results,
                         numpy.int32(len(text_chunk)),
                         numpy.int32(chunk_size),
                         numpy.int32(threads_per_block),
                         numpy.int32(blocks_per_grid),
                         block=(threads_per_block, 1, 1),
                         grid=(blocks_per_grid, 1, 1))
            host_results = device_results.get()
            cumulative_results += int(numpy.count_nonzero(host_results == 1))
    finally:
        # Stop the profiler even if a transfer or launch fails.
        cuda.stop_profiler()
    return cumulative_results