Python start_profilerの例、pycuda.driver.start_profiler Pythonの例

コード例 #1

0

ファイルを表示

ファイル: inference.py プロジェクト: landersson/misc

def benchmark(datasrc, model):
    """Time one data fetch, its preprocessing, and repeated model evaluation.

    Prints three measurements: the wall-clock time of ``datasrc.get_item()``,
    the wall-clock time of preparing the array for the model, and the mean
    wall-clock time per call over a fixed number of ``model.evaluate`` calls.
    The profiler is switched on right before the last timed call and
    switched off after the loop.
    """
    t0 = time.time()
    label, data = datasrc.get_item()
    load_ms = (time.time() - t0) * 1000.0
    print("Data load time: %.2fms" % load_ms)

    t0 = time.time()
    # Move axis 2 to the front, then prepend an axis of length 1.
    rolled = np.rollaxis(data, 2)
    batched = np.expand_dims(rolled, 0)
    data = np.ascontiguousarray(batched).astype(model.dtype)
    data = model.normalize(data)
    prep_ms = (time.time() - t0) * 1000.0
    print("Data prep time: %.2fms" % prep_ms)

    input_tensor = GPUTensor(data)

    # A single untimed warm-up call.
    model.evaluate(input_tensor)

    # The clock starts before the announcement below, so the prints inside
    # the loop are part of the measured time.
    t0 = time.time()
    num_iterations = 100
    last_step = num_iterations - 1
    print("Timing %d iterations..." % num_iterations)
    for step in range(num_iterations):
        if step == last_step:
            # Only the final evaluation runs with the profiler enabled.
            drv.start_profiler()
        y = model.evaluate(input_tensor)
        print(y)
    drv.stop_profiler()

    et = (time.time() - t0) * 1000 / num_iterations
    print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))

コード例 #2

0

ファイルを表示

def benchmark(datasrc, model):
    """Report load, preprocessing, and evaluation timings for one data item.

    One item is fetched from ``datasrc``, reshaped and normalised for
    ``model``, wrapped in a ``GPUTensor`` and evaluated repeatedly. Each
    phase prints its elapsed time in milliseconds; the evaluation phase
    also prints the derived calls-per-second figure. Profiling covers only
    the last evaluation of the timed loop.
    """

    def _ms_since(moment):
        # Milliseconds of wall-clock time elapsed since ``moment``.
        return (time.time() - moment) * 1000.0

    began = time.time()
    label, data = datasrc.get_item()
    print("Data load time: %.2fms" % _ms_since(began))

    began = time.time()
    # rollaxis brings axis 2 to position 0; expand_dims adds a new axis 0.
    data = np.expand_dims(np.rollaxis(data, 2), 0)
    data = np.ascontiguousarray(data).astype(model.dtype)
    data = model.normalize(data)
    print("Data prep time: %.2fms" % _ms_since(began))

    input_tensor = GPUTensor(data)

    # warmup: one evaluation that is not part of the measurement
    model.evaluate(input_tensor)

    began = time.time()
    num_iterations = 100
    print("Timing %d iterations..." % num_iterations)
    remaining = num_iterations
    while remaining > 0:
        if remaining == 1:
            # About to run the final iteration: enable the profiler now.
            drv.start_profiler()
        y = model.evaluate(input_tensor)
        print(y)
        remaining -= 1
    drv.stop_profiler()

    et = _ms_since(began) / num_iterations
    print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))

コード例 #3

0

ファイルを表示

ファイル: pycuda_3dlut.py プロジェクト: toru-ver4/sip

def check_time_3dlut(grid_num=17, img_file='../Matrix/figure/src_img.png'):
    """Apply a 3D LUT to an image on the CPU and on the GPU, profiling the GPU run.

    Args:
        grid_num: Grid count forwarded to ``make_3dlut_data``.
        img_file: Path handed to ``img_open_and_normalize``.

    Returns:
        Tuple ``(img_x86, img_gpu)`` holding the results of
        ``exec_3dlut_on_x86`` and ``exec_3dlut_on_gpu`` for the same image
        and LUT.
    """
    # Profiler configuration
    # ----------------------------------------
    config_file = "./data/config.txt"
    log_file = "./data/profile_out.csv"
    output_mode = cuda.profiler_output_mode.KEY_VALUE_PAIR
    cuda.initialize_profiler(config_file, log_file, output_mode)

    # Build the 3D LUT data
    # ----------------------------------------
    matrix_param = np.array([[0.2126, 0.7152, 0.0722],
                             [-0.114572, -0.385428, 0.5],
                             [0.5, -0.454153, -0.045847]])
    kwargs = {'mtx': matrix_param}
    # Honour the caller's grid_num (it used to be hard-coded to 17 here, so
    # the parameter had no effect).
    lut = make_3dlut_data(grid_num=grid_num, func=rgb2yuv_for_3dlut, **kwargs)

    # Apply the 3D LUT
    # ----------------------------------------
    img = img_open_and_normalize(img_file)
    img_x86 = exec_3dlut_on_x86(img=img, lut=lut)
    cuda.start_profiler()
    try:
        img_gpu = exec_3dlut_on_gpu(img=img, lut=lut)
    finally:
        # Always pair start_profiler with stop_profiler, even on failure.
        cuda.stop_profiler()

    return img_x86, img_gpu

コード例 #4

0

ファイルを表示

ファイル: gpuFDMT.py プロジェクト: ajosephy/FDMT

 def execute(self):
     """Run the queued calls and the two follow-up kernels under the profiler.

     Calls every entry of ``self.funcs`` with the matching argument tuple
     from ``self.args``, launches ``gpu_transpose`` and ``gpu_get_sigma``,
     synchronizes the CUDA context, calls ``read_results()`` and prints the
     elapsed wall-clock time of the whole sequence.
     """
     cuda.start_profiler()
     timestamp = time()
     for f, a in zip(self.funcs, self.args):
         f(*a)
     gpu_transpose.prepared_call(grid2, block2, A)
     gpu_get_sigma.prepared_call(grid3, block3, A)
     # Wait for the launched work to finish before reading results / timing.
     cuda.Context.synchronize()
     read_results()
     cuda.stop_profiler()
     # Parenthesised single-argument form works on both Python 2 and 3.
     print("Time for DM tranform, transpose, sigma calculate, get max sigma: %.3f s" % (time()-timestamp))

コード例 #5

0

ファイルを表示

ファイル: FrostTrap.py プロジェクト: khoste/PP-Assignment-3

def frost_trap(kernel_func, args, ver):
    """Solve the FrostTrap on a GPU with CUDA.

    Uploads the trap data to the GPU, sets up the visualizer and
    (optionally) the NVIDIA profiler, calls the kernel function until delta
    drops below epsilon, then appends the elapsed time and the run
    parameters to the result file as JSON.

    Args:
        kernel_func: <simple|rb|dbuf> from kernel.cu
        args: packed command-line options and arguments. This function
            reads ``width``, ``height``, ``omega``, ``iter``, ``graphic``,
            ``profile`` and ``result``, and stores the elapsed seconds in
            ``secs``.
        ver: not used by this function.
    """

    verbose.info("create trap data and init visualizer")
    trap = gpuarray.to_gpu(create_trap(args.width, args.height))

    # delta   - total change
    # epsilon - minimum convergence limit
    # delta starts equal to epsilon so the first loop test passes.
    epsilon = np.float32(0.001 * args.width * args.height)
    delta = np.array(epsilon)

    # init visualizer
    vis = Visualizer(trap, do_vis=args.graphic)

    # start nvidia profiler
    if args.profile:
        cuda.start_profiler()

    try:
        verbose.info("start")
        t1 = timeit.default_timer()

        # Iterate until the total change falls below the convergence limit.
        # The test used to be ``delta <= epsilon``, which keeps looping only
        # while the run is already converged.
        # NOTE(review): assumes kernel_func updates ``delta`` in place --
        # confirm against the kernel wrappers.
        while delta >= epsilon:
            # experiment with different block and grid sizes
            kernel_func(trap, args.height, args.width, args.omega, epsilon,
                        args.iter, delta, block=(1, 300, 1), grid=(300, 1, 1))
            vis.update()

        t2 = timeit.default_timer()
        verbose.info("done")
    finally:
        # stop nvidia profiler, also when the solve loop raises
        if args.profile:
            cuda.stop_profiler()

    # dump result and parameters as json
    with open(args.result, "a+") as f:
        # The message used to be a plain string containing "{args.result}",
        # so the path was never interpolated.
        verbose.info("dumping results to {}".format(args.result))
        args.secs = t2 - t1
        # NOTE(review): the fallback serializer evals str(x) for values json
        # cannot encode; this is only safe while every value in ``args``
        # comes from a trusted source.
        json.dump(vars(args), f, default=lambda x: eval(str(x)), indent=2)
コード例 #6

0

ファイルを表示

def simulate_gpu():
    """Run the ``simulate`` kernel over freshly created particles.

    Allocates one device buffer sized for ``NUM_PARTICLES`` structs, builds a
    ``ParticleStruct`` per slot from two random 3-vectors and the slot's
    device address, prints the particles, launches the kernel, and prints
    the particles again. The whole function runs between
    ``driver.start_profiler()`` and ``driver.stop_profiler()``.
    """
    driver.start_profiler()

    stride = ParticleStruct.mem_size
    particle_pointer = cuda.mem_alloc(NUM_PARTICLES * stride)
    base_address = int(particle_pointer)

    # Argument order matters: the first random vector is drawn before the
    # second for every particle.
    particles = [
        ParticleStruct(
            numpy.random.randn(3).astype(numpy.float32),
            numpy.random.randn(3).astype(numpy.float32),
            base_address + slot * stride)
        for slot in range(NUM_PARTICLES)
    ]

    print("GPU initial:", [str(particle) for particle in particles])

    simulate_kernel = mod.get_function("simulate")
    simulate_kernel(
        particle_pointer,
        numpy.int32(NUM_ITERATIONS),
        grid=(GRID_SIZE, 1),
        block=(BLOCK_SIZE, 1, 1))

    print("GPU result:", [str(particle) for particle in particles])
    driver.stop_profiler()

コード例 #7

0

ファイルを表示

def run_model(args, graph, inputs, outputs, data):
    """Compile the graph, run it ``args.iter`` times and report throughput.

    Args:
        args: Options object. This function reads ``optimize_for_inference``,
            ``embed_input``, ``dump_cpp_model``, ``profile``, ``warm_up``,
            ``iter``, ``calc_output_rms`` and ``focused_nvprof``.
        graph: Graph whose ``options`` are adjusted and which is compiled
            via ``graph.compile``.
        inputs: Input description forwarded to ``tools.embed_inputs`` /
            ``tools.convert_inputs``.
        outputs: Output vars; rewritten by the optimisation and conversion
            steps before compilation.
        data: Mapping from input name to array; values expose ``.shape``.

    Returns:
        Average speed after the last iteration: iterations per second,
        multiplied by ``data["data"].shape[0]`` when a ``"data"`` entry
        exists. ``0.0`` when ``args.iter`` is zero.
    """
    # must use level0 to avoid unintended opr modification
    graph.options.graph_opt_level = 0

    logger.info("input tensors: ")
    for k, v in data.items():
        logger.info("  {}: {}".format(k, v.shape))

    G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args))

    if args.optimize_for_inference:
        opt_kwargs = get_opt_kwargs(args)
        outputs = G.optimize_for_inference(outputs, **opt_kwargs)

    # embed inputs must be on the last, to avoid const fold
    if args.embed_input:
        outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs)
    else:
        outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs)

    if args.dump_cpp_model:
        dump_content, _ = G.dump_graph(outputs, keep_var_name=2)
        with open(args.dump_cpp_model, "wb") as file:
            file.write(dump_content)
        logger.info("C++ model written to {}".format(args.dump_cpp_model))

    outputs, output_dict = tools.convert_outputs(outputs)

    if args.profile:
        # Created before compilation; its report is written at the end.
        profiler = tools.GraphProfiler(graph)

    func = graph.compile(outputs)

    def run():
        # Feed inputs (unless they were embedded), execute, and fetch outputs.
        if not args.embed_input:
            for key in inp_dict:
                inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor())
        func.execute()
        func.wait()
        return [oup_node.get_value().numpy() for oup_node in output_dict.values()]

    if args.warm_up:
        logger.info("warming up")
        run()

    total_time = 0
    # Defined up front so the final ``return`` cannot hit an unbound name
    # when the loop body never runs (args.iter == 0).
    avg_speed = 0.0

    for i in range(args.iter):
        logger.info("iter {}".format(i))
        start_time = time.time()
        retval = run()
        cur_time = time.time() - start_time
        total_time += cur_time

        avg_speed = (i + 1) / total_time
        if "data" in data:
            avg_speed *= data["data"].shape[0]
            avg_speed_txt = "{:.3f}sample/s".format(avg_speed)
        else:
            avg_speed_txt = "{:.3f}batch/s".format(avg_speed)

        msg = (
            "iter {}: duration={:.4f}({:.4f})s average={:.4f}s "
            "avg_speed={} time={:.4f}s"
        ).format(
            i,
            cur_time,
            func.get_prev_exec_time(),
            total_time / (i + 1),
            avg_speed_txt,
            total_time,
        )
        if args.calc_output_rms:
            rms = []
            for v in retval:
                rms.append("{:.3g}".format(float(((v ** 2).mean()) ** 0.5)))
            msg += " output_rms=[{}]".format(", ".join(rms))
        # Fall back to print when the logger would suppress INFO messages.
        if logger.level > logging.INFO:
            print(msg)
        else:
            logger.info(msg)

    if args.focused_nvprof:
        if get_device_count("gpu") < 1:
            logger.warning(
                "No cuda device detected. ``focused_nvprof`` will be ignored."
            )
        else:
            # Only the import is guarded, so an ImportError raised while
            # executing the graph is not mistaken for a missing pycuda.
            try:
                import pycuda.driver as D
            except ImportError:
                logger.error("`focused_nvprof need pycuda`", exc_info=True)
            else:
                D.start_profiler()
                try:
                    func.execute()
                    func.wait()
                finally:
                    D.stop_profiler()

    if args.profile:
        with open(args.profile, "w") as fout:
            fout.write(profiler.get())

    return avg_speed

コード例 #8

0

ファイルを表示

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
import time

from pycuda.compiler import SourceModule

# Profiler setup: file names handed to initialize_profiler below.
config_file = "cuda_config_file"
profile_file = "cuda_profile_file"
cuda.initialize_profiler(config_file, profile_file,
                         cuda.profiler_output_mode.KEY_VALUE_PAIR)

# Profiling starts here, i.e. before the kernel source below is compiled.
cuda.start_profiler()

# CUDA kernel: for every (Row, Col) with Row < m and Col < n it copies
# a[Row * n + Col] into b[Col * m + Row].
mod = SourceModule("""
__global__ void transpose(float *a, float *b, const unsigned int m, const unsigned int n)
{
  int Row = blockIdx.y * blockDim.y + threadIdx.y;
  int Col = blockIdx.x * blockDim.x + threadIdx.x;

  if ((Row < m)&&(Col < n)){
    b[Col * m + Row] = a[Row * n + Col];
  }
}
""")

# Dimensions of the random host array created below.
m = 10000
n = 10000

# NOTE(review): randn returns float64 while the kernel takes float*;
# confirm a float32 conversion happens in the code that follows.
a = numpy.random.randn(m, n)

コード例 #9

0

ファイルを表示

ファイル: benchmarks.py プロジェクト: renlliang3/cuvarbase

 def profiled_func(*args, **kwargs):
     """Call ``func`` between profiler start/stop, then exit the interpreter.

     All positional and keyword arguments are forwarded to ``func``; its
     return value is discarded. ``sys.exit()`` is called afterwards, so
     control never returns to the caller.
     """
     cuda.start_profiler()
     func(*args, **kwargs)
     cuda.stop_profiler()
     #pycuda.autoinit.context.detach()
     sys.exit()

コード例 #10

0

ファイルを表示

#import PYCUDA modules and libraries
from pycuda import driver, compiler, gpuarray, tools
import sys
#the following module is used to mark the time stamps
import time

# -- initialize the device
import pycuda.autoinit

# File names handed to initialize_profiler below.
config_file = "cuda_config_file"
profile_file = "cuda_profile_file"

# Configure the profiler with CSV output mode.
driver.initialize_profiler(config_file, profile_file,
                           driver.profiler_output_mode.CSV)

# Profiling starts here, before the kernel source further down is defined.
driver.start_profiler()

############################
##CUDA KERNEL
###########################
kernel_code_template = """
//Transpose function
__global__ void matrix_transpose(unsigned int* a, const unsigned int M, const unsigned int N, unsigned int* y) {
 
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
 
    if(i<N && j<M){
      y[j+M*i] = a[i+N*j];
    }
}

コード例 #11

0

ファイルを表示

def counting_vowels_in_text(text):
    """Return the number of vowels found in ``text``, counted on the GPU.

    The letters a, e, i, o, u and y (either case) count as vowels. The text
    is processed in pieces of at most ``max_text_size_in_mb * 1024**2``
    characters; each piece is uploaded, scanned by the ``count_vowels``
    kernel and the per-piece hit counts are summed. The whole computation
    runs between ``cuda.start_profiler()`` and ``cuda.stop_profiler()``.

    Args:
        text: The string to scan. An empty string yields 0 without
            launching a kernel.

    Returns:
        Total number of vowel hits over all pieces.
    """

    # Each thread scans ``chunk_size`` characters. The host uploads the text
    # as a NumPy unicode array (4 bytes per character), which is why the
    # kernel scales its bounds by 4 and walks the buffer byte by byte.
    # The loop bound is exclusive: byte ``end`` belongs to the next thread,
    # and for the last thread it lies one past the end of both buffers.
    mod = SourceModule("""
    __global__ void count_vowels(char *text, int *results, int text_size, int chunk_size, int threads_per_block, int blocks_per_grid)
    {
        int index = blockDim.x * blockIdx.x + threadIdx.x;

        int start = index * chunk_size;
        int end = ( index + 1 ) * chunk_size;

        end = min( end, text_size );
        if (end < start)
        {
            return; 
        }

        int i = 0; 

        start = start * 4;
        end = end * 4;
        
        for(i = start; i < end; i++){

            if (text[i] == 'a' || text[i] == 'A' || text[i] == 'e' || text[i] == 'E' || text[i] == 'i' 
                || text[i] == 'I' || text[i] =='o' || text[i] =='O' || text[i] == 'u' || text[i] == 'U' || text[i] == 'y' || text[i] == 'Y')
            {
                results[i] = 1;
            }
        }

    }
    """)
    function = mod.get_function("count_vowels")

    max_text_size_in_mb = 100
    max_chunk_chars = max_text_size_in_mb * 1024**2

    # Fixed-size slices; an empty text produces no chunks at all.
    text_chunks = [
        text[offset:offset + max_chunk_chars]
        for offset in range(0, len(text), max_chunk_chars)
    ]

    chunk_size = 1000
    threads_per_block = 512

    cumulative_results = 0
    cuda.start_profiler()
    try:
        for text_chunk in text_chunks:
            # NOTE(review): the kernel compares individual bytes of the
            # 4-byte characters, so a non-ASCII character containing a byte
            # equal to an ASCII vowel would also be counted -- confirm
            # inputs are ASCII or switch the kernel to whole code points.
            device_text = gpuarray.to_gpu(numpy.array([text_chunk], dtype=str))
            device_results = gpuarray.zeros(len(text_chunk) * 4, dtype=numpy.int32)

            # Built-in int: the ``numpy.int`` alias no longer exists in
            # current NumPy releases.
            blocks_per_grid = int(
                math.ceil(len(text_chunk) / (chunk_size * threads_per_block)))

            device_text_size = numpy.int32(len(text_chunk))

            # One argument per kernel parameter, in declaration order
            # (threads_per_block used to be missing from this call).
            function(device_text,
                     device_results,
                     device_text_size,
                     numpy.int32(chunk_size),
                     numpy.int32(threads_per_block),
                     numpy.int32(blocks_per_grid),
                     block=(threads_per_block, 1, 1),
                     grid=(blocks_per_grid, 1, 1))

            host_results = device_results.get()

            results = numpy.count_nonzero(host_results == 1)
            cumulative_results += results
    finally:
        # Always pair start_profiler with stop_profiler, even on failure.
        cuda.stop_profiler()

    return cumulative_results