Example #1
0
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = drv.SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32,8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        strm = drv.Stream()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), drv.In(a), drv.In(b),
                block=shape+(1,), stream=strm)
        strm.synchronize()

        self.assert_(la.norm(dest-a*b) == 0)
	def set_array(self, array, copy=True):
            """
                Sets the local array in the Surface to the first argument, array, a
                a numpy array.
                Parameters
                ----------
                array : 2D numpy array
                     Container defining the surface.
                copy: boolean
                     When True, will immediately copy local data to device.
            """
            self.local_array = drv.pagelocked_empty_like(array)
            self.bounds = array.shape
            if copy == True:
                  self.local_to_device()
 def set_array(self, array, copy=True):
     """
             Sets the local array in the Surface to the first argument, array, a
             a numpy array.
             Parameters
             ----------
             array : 2D numpy array
                  Container defining the surface.
             copy: boolean
                  When True, will immediately copy local data to device.
         """
     self.local_array = drv.pagelocked_empty_like(array)
     self.bounds = array.shape
     if copy == True:
         self.local_to_device()
Example #4
0
        def function(in_array):
            nz, ny, nx = in_array.shape
            out_array = driver.pagelocked_empty_like(
                in_array,
                mem_flags=driver.host_alloc_flags.DEVICEMAP
            )

            block_size = (32, 32, 1)
            grid_size = (-(-nx//32), -(-ny//32), 1)

            kernel(
                driver.Out(out_array),
                driver.In(in_array),
                np.int32(nx), np.int32(ny), np.int32(nz),
                grid=grid_size, block=block_size
            )

            return out_array
Example #5
0
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32, 8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(drv.Out(dest),
                      a_gpu,
                      b_gpu,
                      block=shape + (1, ),
                      stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest - a * b) == 0
Example #6
0
    def run(self, array):
        shape = array.shape
        if len(shape) != 3:
            raise ValueError("only 3D array is allowed")
        
        nz, ny, nx = shape
        logger.debug("shape, nx={}, ny={}, nz={}".format(nx, ny, nz))

        self._prepare_workspace(array.dtype)

        locked_in_array = driver.pagelocked_empty_like(array)
        locked_in_array[:] = array

        locked_out_array = self._copy(locked_in_array)
        
        out_array = np.empty_like(locked_out_array)
        out_array[:] = locked_out_array
        
        return out_array
Example #7
0
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        shape = (32, 8)
        a = drv.pagelocked_zeros(shape, dtype=np.float32)
        b = drv.pagelocked_zeros(shape, dtype=np.float32)
        a[:] = np.random.randn(*shape)
        b[:] = np.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), a_gpu, b_gpu,
                block=shape+(1,), stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest-a*b) == 0
def convert_validate_save(
        onnx_model_filename: str,
        golden_data_filename: 'Optional[str]' = '',
        atol: float = 1e-3,
        rtol: float = 1e-3,
        batch_size: int = 1,  #
        debug: bool = False,
        **kwargs) -> bool:
    r"""
        inference model in 'tensorrt'
        validate with given golden data
        save if accuracy passed
    """

    import numpy as np
    import pycuda.autoinit  # noqa: just import, no code check
    import pycuda.driver as cuda
    import tensorrt as trt

    trt_logger = trt.Logger(
        trt.Logger.VERBOSE if debug else trt.Logger.WARNING)
    builder = trt.Builder(trt_logger)
    network = builder.create_network()
    parser = trt.OnnxParser(network, trt_logger)

    logger.info('loading ONNX model: %s ...', onnx_model_filename)
    with open(onnx_model_filename, 'rb') as fp:
        onnx_model_proto_str = fp.read()
    success = parser.parse(onnx_model_proto_str)
    if not success:
        logger.error('model parsing failed:')
        for idx_error in range(parser.num_errors):
            logger.error('\t%s', parser.get_error(idx_error))
        return False
    logger.info('model parsing passed')

    workspace_size = kwargs.pop('workspace_size',
                                1024 * 1024 * 16)  # default to 1024*1024*16
    fp16_mode = kwargs.pop('fp16_mode', builder.platform_has_fast_fp16)
    int8_mode = kwargs.pop('int8_mode', builder.platform_has_fast_int8)

    builder.debug_sync = debug
    builder.fp16_mode = fp16_mode
    builder.max_batch_size = batch_size
    builder.max_workspace_size = workspace_size
    builder.refittable = False
    builder.strict_type_constraints = True

    logger.info('using batch_size: %d', builder.max_batch_size)
    logger.info('I/O type-shape info:')
    if int8_mode:
        default_range = (-127, +127)
        builder.int8_mode = int8_mode
        for layer in network:
            for idx_out in range(layer.num_outputs):
                var_out = layer.get_output(idx_out)
                var_out.set_dynamic_range(-127, +127)
        dynamic_ranges = kwargs.pop('io_dynamic_ranges', dict())
        for idx_inp in range(network.num_inputs):
            var_inp = network.get_input(idx_inp)
            dr_lo, dr_hi = dynamic_ranges.get(var_inp.name, default_range)
            var_inp.set_dynamic_range(dr_lo, dr_hi)
            logger.info('\t input %d (%12s): %s%s in [%d, %d]', idx_inp,
                        var_inp.name, var_inp.dtype, var_inp.shape, dr_lo,
                        dr_hi)
        for idx_out in range(network.num_outputs):
            var_out = network.get_output(idx_out)
            dr_lo, dr_hi = dynamic_ranges.get(var_out.name, default_range)
            var_out.set_dynamic_range(dr_lo, dr_hi)
            logger.info('\toutput %d (%12s): %s%s in [%d, %d]', idx_out,
                        var_out.name, var_out.dtype, var_out.shape, dr_lo,
                        dr_hi)
        # TODO: int8 calibrate
    else:
        for idx_inp in range(network.num_inputs):
            var_inp = network.get_input(idx_inp)
            logger.info('\t input %d (%12s): %s%s', idx_inp, var_inp.name,
                        var_inp.dtype, var_inp.shape)
        for idx_out in range(network.num_outputs):
            var_out = network.get_output(idx_out)
            logger.info('\toutput %d (%12s): %s%s', idx_out, var_out.name,
                        var_out.dtype, var_out.shape)

    # not exposed
#    builder.getNbDLACores() > 0
#    builder.allowGPUFallback(True)
#    builder.setDefaultDeviceType(kDLA)
#    builder.setDLACore(1)

    engine = builder.build_cuda_engine(network)
    if engine is None:
        logger.info('engine building failed')
        return False
    logger.info('engine building passed')

    #    globals().update(locals())

    if golden_data_filename:
        logger.info('using golden data %s', golden_data_filename)
        if golden_data_filename.endswith('.npz'):
            test_data = np.load(
                golden_data_filename,
                encoding='bytes',
                allow_pickle=True,
            )
            input_data = test_data['inputs'].tolist()
            output_data = test_data['outputs'].tolist()
        else:
            test_data = np.load(
                golden_data_filename,
                encoding='bytes',
                allow_pickle=True,
            ).tolist()
            input_data = test_data['inputs']
            output_data = test_data['outputs']

        input_data = flatten_dict(input_data)
        output_data = flatten_dict(output_data)
        #        input_names = input_data.keys()
        output_names = output_data.keys()
        logger.info('with %d inputs and %d outputs', len(input_data),
                    len(output_data))

        input_device_data = {
            name: cuda.to_device(value)
            for name, value in input_data.items()
        }
        output_device_data = {
            name: cuda.mem_alloc(value.nbytes)
            for name, value in output_data.items()
        }
        output_host_data = {
            name: cuda.pagelocked_empty_like(value)
            for name, value in output_data.items()
        }
        logger.info('data transfered to device')

        profiler = trt.Profiler()
        with engine.create_execution_context() as context:
            if debug:
                context.profiler = profiler
            stream = cuda.Stream()

            #            for name in input_names:
            #                cuda.memcpy_htod_async(
            #                        input_data[name], input_device_data[name],
            #                                       stream=stream)

            device_data = list(input_device_data.values()) + list(
                output_device_data.values())
            success = context.execute_async(batch_size,
                                            bindings=list(map(
                                                int, device_data)),
                                            stream_handle=stream.handle,
                                            input_consumed=None)
            if not success:
                logger.error('execution failed')
                return False

            for name in output_names:
                cuda.memcpy_dtoh_async(
                    output_host_data[name],
                    output_device_data[name],
                    stream=stream,
                )

            stream.synchronize()

        logger.info('execution passed')


#        output_host_data[name] = onnx2trt_inference(
#                onnx_model_filename, list(input_data.values()),
#                batch_size, workspace_size)[0]

# validate
    passed = True
    if golden_data_filename:
        for name in output_names:
            pr = output_host_data[name]
            gt = output_data[name]
            logger.info('testing on output %s ...', name)
            try:
                np.testing.assert_allclose(
                    pr,
                    gt,
                    rtol=rtol,
                    atol=atol,
                    equal_nan=False,
                    verbose=True,
                )
            except AssertionError as e:
                passed = False
                logger.error('failed: %s\n', e)
        logger.info('accuracy %spassed', '' if passed else 'not ')

    globals().update(locals())

    if passed:
        trt_engine_filename = onnx_model_filename[:-len('.onnx'
                                                        )] + '.bin'  # or .trt
        with open(trt_engine_filename, 'wb') as fp:
            fp.write(engine.serialize())
        logger.info('engine saved to %s', trt_engine_filename)

    return passed
Example #9
0
def trikmeans_gpu(data, clusters, iterations, return_times = 0):
    """trikmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    
    K-means using triangle inequality algorithm and PyCuda
    Input arguments are the data, intial cluster values, and number of iterations to repeat.
    The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
        nPts = number of data points.
    The shape of clusters is (nDim, nClusters) 
    
    The return values are the updated clusters and labels for the data
    """

    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]


    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0
    
    usePageLockedMemory = 0

    if(nPts > 32768):
        useTextureForData = 0
    
    
    # block and grid sizes for the ccdist kernel (also for hdclosest)
    blocksize_ccdist = min(512, 16*(1+(nClusters-1)/16))
    gridsize_ccdist = 1 + (nClusters-1)/blocksize_ccdist
    
    #block and grid sizes for the init module
    threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16)
    #blocksize_init = min(512, threads_desired)
    blocksize_init = min(128, threads_desired)
    gridsize_init = 1 + (threads_desired - 1)/blocksize_init
    
    #block and grid sizes for the step3 module
    blocksize_step3 = blocksize_init
    if not useTextureForData:
        blocksize_step3 = min(256, blocksize_step3)
    gridsize_step3 = gridsize_init
    
    #block and grid sizes for the step4 module
    # Each block of threads will handle seqcount times the data
    # eg blocksize of 512 and seqcount of 4, each block reduces 4*512 = 2048 elements
    blocksize_step4 = 2
    while(blocksize_step4 < min(512, nPts)):
        blocksize_step4 *= 2
    maxblocks = 512
    seqcount_step4 = 1 + (nPts-1)/(blocksize_step4*maxblocks)
    gridsize_step4 = 1 + (nPts-1)/(seqcount_step4*blocksize_step4)
    
    blocksize_step4part2 = 1
    while(blocksize_step4part2 < gridsize_step4):
        blocksize_step4part2 *= 2
    
    
    
    #block and grid sizes for the calc_movement module
    for blocksize_calcm in range(32, 512, 32):
        if blocksize_calcm >= nClusters:
            break;
    gridsize_calcm = 1 + (nClusters-1)/blocksize_calcm
    
    #block and grid sizes for the step56 module
    blocksize_step56 = blocksize_init
    gridsize_step56 = gridsize_init
    
    
    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()
    
    mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters,
                                        blocksize_step4, seqcount_step4, gridsize_step4, 
                                        blocksize_step4part2, useTextureForData)

    ccdist = mod_ccdist.get_function("ccdist")
    calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
    init = mod_ccdist.get_function("init")
    step3 = mod_ccdist.get_function("step3")
    step4 = mod_ccdist.get_function("step4")
    step4part2 = mod_ccdist.get_function("step4part2")
    calc_movement = mod_ccdist.get_function("calc_movement")
    step56 = mod_ccdist.get_function("step56")
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2-t1


    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)
    
    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_ccdist.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:,:] = data;
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:,:] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)


    gpu_assignments = gpuarray.zeros((nPts,), np.int32)         # cluster assignment
    gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32)   # lower bounds on distance between 
                                                                # point and each cluster
    gpu_upper = gpuarray.zeros((nPts,), np.float32)             # upper bounds on distance between
                                                                # point and any cluster
    gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32)    # cluster-cluster distances
    gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32)    # half distance to closest
    gpu_hdClosest.fill(1.0e10)  # set to large value // **TODO**  get the acutal float max
    gpu_badUpper = gpuarray.zeros((nPts,), np.int32)   # flag to indicate upper bound needs recalc
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32);
    gpu_cluster_movement = gpuarray.zeros((nClusters,), np.float32);
    
    gpu_cluster_changed = gpuarray.zeros((nClusters,), np.int32)
    gpu_cluster_changed.fill(1)
    
    gpu_reduction_out = gpuarray.zeros((nDim, nClusters*gridsize_step4), np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters*gridsize_step4,), np.int32)
    
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2-t1
    
    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    ccdist_time = 0.
    hdclosest_time = 0.
    init_time = 0.
    step3_time = 0.
    step4_time = 0.
    step56_time = 0.

    t1 = time.time()
    ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
             block = (blocksize_ccdist, 1, 1),
             grid = (gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    ccdist_time += t2-t1
    
    t1 = time.time()
    calc_hdclosest(gpu_ccdist, gpu_hdClosest,
            block = (blocksize_ccdist, 1, 1),
            grid = (gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    hdclosest_time += t2-t1
    
    t1 = time.time()
    if useTextureForData:
        init(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, 
                gpu_lower, gpu_upper,
                block = (blocksize_init, 1, 1),
                grid = (gridsize_init, 1),
                texrefs=[texrefData])
    else:
        init(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, 
                gpu_lower, gpu_upper,
                block = (blocksize_init, 1, 1),
                grid = (gridsize_init, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    init_time += t2-t1

    for i in range(iterations):
    
        if i>0:
            t1 = time.time()
            ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
                     block = (blocksize_ccdist, 1, 1),
                     grid = (gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            ccdist_time += t2-t1
            
            t1 = time.time()
            calc_hdclosest(gpu_ccdist, gpu_hdClosest,
                    block = (blocksize_ccdist, 1, 1),
                    grid = (gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            hdclosest_time += t2-t1

            
        t1 = time.time()
        if i > 0:
            gpu_cluster_changed.fill(0)
        if useTextureForData:
            step3(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                    gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed,
                    block = (blocksize_step3, 1, 1),
                    grid = (gridsize_step3, 1),
                    texrefs=[texrefData])
        else:
            step3(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                    gpu_lower, gpu_upper, gpu_badUpper,  gpu_cluster_changed,
                    block = (blocksize_step3, 1, 1),
                    grid = (gridsize_step3, 1))
        
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step3_time += t2-t1
        
        
        t1 = time.time()
        
        if useTextureForData:
            step4(gpu_cluster_changed, gpu_reduction_out,
                gpu_reduction_counts, gpu_assignments,
                block = (blocksize_step4, 1, 1),
                grid = (gridsize_step4, nDim),
                texrefs=[texrefData])
        else:
            step4(gpu_data, gpu_cluster_changed, gpu_reduction_out,
                gpu_reduction_counts, gpu_assignments,
                block = (blocksize_step4, 1, 1),
                grid = (gridsize_step4, nDim))
        
        step4part2(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, 
                gpu_clusters2, gpu_clusters,
                block = (blocksize_step4part2, 1, 1),
                grid = (1, nDim))
        
        calc_movement(gpu_clusters, gpu_clusters2, gpu_cluster_movement, gpu_cluster_changed,
                block = (blocksize_calcm, 1, 1),
                grid = (gridsize_calcm, 1))
        
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step4_time += t2-t1
    
        t1 = time.time()
        if useTextureForData:
            step56(gpu_assignments, gpu_lower, gpu_upper, 
                    gpu_cluster_movement, gpu_badUpper,
                    block = (blocksize_step56, 1, 1),
                    grid = (gridsize_step56, 1),
                    texrefs=[texrefData])
        else:
            step56(gpu_assignments, gpu_lower, gpu_upper, 
                    gpu_cluster_movement, gpu_badUpper,
                    block = (blocksize_step56, 1, 1),
                    grid = (gridsize_step56, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step56_time += t2-t1
        
        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp
        
    if return_times:
        return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \
                gpu_clusters.get(), gpu_cluster_movement, \
                data_time, module_time, init_time, \
                ccdist_time/iterations, hdclosest_time/iterations, \
                step3_time/iterations, step4_time/iterations, step56_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
Example #10
0
def trikmeans_gpu(data, clusters, iterations, return_times=0):
    """trikmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    
    K-means using triangle inequality algorithm and PyCuda
    Input arguments are the data, intial cluster values, and number of iterations to repeat.
    The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
        nPts = number of data points.
    The shape of clusters is (nDim, nClusters) 
    
    The return values are the updated clusters and labels for the data
    """

    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]

    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0

    usePageLockedMemory = 0

    if (nPts > 32768):
        useTextureForData = 0

    # block and grid sizes for the ccdist kernel (also for hdclosest)
    blocksize_ccdist = min(512, 16 * (1 + (nClusters - 1) / 16))
    gridsize_ccdist = 1 + (nClusters - 1) / blocksize_ccdist

    #block and grid sizes for the init module
    threads_desired = 16 * (1 + (max(nPts, nDim * nClusters) - 1) / 16)
    #blocksize_init = min(512, threads_desired)
    blocksize_init = min(128, threads_desired)
    gridsize_init = 1 + (threads_desired - 1) / blocksize_init

    #block and grid sizes for the step3 module
    blocksize_step3 = blocksize_init
    if not useTextureForData:
        blocksize_step3 = min(256, blocksize_step3)
    gridsize_step3 = gridsize_init

    #block and grid sizes for the step4 module
    # Each block of threads will handle seqcount times the data
    # eg blocksize of 512 and seqcount of 4, each block reduces 4*512 = 2048 elements
    blocksize_step4 = 2
    while (blocksize_step4 < min(512, nPts)):
        blocksize_step4 *= 2
    maxblocks = 512
    seqcount_step4 = 1 + (nPts - 1) / (blocksize_step4 * maxblocks)
    gridsize_step4 = 1 + (nPts - 1) / (seqcount_step4 * blocksize_step4)

    blocksize_step4part2 = 1
    while (blocksize_step4part2 < gridsize_step4):
        blocksize_step4part2 *= 2
    """
    print "blocksize_step4 =", blocksize_step4
    print "gridsize_step4 =", gridsize_step4
    print "seqcount_step4 =", seqcount_step4
    """

    #block and grid sizes for the calc_movement module
    for blocksize_calcm in range(32, 512, 32):
        if blocksize_calcm >= nClusters:
            break
    gridsize_calcm = 1 + (nClusters - 1) / blocksize_calcm

    #block and grid sizes for the step56 module
    blocksize_step56 = blocksize_init
    gridsize_step56 = gridsize_init

    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()

    mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters, blocksize_step4,
                                        seqcount_step4, gridsize_step4,
                                        blocksize_step4part2,
                                        useTextureForData, BOUNDS)

    ccdist = mod_ccdist.get_function("ccdist")
    calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
    init = mod_ccdist.get_function("init")
    step3 = mod_ccdist.get_function("step3")
    step4 = mod_ccdist.get_function("step4")
    step4part2 = mod_ccdist.get_function("step4part2")
    calc_movement = mod_ccdist.get_function("calc_movement")
    step56 = mod_ccdist.get_function("step56")
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2 - t1

    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)

    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_ccdist.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:, :] = data
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:, :] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)

    gpu_assignments = gpuarray.zeros((nPts, ), np.int32)  # cluster assignment
    gpu_lower = gpuarray.zeros((nClusters, nPts),
                               np.float32)  # lower bounds on distance between
    # point and each cluster
    gpu_upper = gpuarray.zeros((nPts, ),
                               np.float32)  # upper bounds on distance between
    # point and any cluster
    gpu_ccdist = gpuarray.zeros((nClusters, nClusters),
                                np.float32)  # cluster-cluster distances
    gpu_hdClosest = gpuarray.zeros((nClusters, ),
                                   np.float32)  # half distance to closest
    gpu_hdClosest.fill(
        1.0e10)  # set to large value // **TODO**  get the acutal float max
    gpu_badUpper = gpuarray.zeros(
        (nPts, ), np.int32)  # flag to indicate upper bound needs recalc
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32)
    gpu_cluster_movement = gpuarray.zeros((nClusters, ), np.float32)

    gpu_cluster_changed = gpuarray.zeros((nClusters, ), np.int32)
    gpu_cluster_changed.fill(1)

    gpu_reduction_out = gpuarray.zeros((nDim, nClusters * gridsize_step4),
                                       np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters * gridsize_step4, ),
                                          np.int32)

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2 - t1

    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    ccdist_time = 0.
    hdclosest_time = 0.
    init_time = 0.
    step3_time = 0.
    step4_time = 0.
    step56_time = 0.

    t1 = time.time()
    ccdist(gpu_clusters,
           gpu_ccdist,
           gpu_hdClosest,
           block=(blocksize_ccdist, 1, 1),
           grid=(gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    ccdist_time += t2 - t1

    t1 = time.time()
    calc_hdclosest(gpu_ccdist,
                   gpu_hdClosest,
                   block=(blocksize_ccdist, 1, 1),
                   grid=(gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    hdclosest_time += t2 - t1

    t1 = time.time()
    if useTextureForData:
        init(gpu_clusters,
             gpu_ccdist,
             gpu_hdClosest,
             gpu_assignments,
             gpu_lower,
             gpu_upper,
             block=(blocksize_init, 1, 1),
             grid=(gridsize_init, 1),
             texrefs=[texrefData])
    else:
        init(gpu_data,
             gpu_clusters,
             gpu_ccdist,
             gpu_hdClosest,
             gpu_assignments,
             gpu_lower,
             gpu_upper,
             block=(blocksize_init, 1, 1),
             grid=(gridsize_init, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    init_time += t2 - t1

    for i in range(iterations):

        if i > 0:
            t1 = time.time()
            ccdist(gpu_clusters,
                   gpu_ccdist,
                   gpu_hdClosest,
                   block=(blocksize_ccdist, 1, 1),
                   grid=(gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            ccdist_time += t2 - t1

            t1 = time.time()
            calc_hdclosest(gpu_ccdist,
                           gpu_hdClosest,
                           block=(blocksize_ccdist, 1, 1),
                           grid=(gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            hdclosest_time += t2 - t1

        t1 = time.time()
        if i > 0:
            gpu_cluster_changed.fill(0)
        if useTextureForData:
            step3(gpu_clusters,
                  gpu_ccdist,
                  gpu_hdClosest,
                  gpu_assignments,
                  gpu_lower,
                  gpu_upper,
                  gpu_badUpper,
                  gpu_cluster_changed,
                  block=(blocksize_step3, 1, 1),
                  grid=(gridsize_step3, 1),
                  texrefs=[texrefData])
        else:
            step3(gpu_data,
                  gpu_clusters,
                  gpu_ccdist,
                  gpu_hdClosest,
                  gpu_assignments,
                  gpu_lower,
                  gpu_upper,
                  gpu_badUpper,
                  gpu_cluster_changed,
                  block=(blocksize_step3, 1, 1),
                  grid=(gridsize_step3, 1))

        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step3_time += t2 - t1

        t1 = time.time()

        if useTextureForData:
            step4(gpu_cluster_changed,
                  gpu_reduction_out,
                  gpu_reduction_counts,
                  gpu_assignments,
                  block=(blocksize_step4, 1, 1),
                  grid=(gridsize_step4, nDim),
                  texrefs=[texrefData])
        else:
            step4(gpu_data,
                  gpu_cluster_changed,
                  gpu_reduction_out,
                  gpu_reduction_counts,
                  gpu_assignments,
                  block=(blocksize_step4, 1, 1),
                  grid=(gridsize_step4, nDim))

        step4part2(gpu_cluster_changed,
                   gpu_reduction_out,
                   gpu_reduction_counts,
                   gpu_clusters2,
                   gpu_clusters,
                   block=(blocksize_step4part2, 1, 1),
                   grid=(1, nDim))

        calc_movement(gpu_clusters,
                      gpu_clusters2,
                      gpu_cluster_movement,
                      gpu_cluster_changed,
                      block=(blocksize_calcm, 1, 1),
                      grid=(gridsize_calcm, 1))

        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step4_time += t2 - t1

        t1 = time.time()
        if useTextureForData:
            step56(gpu_assignments,
                   gpu_lower,
                   gpu_upper,
                   gpu_cluster_movement,
                   gpu_badUpper,
                   block=(blocksize_step56, 1, 1),
                   grid=(gridsize_step56, 1),
                   texrefs=[texrefData])
        else:
            step56(gpu_assignments,
                   gpu_lower,
                   gpu_upper,
                   gpu_cluster_movement,
                   gpu_badUpper,
                   block=(blocksize_step56, 1, 1),
                   grid=(gridsize_step56, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step56_time += t2 - t1

        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp

    if return_times:
        return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \
                gpu_clusters.get(), gpu_cluster_movement, \
                data_time, module_time, init_time, \
                ccdist_time/iterations, hdclosest_time/iterations, \
                step3_time/iterations, step4_time/iterations, step56_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
def main():
    parser = argparse.ArgumentParser(description="running inference ")
    parser.add_argument("-e",
                        "--engine",
                        type=str,
                        help="location of runtime engine")
    parser.add_argument("-n",
                        "--num_params",
                        default=300,
                        type=int,
                        help="number of transformation parameters")
    parser.add_argument("-o",
                        "--output",
                        default="results/output_0.gif",
                        help="The serialized engine file, ex inference.engine")
    parser.add_argument("-s",
                        "--source_image",
                        required=True,
                        help="The source image to animate")
    parser.add_argument("-f",
                        "--fps",
                        default=10,
                        help="fps parameter for the generated gif",
                        type=int)
    parser.add_argument(
        "--store_frames",
        action='store_true',
        help="store generated PNGs instead of generating a gif")
    parser.add_argument("-si",
                        "--size",
                        default=[256, 256],
                        action="append",
                        type=int,
                        help="spatial size of input image")
    args = parser.parse_args()

    frames = []
    source_image = extract_numpy_image_from_filelike(args.source_image)

    morpher_params, rotator_params = gen_params(args.num_params)

    identity = torch.Tensor([[1, 0, 0], [0, 1,
                                         0]]).unsqueeze(0).repeat(1, 1, 1)
    base_grid = F.affine_grid(identity, [1, 4, args.size[-2], args.size[-1]],
                              align_corners=True)
    base_grid = np.array(base_grid, dtype=np.float32)

    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    #Load Custom plugin libraries
    ctypes.CDLL("./gridSamplerPlugin/libgridsampler.so",
                mode=ctypes.RTLD_GLOBAL)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    with open(args.engine, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

        print("Loaded engine:{}".format(args.engine))

        context = engine.create_execution_context()
        context.debug_sync = True

        context.active_optimization_profile = 0
        print("Active Optimization Profile:{}".format(
            context.active_optimization_profile))

        input_binding_idxs, output_binding_idxs = get_binding_idxs(
            engine, context.active_optimization_profile)
        input_names = [
            engine.get_binding_name(binding_idx)
            for binding_idx in input_binding_idxs
        ]
        output_names = [
            engine.get_binding_name(binding_idx)
            for binding_idx in output_binding_idxs
        ]
        # Allocate device memory for inputs. This can be easily re-used if the
        # input shapes don't change
        host_inputs = [
            source_image, morpher_params[0], base_grid, rotator_params[0],
            rotator_params[0]
        ]
        host_inputs_buffers = [
            cuda.pagelocked_empty_like(i) for i in host_inputs
        ]
        [np.copyto(i, j) for i, j in zip(host_inputs_buffers, host_inputs)]
        device_inputs = [
            cuda.mem_alloc(h_input.nbytes) for h_input in host_inputs
        ]

        input_binding_idxs, output_binding_idxs = get_binding_idxs(
            engine, context.active_optimization_profile)
        input_names = [
            engine.get_binding_name(binding_idx)
            for binding_idx in input_binding_idxs
        ]
        output_names = [
            engine.get_binding_name(binding_idx)
            for binding_idx in output_binding_idxs
        ]

        for h_input, d_input in zip(host_inputs_buffers, device_inputs):
            cuda.memcpy_htod(d_input, h_input)
        host_outputs, device_outputs = setup_binding_shapes(
            engine, context, host_inputs, input_binding_idxs,
            output_binding_idxs)
        print("Input Metadata")
        print("\tNumber of Inputs: {}".format(len(input_binding_idxs)))
        print("\tInput Bindings for Profile {}: {}".format(
            context.active_optimization_profile, input_binding_idxs))
        print("\tInput names: {}".format(input_names))
        print("\tInput shapes: {}".format([inp.shape for inp in host_inputs]))

        print("Output Metadata")
        print("\tNumber of Outputs: {}".format(len(output_binding_idxs)))
        print("\tOutput names: {}".format(output_names))
        print("\tOutput shapes: {}".format([out.shape
                                            for out in host_outputs]))
        print("\tOutput Bindings for Profile {}: {}".format(
            context.active_optimization_profile, output_binding_idxs))
        stream = cuda.Stream()
        for i in tqdm(range(args.num_params)):
            if i > 0:
                host_outputs, device_outputs = setup_binding_shapes(
                    engine, context, host_inputs, input_binding_idxs,
                    output_binding_idxs)
                [
                    np.copyto(host_inputs_buffers[1], morpher_params[i]),
                    np.copyto(host_inputs_buffers[3], rotator_params[i]),
                    np.copyto(host_inputs_buffers[4], rotator_params[i])
                ]

                cuda.memcpy_htod_async(device_inputs[1],
                                       host_inputs_buffers[1], stream)
                cuda.memcpy_htod_async(device_inputs[3],
                                       host_inputs_buffers[3], stream)
                cuda.memcpy_htod_async(device_inputs[4],
                                       host_inputs_buffers[4], stream)

            # Bindings are a list of device pointers for inputs and outputs
            bindings = device_inputs + device_outputs
            bindings = [int(binding) for binding in bindings]

            # Inference
            context.execute_async_v2(bindings, stream.handle)

            # Copy outputs back to host to view results
            cuda.memcpy_dtoh_async(host_outputs[-1], device_outputs[-1],
                                   stream)
            stream.synchronize()
            # View outputs
            combined = np.reshape(host_outputs[-1],
                                  (4, args.size[-2], args.size[-1]))
            frames.append(combined)

        del context
        del engine
        print("\n Generating GIF ....")
        final_frames = [rgba_to_numpy_image(i) for i in frames]
        pil_images = [
            Image.fromarray(np.uint8(np.rint(f * 255.0)), mode='RGBA')
            for f in final_frames
        ]
        pil_images[0].save(args.output,
                           save_all=True,
                           append_images=pil_images[1:],
                           optimize=False,
                           duration=200,
                           loop=1,
                           transparency=255,
                           disposal=2)
        print("\n Done !")
        if args.store_frames:
            if not os.path.isdir("results/frames"):
                os.mkdir("results/frames")
            [
                pil_image.save("results/frames/result_{}.png".format(i))
                for i, pil_image in enumerate(pil_images)
            ]
Example #12
0
def kmeans_gpu(data, clusters, iterations, return_times = 0):
    # kmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    
    # kmeans using standard algorithm and cuda
    # input arguments are the data, intial cluster values, and number of iterations to repeat
    # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
    # nPts = number of data points
    # The shape of clusters is (nDim, nClusters) 
    #
    # The return values are the updated clusters and labels for the data
    
    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]


    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0
    usePageLockedMemory = 0
    if(nPts > 32768):
        useTextureForData = 0
    
    # block and grid sizes for the cluster_assign kernel
    threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16)
    blocksize_assign = min(256, threads_desired)
    gridsize_assign = 1 + (threads_desired - 1)/blocksize_assign
    
    """
    print "\nblocksize_assign =", blocksize_assign
    print "gridsize_assign  =", gridsize_assign
    """
    
    # block and grid sizes for the cluster_calc kernel
    blocksize_calc = 2
    while(blocksize_calc < min(512, nPts)):
        blocksize_calc *= 2
    maxblocks = 512
    seqcount_calc = 1 + (nPts-1)/(blocksize_calc * maxblocks)
    gridsize_calc = 1 + (nPts-1)/(seqcount_calc * blocksize_calc)
    
    """
    print "blocksize_calc =", blocksize_calc
    print "gridsize_calc  =", gridsize_calc
    print "seqcount_calc  =", seqcount_calc
    """
    
    blocksize_calc_part2 = 1
    while(blocksize_calc_part2 < gridsize_calc):
        blocksize_calc_part2 *= 2


    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()
    
    mod_cuda = kernels.get_cuda_module(nDim, nPts, nClusters,
                                        blocksize_calc, seqcount_calc, gridsize_calc, 
                                        blocksize_calc_part2, useTextureForData, BOUNDS)

    cuda_assign = mod_cuda.get_function("assign")
    cuda_calc = mod_cuda.get_function("calc")
    cuda_calc_part2 = mod_cuda.get_function("calc_part2")

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2-t1

    
    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)
    
    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_cuda.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:,:] = data;
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:,:] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)


    gpu_assignments = gpuarray.zeros((nPts,), np.int32)
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32);
    gpu_reduction_out = gpuarray.zeros((nDim, nClusters*gridsize_calc), 
                                                                np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters*gridsize_calc,), 
                                                                np.int32)
    
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2-t1


    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    assign_time = 0.
    calc_time = 0.


    for i in range(iterations):
    
        # assign data to clusters
        t1 = time.time()
        if useTextureForData:
            cuda_assign(gpu_clusters, gpu_assignments,
                        block = (blocksize_assign, 1, 1),
                        grid = (gridsize_assign, 1),
                        texrefs=[texrefData])
        else:
            cuda_assign(gpu_data, gpu_clusters, gpu_assignments,
                         block = (blocksize_assign, 1, 1),
                         grid = (gridsize_assign, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        assign_time += t2-t1

        # calculate new cluster centers
        t1 = time.time()
        if useTextureForData:
            cuda_calc(gpu_reduction_out, gpu_reduction_counts, gpu_assignments,
                block = (blocksize_calc, 1, 1),
                grid = (gridsize_calc, nDim),
                texrefs=[texrefData])
        else:
            cuda_calc(gpu_data, gpu_reduction_out, gpu_reduction_counts, 
                gpu_assignments,
                block = (blocksize_calc, 1, 1),
                grid = (gridsize_calc, nDim))
        
        cuda_calc_part2(gpu_reduction_out, gpu_reduction_counts, 
                gpu_clusters2, gpu_clusters,
                block = (blocksize_calc_part2, 1, 1),
                grid = (1, nDim))
        
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        calc_time += t2-t1
    
        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp
        
    if return_times:
        return gpu_assignments, gpu_clusters.get(), \
                data_time, module_time, assign_time/iterations, calc_time/iterations 
    else:
        return gpu_clusters.get(), gpu_assignments.get()
Example #13
0
def MedianFilter(input=None, kernel_size=3, bw=32, bh=32):

	#s = cuda.Event()
	#e = cuda.Event()

	input_list = input

	BLOCK_WIDTH = bw
	BLOCK_HEIGHT = bh

	if isinstance(kernel_size, (int, long)):
		kernel_size = [kernel_size]*2

	WS_x, WS_y = kernel_size
	padding_y = WS_x/2
	padding_x = WS_y/2

	input_list = np.asarray(input_list)

	if input_list.ndim == 3:
		_, N, M = input_list.shape
	elif input_list.ndim == 2:
		N, M = input_list.shape
		input_list = [input_list]

	expanded_N = N + (2 * padding_y)
	expanded_M = M + (2 * padding_x)

	gridx = int(np.ceil((expanded_N)/BLOCK_WIDTH))+1
	gridy = int(np.ceil((expanded_M)/BLOCK_HEIGHT))+1
	grid = (gridx,gridy, 1)
	block = (BLOCK_WIDTH, BLOCK_HEIGHT, 1)

	code = """
		#pragma comment(linker, "/HEAP:4000000")


		/* Some sample C code for the quickselect algorithm, 
		   taken from Numerical Recipes in C. */

		#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

		__device__ float quickselect(float *arr, int n, int k) {
		  unsigned long i,ir,j,l,mid;
		  float a,temp;

		  l=0;
		  ir=n-1;
		  for(;;) {
		    if (ir <= l+1) { 
		      if (ir == l+1 && arr[ir] < arr[l]) {
			SWAP(arr[l],arr[ir]);
		      }
		      return arr[k];
		    }
		    else {
		      mid=(l+ir) >> 1; 
		      SWAP(arr[mid],arr[l+1]);
		      if (arr[l] > arr[ir]) {
			SWAP(arr[l],arr[ir]);
		      }
		      if (arr[l+1] > arr[ir]) {
			SWAP(arr[l+1],arr[ir]);
		      }
		      if (arr[l] > arr[l+1]) {
			SWAP(arr[l],arr[l+1]);
		      }
		      i=l+1; 
		      j=ir;
		      a=arr[l+1]; 
		      for (;;) { 
			do i++; while (arr[i] < a); 
			do j--; while (arr[j] > a); 
			if (j < i) break; 
			SWAP(arr[i],arr[j]);
		      } 
		      arr[l+1]=arr[j]; 
		      arr[j]=a;
		      if (j >= k) ir=j-1; 
		      if (j <= k) l=i;
		    }
		  }
		}

		/* https://softwareengineering.stackexchange.com/questions/284767/kth-selection-routine-floyd-algorithm-489
		 * Implementation from Stack Exchange user: Andy Dansby 
		 */

		__device__ float FloydWirth_kth(float arr[], const int kTHvalue) 
		{
		#define F_SWAP(a,b) { float temp=(a);(a)=(b);(b)=temp; }
		#define SIGNUM(x) ((x) < 0 ? -1 : ((x) > 0 ? 1 : (x)))

		    int left = 0;       
		    int right = %(WS^2)s - 1;     
		    int left2 = 0;
		    int right2 = %(WS^2)s - 1;

		    while (left < right) 
		    {           
		        if( arr[right2] < arr[left2] ) F_SWAP(arr[left2],arr[right2]);
		        if( arr[right2] < arr[kTHvalue] ) F_SWAP(arr[kTHvalue],arr[right2]);
		        if( arr[kTHvalue] < arr[left2] ) F_SWAP(arr[left2],arr[kTHvalue]);

		        int rightleft = right - left;

		        if (rightleft < kTHvalue)
		        {
		            int n = right - left + 1;
		            int ii = kTHvalue - left + 1;
		            int s = (n + n) / 3;
		            int sd = (n * s * (n - s) / n) * SIGNUM(ii - n / 2);
		            int left2 = max(left, kTHvalue - ii * s / n + sd);
		            int right2 = min(right, kTHvalue + (n - ii) * s / n + sd);              
		        }

		        float x=arr[kTHvalue];

		        while ((right2 > kTHvalue) && (left2 < kTHvalue))
		        {
		            do 
		            {
		                left2++;
		            }while (arr[left2] < x);

		            do
		            {
		                right2--;
		            }while (arr[right2] > x);

		            F_SWAP(arr[left2],arr[right2]);
		        }
		        left2++;
		        right2--;

		        if (right2 < kTHvalue) 
		        {
		            while (arr[left2]<x)
		            {
		                left2++;
		            }
		            left = left2;
		            right2 = right;
		        }

		        if (kTHvalue < left2) 
		        {
		            while (x < arr[right2])
		            {
		                right2--;
		            }

		            right = right2;
		            left2 = left;
		        }

		        if( arr[left] < arr[right] ) F_SWAP(arr[right],arr[left]);
		    }

		#undef F_SWAP
		#undef SIGNUM
		    return arr[kTHvalue];
		}



		texture<float, 2> tex;

		__global__ void mf(float* in, float* out, int imgDimY, int imgDimX)
		{

			float window[%(WS^2)s];

			int x_thread_offset = %(BY)s * blockIdx.x + threadIdx.x;
			int y_thread_offset = %(BX)s * blockIdx.y + threadIdx.y;
			for (int y = %(WSx/2)s + y_thread_offset; y < imgDimX - %(WSx/2)s; y += %(y_stride)s)
			{
				for (int x = %(WSy/2)s + x_thread_offset; x < imgDimY - %(WSy/2)s; x += %(x_stride)s)
				{
					int i = 0;
					for (int fx = 0; fx < %(WSy)s; ++fx)
					{
						for (int fy = 0; fy < %(WSx)s; ++fy)
						{
							//window[i] = tex2D(tex, (float) (x + fx - %(WSy/2)s), (float) (y + fy - %(WSx/2)s));
							window[i] = in[(x + fx - %(WSy/2)s) + (y + fy - %(WSx/2)s)*imgDimY];
							i += 1;
						}
					}

					// Sort to find the median
					//for (int j = 0; j < %(WS^2)s/2 + 1; j++)
					//{
					//	for (int k = j + 1; k < %(WS^2)s; k++)
					//	{
					//		if (window[j] > window[k])
					//		{
					//			float tmp = window[j];
					//			window[j] = window[k];
					//			window[k] = tmp;
					//		}
					//	}
					//}
					//out[y*imgDimY + x] = window[%(WS^2)s/2];
					out[y*imgDimY + x] = FloydWirth_kth(window, %(WS^2)s/2);
					//out[y*imgDimY + x] = quickselect(window, %(WS^2)s, %(WS^2)s/2);
				}
			}
		}


		__global__ void mf_shared(float *in, float* out, int imgDimY, int imgDimX)
		{			

			const int TSx = %(BX)s + %(WSx)s - 1;
			const int TSy = %(BY)s + %(WSy)s - 1;
            __shared__ float tile[TSx][TSy];

            float window[%(WS^2)s];
            const int x_thread_offset = %(BX)s * blockIdx.x + threadIdx.x;
            const int y_thread_offset = %(BY)s * blockIdx.y + threadIdx.y;


			const int thread_index = blockDim.y * threadIdx.x + threadIdx.y;

			int imgX = blockIdx.x * blockDim.x + thread_index;
			int imgY;

            // Load into the tile for this block
			if (thread_index < TSx && imgX < imgDimX)
			{
				for (int i = 0; i < TSy && i < imgDimY - blockIdx.y * blockDim.y; i++)
				{
					imgY = blockIdx.y * blockDim.y + i;
					tile[thread_index][i] = in[imgX * imgDimY + imgY];
					//tile[thread_index][i] = tex2D(tex, (float) imgY, (float) imgX);
				}

			}

			__syncthreads();


			int x = %(WSx/2)s + x_thread_offset;
			int y = %(WSy/2)s + y_thread_offset;

			if (x >= imgDimX - %(WSx/2)s || y >= imgDimY - %(WSy/2)s)
			{
				return;
			}

			int i = 0;
			for (int fx = 0; fx < %(WSx)s; ++fx)
			{
				for (int fy = 0; fy < %(WSy)s; ++fy)
				{
					window[i++] = tile[threadIdx.x + fx][threadIdx.y + fy];
				}
			}


			// Sort to find the median
			//for (int j = 0; j <= %(WS^2)s/2; j++)
			//{
			//	for (int k = j + 1; k < %(WS^2)s; k++)
			//	{
			//		if (window[j] > window[k])
			//		{
			//			float tmp = window[j];
			//			window[j] = window[k];
			//			window[k] = tmp;
			//		}
			//	}
			//}
			//out[x*imgDimY + y] = window[%(WS^2)s/2];

			out[x*imgDimY + y] = FloydWirth_kth(window, %(WS^2)s/2);

			//forgetfulSelection(window, %(WSx)s);
			//out[x*imgDimY + y] = window[%(WS^2)s/2];

			//out[x*imgDimY + y] = myForgetfulSelection(window);
		}

		"""

	code = code % {
			'BY' : BLOCK_WIDTH,
			'BX' : BLOCK_HEIGHT,
			'WS^2' : WS_x * WS_y,
			'x_stride' : BLOCK_WIDTH * gridx,
			'y_stride' : BLOCK_HEIGHT * gridy,
			'WSx' : WS_x,
			'WSy' : WS_y,
			'WSx/2' : WS_x/2,
			'WSy/2' : WS_y/2,
		}
	mod = SourceModule(code)
	#mf_shared = mod.get_function('mf_shared')
	mf = mod.get_function('mf')
	texref = mod.get_texref("tex")


	# NSTREAMS := NUMBER OF INPUT IMAGES
	nStreams = len(input_list)

	# Initialize the streams
	stream = [cuda.Stream()]*nStreams

	# Pad all the images with zeros
	input_list = [np.array( np.pad(img, ( (padding_y, padding_y), (padding_x, padding_x) ), 'constant', constant_values=0) , dtype=np.float32) for img in input_list]

	# Use pinned memory for all the images
	in_pin_list = [cuda.register_host_memory(img) for img in input_list]
	imgBytes = in_pin_list[0].nbytes

	# Initialize the outputs to empty images (assuming all images are of the same shape)
	outdata_list = [cuda.pagelocked_empty_like(img) for img in input_list]

	# Malloc on the GPU for each input and output image
	#in_gpu_list = [cuda.mem_alloc(pinnedImg.nbytes) for pinnedImg in in_pin_list]
	in_gpu_list = [None]*nStreams
	#out_gpu_list = [cuda.mem_alloc(pinnedImg.nbytes) for pinnedImg in in_pin_list]
	out_gpu_list = [None]*nStreams
	mf.prepare("PPii")
	for i in xrange(nStreams + 2):
		ii = i - 1
		iii = i - 2

		if 0 <= iii < nStreams:
			st = stream[iii]
			cuda.memcpy_dtoh_async(outdata_list[iii], out_gpu_list[iii], stream=st)

		if 0 <= ii < nStreams:
			st = stream[ii]
			out_gpu_list[ii] = cuda.mem_alloc(imgBytes)
			# s.record(stream=stream[0])
			# mf_shared.prepare("Pii")
			# mf_shared.prepared_async_call(grid, block, st, out_gpu_list[ii], expanded_M, expanded_N)

			#mf.prepare("PPii")
			mf.prepared_async_call(grid, block, st, in_gpu_list[ii], out_gpu_list[ii], expanded_M, expanded_N)
			# e.record(stream=stream[0])
			# e.synchronize()
			# print s.time_till(e), "ms for the kernel"

		if 0 <= i < nStreams:
			st = stream[i]
			#cuda.matrix_to_texref(in_pin_list[i], texref, order="C")
			in_gpu_list[i] = cuda.mem_alloc(imgBytes)
			cuda.memcpy_htod_async(in_gpu_list[i], in_pin_list[i], stream=st)

	if (padding_y > 0):
		outdata_list = [out[padding_y:-padding_y] for out in outdata_list]
	if (padding_x > 0):
		outdata_list = [out[:, padding_x:-padding_x] for out in outdata_list]

	return outdata_list
Example #14
0
def kmeans_gpu(data, clusters, iterations, return_times=0):
    # kmeans_gpu(data, clusters, iterations) returns (clusters, labels)

    # kmeans using standard algorithm and cuda
    # input arguments are the data, intial cluster values, and number of iterations to repeat
    # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
    # nPts = number of data points
    # The shape of clusters is (nDim, nClusters)
    #
    # The return values are the updated clusters and labels for the data

    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]

    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0
    usePageLockedMemory = 0
    if (nPts > 32768):
        useTextureForData = 0

    # block and grid sizes for the cluster_assign kernel
    threads_desired = 16 * (1 + (max(nPts, nDim * nClusters) - 1) / 16)
    blocksize_assign = min(256, threads_desired)
    gridsize_assign = 1 + (threads_desired - 1) / blocksize_assign
    """
    print "\nblocksize_assign =", blocksize_assign
    print "gridsize_assign  =", gridsize_assign
    """

    # block and grid sizes for the cluster_calc kernel
    blocksize_calc = 2
    while (blocksize_calc < min(512, nPts)):
        blocksize_calc *= 2
    maxblocks = 512
    seqcount_calc = 1 + (nPts - 1) / (blocksize_calc * maxblocks)
    gridsize_calc = 1 + (nPts - 1) / (seqcount_calc * blocksize_calc)
    """
    print "blocksize_calc =", blocksize_calc
    print "gridsize_calc  =", gridsize_calc
    print "seqcount_calc  =", seqcount_calc
    """

    blocksize_calc_part2 = 1
    while (blocksize_calc_part2 < gridsize_calc):
        blocksize_calc_part2 *= 2

    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()

    mod_cuda = kernels.get_cuda_module(nDim, nPts, nClusters, blocksize_calc,
                                       seqcount_calc, gridsize_calc,
                                       blocksize_calc_part2, useTextureForData,
                                       BOUNDS)

    cuda_assign = mod_cuda.get_function("assign")
    cuda_calc = mod_cuda.get_function("calc")
    cuda_calc_part2 = mod_cuda.get_function("calc_part2")

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2 - t1

    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)

    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_cuda.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:, :] = data
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:, :] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)

    gpu_assignments = gpuarray.zeros((nPts, ), np.int32)
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32)
    gpu_reduction_out = gpuarray.zeros((nDim, nClusters * gridsize_calc),
                                       np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters * gridsize_calc, ),
                                          np.int32)

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2 - t1

    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    assign_time = 0.
    calc_time = 0.

    for i in range(iterations):

        # assign data to clusters
        t1 = time.time()
        if useTextureForData:
            cuda_assign(gpu_clusters,
                        gpu_assignments,
                        block=(blocksize_assign, 1, 1),
                        grid=(gridsize_assign, 1),
                        texrefs=[texrefData])
        else:
            cuda_assign(gpu_data,
                        gpu_clusters,
                        gpu_assignments,
                        block=(blocksize_assign, 1, 1),
                        grid=(gridsize_assign, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        assign_time += t2 - t1

        # calculate new cluster centers
        t1 = time.time()
        if useTextureForData:
            cuda_calc(gpu_reduction_out,
                      gpu_reduction_counts,
                      gpu_assignments,
                      block=(blocksize_calc, 1, 1),
                      grid=(gridsize_calc, nDim),
                      texrefs=[texrefData])
        else:
            cuda_calc(gpu_data,
                      gpu_reduction_out,
                      gpu_reduction_counts,
                      gpu_assignments,
                      block=(blocksize_calc, 1, 1),
                      grid=(gridsize_calc, nDim))

        cuda_calc_part2(gpu_reduction_out,
                        gpu_reduction_counts,
                        gpu_clusters2,
                        gpu_clusters,
                        block=(blocksize_calc_part2, 1, 1),
                        grid=(1, nDim))

        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        calc_time += t2 - t1

        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp

    if return_times:
        return gpu_assignments, gpu_clusters.get(), \
                data_time, module_time, assign_time/iterations, calc_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()